From 20c897eadf13ee4305af62dbdfe7819307ddf0bc Mon Sep 17 00:00:00 2001 From: Charan Teja Kalla Date: Mon, 3 Jul 2023 19:36:41 +0530 Subject: [PATCH 001/489] mm: madvise: fix uneven accounting of psi A folio turns into a Workingset during: 1) shrink_active_list() placing the folio from active to inactive list. 2) When a workingset transition is happening during the folio refault. And when Workingset is set on a folio, PSI for memory can be accounted during a) That folio is being reclaimed and b) Refault of that folio, for usual reclaims. This accounting of PSI for memory is not consistent for reclaim + refault operation between usual reclaim and madvise(COLD/PAGEOUT) which deactivate or proactively reclaim a folio: a) A folio started at inactive and moved to active as part of accesses. Workingset is absent on the folio thus refault of it when reclaimed through MADV_PAGEOUT operation doesn't account for PSI. b) When the same folio transitions from inactive->active and then to inactive through shrink_active_list(). Workingset is set on the folio thus refault of it when reclaimed through MADV_PAGEOUT operation accounts for PSI. c) When the same folio is part of active list directly as a result of folio refault and this was a workingset folio prior to eviction. Workingset is set on the folio thus the refault of it when reclaimed through MADV_PAGEOUT/MADV_COLD operation accounts for PSI. d) MADV_COLD transfers the folio from active list to inactive list. Such folios may not have the Workingset thus refault operation on such folio doesn't account for PSI. As said above, refault operation caused because of MADV_PAGEOUT on a folio accounts for memory PSI in b) and c) but not in a). Refault caused by the reclaim of a folio on which MADV_COLD is performed accounts for memory PSI in c) but not in d). These behaviours are inconsistent w.r.t usual reclaim + refault operation. 
Make this PSI accounting always consistent by turning a folio into a workingset one whenever it is leaving the active list. Also, accounting of PSI on a folio whenever it leaves the active list as part of the MADV_COLD/PAGEOUT operation helps the users determine whether they are operating on proper folios[1]. [1] https://lore.kernel.org/all/20230605180013.GD221380@cmpxchg.org/ Link: https://lkml.kernel.org/r/1688393201-11135-1-git-send-email-quic_charante@quicinc.com Signed-off-by: Charan Teja Kalla Suggested-by: Suren Baghdasaryan Reported-by: Sai Manobhiram Manapragada Reported-by: Pavan Kondeti Acked-by: Johannes Weiner Cc: Minchan Kim Cc: Pavankumar Kondeti Signed-off-by: Andrew Morton --- mm/madvise.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/madvise.c b/mm/madvise.c index 886f06066622f9..05f97038eac3da 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -413,6 +413,8 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, folio_clear_referenced(folio); folio_test_clear_young(folio); + if (folio_test_active(folio)) + folio_set_workingset(folio); if (pageout) { if (folio_isolate_lru(folio)) { if (folio_test_unevictable(folio)) @@ -510,6 +512,8 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, */ folio_clear_referenced(folio); folio_test_clear_young(folio); + if (folio_test_active(folio)) + folio_set_workingset(folio); if (pageout) { if (folio_isolate_lru(folio)) { if (folio_test_unevictable(folio)) From fad9c80e6371ee04a3fa5728efe20b88d8e4cccd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 23 May 2023 22:51:01 +0200 Subject: [PATCH 002/489] maple_tree: fix a few documentation issues The documentation of mt_next() claims that it starts the search at the provided index. That's incorrect as it starts the search after the provided index. The documentation of mt_find() is slightly confusing. "Handles locking" is not really helpful as it does not explain how the "locking" works. 
Also the documentation of index talks about a range, while in reality the index is updated on a successful search to the index of the found entry plus one. Fix similar issues for mt_find_after() and mt_prev(). Reword the confusing "Note: Will not return the zero entry." comment on mt_for_each() and document @__index correctly. Link: https://lkml.kernel.org/r/87ttw2n556.ffs@tglx Signed-off-by: Thomas Gleixner Reviewed-by: Liam R. Howlett Cc: Matthew Wilcox Cc: Shanker Donthineni Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 5 +++-- lib/maple_tree.c | 26 +++++++++++++++++++++----- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 295548cca8b369..6e5bd2c9875d64 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -662,10 +662,11 @@ void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max); * mt_for_each - Iterate over each entry starting at index until max. * @__tree: The Maple Tree * @__entry: The current entry - * @__index: The index to update to track the location in the tree + * @__index: The index to start the search from. Subsequently used as iterator. * @__max: The maximum limit for @index * - * Note: Will not return the zero entry. + * This iterator skips all entries, which resolve to a NULL pointer, + * e.g. entries which has been reserved with XA_ZERO_ENTRY. */ #define mt_for_each(__tree, __entry, __index, __max) \ for (__entry = mt_find(__tree, &(__index), __max); \ diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 4dd73cf936a635..f512bb9766aad1 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5750,7 +5750,11 @@ EXPORT_SYMBOL_GPL(mas_next_range); * @index: The start index * @max: The maximum index to check * - * Return: The entry at @index or higher, or %NULL if nothing is found. + * Takes RCU read lock internally to protect the search, which does not + * protect the returned pointer after dropping RCU read lock. 
+ * See also: Documentation/core-api/maple_tree.rst + * + * Return: The entry higher than @index or %NULL if nothing is found. */ void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max) { @@ -5856,7 +5860,11 @@ EXPORT_SYMBOL_GPL(mas_prev_range); * @index: The start index * @min: The minimum index to check * - * Return: The entry at @index or lower, or %NULL if nothing is found. + * Takes RCU read lock internally to protect the search, which does not + * protect the returned pointer after dropping RCU read lock. + * See also: Documentation/core-api/maple_tree.rst + * + * Return: The entry before @index or %NULL if nothing is found. */ void *mt_prev(struct maple_tree *mt, unsigned long index, unsigned long min) { @@ -6468,9 +6476,15 @@ EXPORT_SYMBOL(mtree_destroy); * mt_find() - Search from the start up until an entry is found. * @mt: The maple tree * @index: Pointer which contains the start location of the search - * @max: The maximum value to check + * @max: The maximum value of the search range + * + * Takes RCU read lock internally to protect the search, which does not + * protect the returned pointer after dropping RCU read lock. + * See also: Documentation/core-api/maple_tree.rst * - * Handles locking. @index will be incremented to one beyond the range. + * In case that an entry is found @index is updated to point to the next + * possible entry independent whether the found entry is occupying a + * single index or a range if indices. * * Return: The entry at or after the @index or %NULL */ @@ -6528,7 +6542,9 @@ EXPORT_SYMBOL(mt_find); * @index: Pointer which contains the start location of the search * @max: The maximum value to check * - * Handles locking, detects wrapping on index == 0 + * Same as mt_find() except that it checks @index for 0 before + * searching. If @index == 0, the search is aborted. This covers a wrap + * around of @index to 0 in an iterator loop. 
* * Return: The entry at or after the @index or %NULL */ From 3a29280afb25263c76212a8c140c29f280049ffb Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sun, 25 Jun 2023 11:33:40 +0800 Subject: [PATCH 003/489] mm/mm_init.c: update obsolete comment in get_pfn_range_for_nid() Since commit 633c0666b5a5 ("Memoryless nodes: drop one memoryless node boot warning"), the warning for a node with no available memory is removed. Update the corresponding comment. Link: https://lkml.kernel.org/r/20230625033340.1054103-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- mm/mm_init.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/mm_init.c b/mm/mm_init.c index a1963c3322af43..d356ba59ef2a3b 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1681,8 +1681,7 @@ static inline void alloc_node_mem_map(struct pglist_data *pgdat) { } * * It returns the start and end page frame of a node based on information * provided by memblock_set_node(). If called for a node - * with no available memory, a warning is printed and the start and end - * PFNs will be 0. + * with no available memory, the start and end PFNs will be 0. */ void __init get_pfn_range_for_nid(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn) From 87b11f862254396a93636f0998377ac3f6648f5f Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 27 Jun 2023 10:43:49 -0700 Subject: [PATCH 004/489] mm: increase usage of folio_next_index() helper Simplify code pattern of 'folio->index + folio_nr_pages(folio)' by using the existing helper folio_next_index(). 
Link: https://lkml.kernel.org/r/20230627174349.491803-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Cc: Andreas Dilger Cc: Christoph Hellwig Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/ext4/inode.c | 4 ++-- mm/filemap.c | 8 ++++---- mm/memory.c | 2 +- mm/shmem.c | 2 +- mm/truncate.c | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 43775a6ca5054a..3d253e250871c0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1569,7 +1569,7 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, if (folio->index < mpd->first_page) continue; - if (folio->index + folio_nr_pages(folio) - 1 > end) + if (folio_next_index(folio) - 1 > end) continue; BUG_ON(!folio_test_locked(folio)); BUG_ON(folio_test_writeback(folio)); @@ -2455,7 +2455,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) if (mpd->map.m_len == 0) mpd->first_page = folio->index; - mpd->next_page = folio->index + folio_nr_pages(folio); + mpd->next_page = folio_next_index(folio); /* * Writeout when we cannot modify metadata is simple. * Just submit the page. 
For data=journal mode we diff --git a/mm/filemap.c b/mm/filemap.c index 9e44a49bbd74d7..c5e2c70ea04687 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2075,7 +2075,7 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start, if (!xa_is_value(folio)) { if (folio->index < *start) goto put; - if (folio->index + folio_nr_pages(folio) - 1 > end) + if (folio_next_index(folio) - 1 > end) goto put; if (!folio_trylock(folio)) goto put; @@ -2174,7 +2174,7 @@ bool folio_more_pages(struct folio *folio, pgoff_t index, pgoff_t max) return false; if (index >= max) return false; - return index < folio->index + folio_nr_pages(folio) - 1; + return index < folio_next_index(folio) - 1; } /** @@ -2242,7 +2242,7 @@ unsigned filemap_get_folios_contig(struct address_space *mapping, if (folio_test_hugetlb(folio)) *start = folio->index + 1; else - *start = folio->index + folio_nr_pages(folio); + *start = folio_next_index(folio); } out: rcu_read_unlock(); @@ -2359,7 +2359,7 @@ static void filemap_get_read_batch(struct address_space *mapping, break; if (folio_test_readahead(folio)) break; - xas_advance(&xas, folio->index + folio_nr_pages(folio) - 1); + xas_advance(&xas, folio_next_index(folio) - 1); continue; put_folio: folio_put(folio); diff --git a/mm/memory.c b/mm/memory.c index 603b2f41994831..33f0f28c7ebc63 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3495,7 +3495,7 @@ void unmap_mapping_folio(struct folio *folio) VM_BUG_ON(!folio_test_locked(folio)); first_index = folio->index; - last_index = folio->index + folio_nr_pages(folio) - 1; + last_index = folio_next_index(folio) - 1; details.even_cows = false; details.single_folio = folio; diff --git a/mm/shmem.c b/mm/shmem.c index f5af4b943e4286..8dfd72bdc86ab8 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -970,7 +970,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, same_folio = lend < folio_pos(folio) + folio_size(folio); folio_mark_dirty(folio); if (!truncate_inode_partial_folio(folio, 
lstart, lend)) { - start = folio->index + folio_nr_pages(folio); + start = folio_next_index(folio); if (same_folio) end = folio->index; } diff --git a/mm/truncate.c b/mm/truncate.c index 95d1291d269b57..2f28cc0e12ef1d 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -378,7 +378,7 @@ void truncate_inode_pages_range(struct address_space *mapping, if (!IS_ERR(folio)) { same_folio = lend < folio_pos(folio) + folio_size(folio); if (!truncate_inode_partial_folio(folio, lstart, lend)) { - start = folio->index + folio_nr_pages(folio); + start = folio_next_index(folio); if (same_folio) end = folio->index; } From 67490031e83a008c5ce8f562e7fa3b6b83adc861 Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Tue, 27 Jun 2023 20:08:32 +0800 Subject: [PATCH 005/489] swap: cleanup duplicated WARN_ON in add_to_avail_list Patch series "fix WARN_ON in add_to_avail_list". Empty check for plist_node is checked in add_to_avail_list and plist_add. Drop the duplicate one in add_to_avail_list. Link: https://lkml.kernel.org/r/20230627120833.2230766-1-mawupeng1@huawei.com Link: https://lkml.kernel.org/r/20230627120833.2230766-2-mawupeng1@huawei.com Signed-off-by: Ma Wupeng Cc: Hugh Dickins Signed-off-by: Andrew Morton --- mm/swapfile.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 8e6dde68b38904..2a469364207172 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -714,10 +714,8 @@ static void add_to_avail_list(struct swap_info_struct *p) int nid; spin_lock(&swap_avail_lock); - for_each_node(nid) { - WARN_ON(!plist_node_empty(&p->avail_lists[nid])); + for_each_node(nid) plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]); - } spin_unlock(&swap_avail_lock); } From c70699e555537b611f4cb426c26f8ab4a264a8a0 Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Tue, 27 Jun 2023 20:08:33 +0800 Subject: [PATCH 006/489] swap: stop add to avail list if swap is full Our test finds a WARN_ON in add_to_avail_list. 
During add_to_avail_list, avail_lists is already in swap_avail_heads, which leads to this WARN_ON. Here is the simplified calltrace: ------------[ cut here ]------------ Call trace: add_to_avail_list+0xb8/0xc0 swap_range_free+0x110/0x138 swapcache_free_entries+0x100/0x1c0 free_swap_slot+0xbc/0xe0 put_swap_folio+0x1f0/0x2ec delete_from_swap_cache+0x6c/0xd0 folio_free_swap+0xa4/0xe4 __try_to_reclaim_swap+0x9c/0x190 free_swap_and_cache+0x84/0x88 unmap_page_range+0x31c/0x934 unmap_single_vma.isra.0+0x48/0x84 unmap_vmas+0x98/0x10c exit_mmap+0xa4/0x210 mmput+0x88/0x158 do_exit+0x284/0x970 do_group_exit+0x34/0x90 post_copy_siginfo_from_user32+0x0/0x1cc do_notify_resume+0x15c/0x470 el0_svc+0x74/0x84 el0t_64_sync_handler+0xb8/0xbc el0t_64_sync+0x190/0x194 During swapoff, try_to_unuse fails to alloc memory due to memory limit and this leads to the failure of swapoff and causes re-insertion of swap space back into swap_list. During _enable_swap_info, this swap device is added to avail list even if this swap device is full. At the same time, one entry in this full swap device is released and we try to add this device into avail list and find it is already in the avail list. This causes this WARN_ON. To fix this, don't add to the avail list if swap is full. [akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20230627120833.2230766-3-mawupeng1@huawei.com Signed-off-by: Ma Wupeng Cc: Hugh Dickins Signed-off-by: Andrew Morton --- mm/swapfile.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 2a469364207172..cad0209ac67f62 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2328,7 +2328,10 @@ static void _enable_swap_info(struct swap_info_struct *p) * swap_info_struct. 
*/ plist_add(&p->list, &swap_active_head); - add_to_avail_list(p); + + /* add to available list iff swap device is not full */ + if (p->highest_bit) + add_to_avail_list(p); } static void enable_swap_info(struct swap_info_struct *p, int prio, From 15b4919a1e0703b77dd7cc0a4d9732f7f6181236 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Sat, 1 Jul 2023 11:28:52 +0800 Subject: [PATCH 007/489] mm: use a folio in fault_dirty_shared_page() We can replace four implicit calls to compound_head() with one by using folio. Link: https://lkml.kernel.org/r/20230701032853.258697-2-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Sidhartha Kumar Reviewed-by: Matthew Wilcox (Oracle) Cc: Kefeng Wang Cc: Nanyong Sun Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- mm/memory.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 33f0f28c7ebc63..e9f9944c7370cc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2965,20 +2965,20 @@ static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct address_space *mapping; - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); bool dirtied; bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite; - dirtied = set_page_dirty(page); - VM_BUG_ON_PAGE(PageAnon(page), page); + dirtied = folio_mark_dirty(folio); + VM_BUG_ON_FOLIO(folio_test_anon(folio), folio); /* - * Take a local copy of the address_space - page.mapping may be zeroed - * by truncate after unlock_page(). The address_space itself remains - * pinned by vma->vm_file's reference. We rely on unlock_page()'s + * Take a local copy of the address_space - folio.mapping may be zeroed + * by truncate after folio_unlock(). The address_space itself remains + * pinned by vma->vm_file's reference. We rely on folio_unlock()'s * release semantics to prevent the compiler from undoing this copying. 
*/ - mapping = page_rmapping(page); - unlock_page(page); + mapping = folio_raw_mapping(folio); + folio_unlock(folio); if (!page_mkwrite) file_update_time(vma->vm_file); From fc1878ec70ede56ee48f2d65525d4f7c6888b496 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Sat, 1 Jul 2023 11:28:53 +0800 Subject: [PATCH 008/489] mm: remove page_rmapping() After converting the last user to folio_raw_mapping(), we can safely remove the function. Link: https://lkml.kernel.org/r/20230701032853.258697-3-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Sidhartha Kumar Reviewed-by: Matthew Wilcox (Oracle) Cc: Kefeng Wang Cc: Nanyong Sun Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - mm/util.c | 6 ------ 2 files changed, 7 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 406ab9ea818fea..6d150990e35c10 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2170,7 +2170,6 @@ static inline void *folio_address(const struct folio *folio) return page_address(&folio->page); } -extern void *page_rmapping(struct page *page); extern pgoff_t __page_file_index(struct page *page); /* diff --git a/mm/util.c b/mm/util.c index dd12b9531ac4ca..5e9305189c3fdc 100644 --- a/mm/util.c +++ b/mm/util.c @@ -734,12 +734,6 @@ void *vcalloc(size_t n, size_t size) } EXPORT_SYMBOL(vcalloc); -/* Neutral page->mapping pointer to address_space or anon_vma or other */ -void *page_rmapping(struct page *page) -{ - return folio_raw_mapping(page_folio(page)); -} - struct anon_vma *folio_anon_vma(struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; From 626e98cb0366e66bdc2088918aecabee1fc6c4b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Fri, 30 Jun 2023 11:08:53 +0200 Subject: [PATCH 009/489] mm: make MEMFD_CREATE into a selectable config option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The memfd_create() syscall, enabled by CONFIG_MEMFD_CREATE, is useful on its own even 
when not required by CONFIG_TMPFS or CONFIG_HUGETLBFS. Split it into its own proper bool option that can be enabled by users. Move that option into mm/ where the code itself also lies. Also add "select" statements to CONFIG_TMPFS and CONFIG_HUGETLBFS so they automatically enable CONFIG_MEMFD_CREATE as before. Link: https://lkml.kernel.org/r/20230630-config-memfd-v1-1-9acc3ae38b5a@weissschuh.net Signed-off-by: Thomas Weißschuh Tested-by: Zhangjin Wu Cc: Al Viro Cc: Christian Brauner Cc: Darrick J. Wong Signed-off-by: Andrew Morton --- fs/Kconfig | 5 ++--- mm/Kconfig | 3 +++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/Kconfig b/fs/Kconfig index 18d034ec79539f..19975b104bc36d 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -169,6 +169,7 @@ source "fs/sysfs/Kconfig" config TMPFS bool "Tmpfs virtual memory file system support (former shm fs)" depends on SHMEM + select MEMFD_CREATE help Tmpfs is a file system which keeps all files in virtual memory. @@ -240,6 +241,7 @@ config HUGETLBFS bool "HugeTLB file system support" depends on X86 || IA64 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN depends on (SYSFS || SYSCTL) + select MEMFD_CREATE help hugetlbfs is a filesystem backing for HugeTLB pages, based on ramfs. For architectures that support it, say Y here and read @@ -264,9 +266,6 @@ config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON enable HVO by default. It can be disabled via hugetlb_free_vmemmap=off (boot command line) or hugetlb_optimize_vmemmap (sysctl). 
-config MEMFD_CREATE - def_bool TMPFS || HUGETLBFS - config ARCH_HAS_GIGANTIC_PAGE bool diff --git a/mm/Kconfig b/mm/Kconfig index 09130434e30d38..22acffd9009dfd 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1144,6 +1144,9 @@ config KMAP_LOCAL_NON_LINEAR_PTE_ARRAY config IO_MAPPING bool +config MEMFD_CREATE + bool "Enable memfd_create() system call" if EXPERT + config SECRETMEM default y bool "Enable memfd_secret() system call" if EXPERT From 527ed4f7d902d362471a93e1a4afb604c18ceb48 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 30 Jun 2023 14:22:52 +0800 Subject: [PATCH 010/489] mm: remove arguments of show_mem() All callers of show_mem() pass 0 and NULL, so we can remove the two arguments by directly calling __show_mem(0, NULL, MAX_NR_ZONES - 1) in show_mem(). Link: https://lkml.kernel.org/r/20230630062253.189440-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Christophe Leroy Cc: Greg Kroah-Hartman Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- arch/powerpc/xmon/xmon.c | 2 +- drivers/tty/sysrq.c | 2 +- drivers/tty/vt/keyboard.c | 2 +- include/linux/mm.h | 4 ++-- init/initramfs.c | 2 +- kernel/panic.c | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index fae747cc57d2dd..ee17270d35d0e4 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -1084,7 +1084,7 @@ cmds(struct pt_regs *excp) memzcan(); break; case 'i': - show_mem(0, NULL); + show_mem(); break; default: termch = cmd; diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index b6e70c5cfa1742..e1df63a88aac8f 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -342,7 +342,7 @@ static const struct sysrq_key_op sysrq_ftrace_dump_op = { static void sysrq_handle_showmem(int key) { - show_mem(0, NULL); + show_mem(); } static const struct sysrq_key_op sysrq_showmem_op = { .handler = sysrq_handle_showmem, diff --git a/drivers/tty/vt/keyboard.c 
b/drivers/tty/vt/keyboard.c index be8313cdbac360..358f216c6cd6ee 100644 --- a/drivers/tty/vt/keyboard.c +++ b/drivers/tty/vt/keyboard.c @@ -606,7 +606,7 @@ static void fn_scroll_back(struct vc_data *vc) static void fn_show_mem(struct vc_data *vc) { - show_mem(0, NULL); + show_mem(); } static void fn_show_state(struct vc_data *vc) diff --git a/include/linux/mm.h b/include/linux/mm.h index 6d150990e35c10..8c3e3eec9008be 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3068,9 +3068,9 @@ extern void mem_init(void); extern void __init mmap_init(void); extern void __show_mem(unsigned int flags, nodemask_t *nodemask, int max_zone_idx); -static inline void show_mem(unsigned int flags, nodemask_t *nodemask) +static inline void show_mem(void) { - __show_mem(flags, nodemask, MAX_NR_ZONES - 1); + __show_mem(0, NULL, MAX_NR_ZONES - 1); } extern long si_mem_available(void); extern void si_meminfo(struct sysinfo * val); diff --git a/init/initramfs.c b/init/initramfs.c index e7a01c2ccd1b0c..8d0fd946cdd2b3 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -61,7 +61,7 @@ static void __init error(char *x) } #define panic_show_mem(fmt, ...) \ - ({ show_mem(0, NULL); panic(fmt, ##__VA_ARGS__); }) + ({ show_mem(); panic(fmt, ##__VA_ARGS__); }) /* link hash */ diff --git a/kernel/panic.c b/kernel/panic.c index 10effe40a3fa65..07239d4ad81e87 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -216,7 +216,7 @@ static void panic_print_sys_info(bool console_flush) show_state(); if (panic_print & PANIC_PRINT_MEM_INFO) - show_mem(0, NULL); + show_mem(); if (panic_print & PANIC_PRINT_TIMER_INFO) sysrq_timer_list_show(); From 1279aa0656bbebbeecbd3bec0dd50faf35e5db38 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 30 Jun 2023 14:22:53 +0800 Subject: [PATCH 011/489] mm: make show_free_areas() static All callers of show_free_areas() pass 0 and NULL, so we can directly use show_mem() instead of show_free_areas(0, NULL), which could make show_free_areas() a static function. 
Link: https://lkml.kernel.org/r/20230630062253.189440-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Christophe Leroy Cc: Greg Kroah-Hartman Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- arch/sparc/kernel/setup_32.c | 2 +- include/linux/mm.h | 12 ------------ mm/internal.h | 6 ++++++ mm/nommu.c | 8 ++++---- mm/show_mem.c | 4 ++-- 5 files changed, 13 insertions(+), 19 deletions(-) diff --git a/arch/sparc/kernel/setup_32.c b/arch/sparc/kernel/setup_32.c index 1adf5c1c16b856..34ef7febf0d562 100644 --- a/arch/sparc/kernel/setup_32.c +++ b/arch/sparc/kernel/setup_32.c @@ -83,7 +83,7 @@ static void prom_sync_me(void) "nop\n\t" : : "r" (&trapbase)); prom_printf("PROM SYNC COMMAND...\n"); - show_free_areas(0, NULL); + show_mem(); if (!is_idle_task(current)) { local_irq_enable(); ksys_sync(); diff --git a/include/linux/mm.h b/include/linux/mm.h index 8c3e3eec9008be..d1ad22980ebe7a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2236,18 +2236,6 @@ extern void pagefault_out_of_memory(void); #define offset_in_thp(page, p) ((unsigned long)(p) & (thp_size(page) - 1)) #define offset_in_folio(folio, p) ((unsigned long)(p) & (folio_size(folio) - 1)) -/* - * Flags passed to show_mem() and show_free_areas() to suppress output in - * various contexts. - */ -#define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ - -extern void __show_free_areas(unsigned int flags, nodemask_t *nodemask, int max_zone_idx); -static void __maybe_unused show_free_areas(unsigned int flags, nodemask_t *nodemask) -{ - __show_free_areas(flags, nodemask, MAX_NR_ZONES - 1); -} - /* * Parameter block passed down to zap_pte_range in exceptional cases. 
*/ diff --git a/mm/internal.h b/mm/internal.h index a7d9e980429a55..721ed07d7fd6f1 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -61,6 +61,12 @@ void page_writeback_init(void); #define COMPOUND_MAPPED 0x800000 #define FOLIO_PAGES_MAPPED (COMPOUND_MAPPED - 1) +/* + * Flags passed to __show_mem() and show_free_areas() to suppress output in + * various contexts. + */ +#define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ + /* * How many individual pages have an elevated _mapcount. Excludes * the folio's entire_mapcount. diff --git a/mm/nommu.c b/mm/nommu.c index c072a660ec2cfb..9826f6101a05c0 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1003,7 +1003,7 @@ static int do_mmap_private(struct vm_area_struct *vma, enomem: pr_err("Allocation of length %lu from process %d (%s) failed\n", len, current->pid, current->comm); - show_free_areas(0, NULL); + show_mem(); return -ENOMEM; } @@ -1236,20 +1236,20 @@ unsigned long do_mmap(struct file *file, kmem_cache_free(vm_region_jar, region); pr_warn("Allocation of vma for %lu byte allocation from process %d failed\n", len, current->pid); - show_free_areas(0, NULL); + show_mem(); return -ENOMEM; error_getting_region: pr_warn("Allocation of vm region for %lu byte allocation from process %d failed\n", len, current->pid); - show_free_areas(0, NULL); + show_mem(); return -ENOMEM; error_vma_iter_prealloc: kmem_cache_free(vm_region_jar, region); vm_area_free(vma); pr_warn("Allocation of vma tree for process %d failed\n", current->pid); - show_free_areas(0, NULL); + show_mem(); return -ENOMEM; } diff --git a/mm/show_mem.c b/mm/show_mem.c index 01f8e990581734..09c7d036d49ecb 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -186,7 +186,7 @@ static bool node_has_managed_zones(pg_data_t *pgdat, int max_zone_idx) * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's * cpuset. 
*/ -void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) +static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) { unsigned long free_pcp = 0; int cpu, nid; @@ -406,7 +406,7 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) struct zone *zone; printk("Mem-Info:\n"); - __show_free_areas(filter, nodemask, max_zone_idx); + show_free_areas(filter, nodemask, max_zone_idx); for_each_populated_zone(zone) { From b53e24c4f6bcc5f38b79c1c344e643bd54b69964 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Mon, 22 May 2023 17:43:09 -0700 Subject: [PATCH 012/489] mm: call arch_swap_restore() from unuse_pte() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We would like to move away from requiring architectures to restore metadata from swap in the set_pte_at() implementation, as this is not only error-prone but adds complexity to the arch-specific code. This requires us to call arch_swap_restore() before calling swap_free() whenever pages are restored from swap. We are currently doing so everywhere except in unuse_pte(); do so there as well. 
Link: https://lkml.kernel.org/r/20230523004312.1807357-3-pcc@google.com Link: https://linux-review.googlesource.com/id/I68276653e612d64cde271ce1b5a99ae05d6bbc4f Signed-off-by: Peter Collingbourne Suggested-by: David Hildenbrand Acked-by: David Hildenbrand Acked-by: "Huang, Ying" Reviewed-by: Steven Price Acked-by: Catalin Marinas Cc: Alexandru Elisei Cc: Chinwen Chang Cc: Evgenii Stepanov Cc: Greg Kroah-Hartman Cc: kasan-dev Cc: "Kuan-Ying Lee (李冠穎)" Cc: Qun-Wei Lin Cc: Suren Baghdasaryan Cc: Vincenzo Frascino Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/swapfile.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/swapfile.c b/mm/swapfile.c index cad0209ac67f62..d996c335fc3c2a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1778,6 +1778,13 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, goto setpte; } + /* + * Some architectures may have to restore extra metadata to the page + * when reading from swap. This metadata may be indexed by swap entry + * so this must be called before swap_free(). + */ + arch_swap_restore(entry, page_folio(page)); + /* See do_swap_page() */ BUG_ON(!PageAnon(page) && PageMappedToDisk(page)); BUG_ON(PageAnon(page) && PageAnonExclusive(page)); From 332c151c710ad404e6e67eba7ae899ad8333333f Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Mon, 22 May 2023 17:43:10 -0700 Subject: [PATCH 013/489] arm64: mte: simplify swap tag restoration logic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As a result of the patches "mm: Call arch_swap_restore() from do_swap_page()" and "mm: Call arch_swap_restore() from unuse_pte()", there are no circumstances in which a swapped-in page is installed in a page table without first having arch_swap_restore() called on it. Therefore, we no longer need the logic in set_pte_at() that restores the tags, so remove it. 
Link: https://lkml.kernel.org/r/20230523004312.1807357-4-pcc@google.com Link: https://linux-review.googlesource.com/id/I8ad54476f3b2d0144ccd8ce0c1d7a2963e5ff6f3 Signed-off-by: Peter Collingbourne Reviewed-by: Steven Price Reviewed-by: Catalin Marinas Cc: Alexandru Elisei Cc: Chinwen Chang Cc: David Hildenbrand Cc: Evgenii Stepanov Cc: Greg Kroah-Hartman Cc: kasan-dev@googlegroups.com Cc: kasan-dev Cc: "Kuan-Ying Lee (李冠穎)" Cc: Qun-Wei Lin Cc: Suren Baghdasaryan Cc: Vincenzo Frascino Cc: Will Deacon Cc: "Huang, Ying" Signed-off-by: Andrew Morton --- arch/arm64/include/asm/mte.h | 4 ++-- arch/arm64/include/asm/pgtable.h | 14 ++---------- arch/arm64/kernel/mte.c | 37 ++++++-------------------------- 3 files changed, 11 insertions(+), 44 deletions(-) diff --git a/arch/arm64/include/asm/mte.h b/arch/arm64/include/asm/mte.h index c028afb1cd0bd3..4cedbaa16f4196 100644 --- a/arch/arm64/include/asm/mte.h +++ b/arch/arm64/include/asm/mte.h @@ -90,7 +90,7 @@ static inline bool try_page_mte_tagging(struct page *page) } void mte_zero_clear_page_tags(void *addr); -void mte_sync_tags(pte_t old_pte, pte_t pte); +void mte_sync_tags(pte_t pte); void mte_copy_page_tags(void *kto, const void *kfrom); void mte_thread_init_user(void); void mte_thread_switch(struct task_struct *next); @@ -122,7 +122,7 @@ static inline bool try_page_mte_tagging(struct page *page) static inline void mte_zero_clear_page_tags(void *addr) { } -static inline void mte_sync_tags(pte_t old_pte, pte_t pte) +static inline void mte_sync_tags(pte_t pte) { } static inline void mte_copy_page_tags(void *kto, const void *kfrom) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 0bd18de9fd97b0..e8a252e62b1265 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -337,18 +337,8 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, * don't expose tags (instruction fetches don't check tags). 
*/ if (system_supports_mte() && pte_access_permitted(pte, false) && - !pte_special(pte)) { - pte_t old_pte = READ_ONCE(*ptep); - /* - * We only need to synchronise if the new PTE has tags enabled - * or if swapping in (in which case another mapping may have - * set tags in the past even if this PTE isn't tagged). - * (!pte_none() && !pte_present()) is an open coded version of - * is_swap_pte() - */ - if (pte_tagged(pte) || (!pte_none(old_pte) && !pte_present(old_pte))) - mte_sync_tags(old_pte, pte); - } + !pte_special(pte) && pte_tagged(pte)) + mte_sync_tags(pte); __check_safe_pte_update(mm, ptep, pte); diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 4c5ef9b2006500..4edecaac8f919a 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -35,41 +35,18 @@ DEFINE_STATIC_KEY_FALSE(mte_async_or_asymm_mode); EXPORT_SYMBOL_GPL(mte_async_or_asymm_mode); #endif -static void mte_sync_page_tags(struct page *page, pte_t old_pte, - bool check_swap, bool pte_is_tagged) -{ - if (check_swap && is_swap_pte(old_pte)) { - swp_entry_t entry = pte_to_swp_entry(old_pte); - - if (!non_swap_entry(entry)) - mte_restore_tags(entry, page); - } - - if (!pte_is_tagged) - return; - - if (try_page_mte_tagging(page)) { - mte_clear_page_tags(page_address(page)); - set_page_mte_tagged(page); - } -} - -void mte_sync_tags(pte_t old_pte, pte_t pte) +void mte_sync_tags(pte_t pte) { struct page *page = pte_page(pte); long i, nr_pages = compound_nr(page); - bool check_swap = nr_pages == 1; - bool pte_is_tagged = pte_tagged(pte); - - /* Early out if there's nothing to do */ - if (!check_swap && !pte_is_tagged) - return; /* if PG_mte_tagged is set, tags have already been initialised */ - for (i = 0; i < nr_pages; i++, page++) - if (!page_mte_tagged(page)) - mte_sync_page_tags(page, old_pte, check_swap, - pte_is_tagged); + for (i = 0; i < nr_pages; i++, page++) { + if (try_page_mte_tagging(page)) { + mte_clear_page_tags(page_address(page)); + set_page_mte_tagged(page); + } 
+ } /* ensure the tags are visible before the PTE is set */ smp_wmb(); From dd767aaa2fc8f1a000df0504f6231afcafe8a8e9 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 28 Jun 2023 17:53:03 -0400 Subject: [PATCH 014/489] mm/hugetlb: handle FOLL_DUMP well in follow_page_mask() Patch series "mm/gup: Unify hugetlb, speed up thp", v4. Hugetlb has a special path for slow gup that follow_page_mask() is actually skipped completely along with faultin_page(). It's not only confusing, but also duplicating a lot of logics that generic gup already has, making hugetlb slightly special. This patchset tries to dedup the logic, by first touching up the slow gup code to be able to handle hugetlb pages correctly with the current follow page and faultin routines (where we're mostly there.. due to 10 years ago we did try to optimize thp, but half way done; more below), then at the last patch drop the special path, then the hugetlb gup will always go the generic routine too via faultin_page(). Note that hugetlb is still special for gup, mostly due to the pgtable walking (hugetlb_walk()) that we rely on which is currently per-arch. But this is still one small step forward, and the diffstat might be a proof too that this might be worthwhile. Then for the "speed up thp" side: as a side effect, when I'm looking at the chunk of code, I found that thp support is actually partially done. It doesn't mean that thp won't work for gup, but as long as **pages pointer passed over, the optimization will be skipped too. Patch 6 should address that, so for thp we now get full speed gup. For a quick number, "chrt -f 1 ./gup_test -m 512 -t -L -n 1024 -r 10" gives me 13992.50us -> 378.50us. Gup_test is an extreme case, but just to show how it affects thp gups. 
This patch (of 8): Firstly, the no_page_table() is meaningless for hugetlb which is a no-op there, because a hugetlb page always satisfies: - vma_is_anonymous() == false - vma->vm_ops->fault != NULL So we can already safely remove it in hugetlb_follow_page_mask(), along with the page* variable. Meanwhile, what we do in follow_hugetlb_page() actually makes sense for a dump: we try to fault in the page only if the page cache is already allocated. Let's do the same here for follow_page_mask() on hugetlb. So far it should have zero effect on real dumps, because that still goes into follow_hugetlb_page(). But this may start to slightly influence follow_page() users who mimic a "dump page" scenario, but hopefully in a good way. This also paves the way for unifying the hugetlb gup-slow. Link: https://lkml.kernel.org/r/20230628215310.73782-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20230628215310.73782-2-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Reviewed-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: James Houghton Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kirill A . Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/gup.c | 9 ++------- mm/hugetlb.c | 9 +++++++++ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 76d222ccc3ff4f..9c62cfa7e486e0 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -811,7 +811,6 @@ static struct page *follow_page_mask(struct vm_area_struct *vma, struct follow_page_context *ctx) { pgd_t *pgd; - struct page *page; struct mm_struct *mm = vma->vm_mm; ctx->page_mask = 0; @@ -824,12 +823,8 @@ static struct page *follow_page_mask(struct vm_area_struct *vma, * hugetlb_follow_page_mask is only for follow_page() handling here. * Ordinary GUP uses follow_hugetlb_page for hugetlb processing.
*/ - if (is_vm_hugetlb_page(vma)) { - page = hugetlb_follow_page_mask(vma, address, flags); - if (!page) - page = no_page_table(vma, flags); - return page; - } + if (is_vm_hugetlb_page(vma)) + return hugetlb_follow_page_mask(vma, address, flags); pgd = pgd_offset(mm, address); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 64a3239b6407e9..4fb396dd65bdcf 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6498,6 +6498,15 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, spin_unlock(ptl); out_unlock: hugetlb_vma_unlock_read(vma); + + /* + * Fixup retval for dump requests: if pagecache doesn't exist, + * don't try to allocate a new page but just skip it. + */ + if (!page && (flags & FOLL_DUMP) && + !hugetlbfs_pagecache_present(h, vma, address)) + page = ERR_PTR(-EFAULT); + return page; } From 458568c92953dee3716234711f1a2830a35261f3 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 28 Jun 2023 17:53:04 -0400 Subject: [PATCH 015/489] mm/hugetlb: prepare hugetlb_follow_page_mask() for FOLL_PIN follow_page() doesn't use FOLL_PIN, meanwhile hugetlb seems to not be the target of FOLL_WRITE either. However add the checks. Namely, either the need to CoW due to missing write bit, or proper unsharing on !AnonExclusive pages over R/O pins to reject the follow page. That brings this function closer to follow_hugetlb_page(). So we don't care before, and also for now. But we'll care if we switch over slow-gup to use hugetlb_follow_page_mask(). We'll also care when to return -EMLINK properly, as that's the gup internal api to mean "we should unshare". Not really needed for follow page path, though. When at it, switching the try_grab_page() to use WARN_ON_ONCE(), to be clear that it just should never fail. When error happens, instead of setting page==NULL, capture the errno instead. 
Link: https://lkml.kernel.org/r/20230628215310.73782-3-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Reviewed-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: James Houghton Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kirill A . Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mike Rapoport (IBM) Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/hugetlb.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4fb396dd65bdcf..cc87a51ce71a22 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6462,13 +6462,7 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, struct page *page = NULL; spinlock_t *ptl; pte_t *pte, entry; - - /* - * FOLL_PIN is not supported for follow_page(). Ordinary GUP goes via - * follow_hugetlb_page(). - */ - if (WARN_ON_ONCE(flags & FOLL_PIN)) - return NULL; + int ret; hugetlb_vma_lock_read(vma); pte = hugetlb_walk(vma, haddr, huge_page_size(h)); @@ -6478,8 +6472,23 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, ptl = huge_pte_lock(h, mm, pte); entry = huge_ptep_get(pte); if (pte_present(entry)) { - page = pte_page(entry) + - ((address & ~huge_page_mask(h)) >> PAGE_SHIFT); + page = pte_page(entry); + + if (!huge_pte_write(entry)) { + if (flags & FOLL_WRITE) { + page = NULL; + goto out; + } + + if (gup_must_unshare(vma, flags, page)) { + /* Tell the caller to do unsharing */ + page = ERR_PTR(-EMLINK); + goto out; + } + } + + page += ((address & ~huge_page_mask(h)) >> PAGE_SHIFT); + /* * Note that page may be a sub-page, and with vmemmap * optimizations the page struct may be read only. @@ -6489,8 +6498,10 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, * try_grab_page() should always be able to get the page here, * because we hold the ptl lock and have verified pte_present(). 
*/ - if (try_grab_page(page, flags)) { - page = NULL; + ret = try_grab_page(page, flags); + + if (WARN_ON_ONCE(ret)) { + page = ERR_PTR(ret); goto out; } } From 5502ea44f5ade35d32a397353956bc026b870400 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 28 Jun 2023 17:53:05 -0400 Subject: [PATCH 016/489] mm/hugetlb: add page_mask for hugetlb_follow_page_mask() follow_page() doesn't need it, but we'll start to need it when unifying gup for hugetlb. Link: https://lkml.kernel.org/r/20230628215310.73782-4-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: James Houghton Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kirill A . Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 8 +++++--- mm/gup.c | 3 ++- mm/hugetlb.c | 5 ++++- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index ca3c8e10f24a08..9f282f370d964f 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -131,7 +131,8 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *, struct vm_area_struct *); struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, - unsigned long address, unsigned int flags); + unsigned long address, unsigned int flags, + unsigned int *page_mask); long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, unsigned long *, unsigned long *, long, unsigned int, int *); @@ -297,8 +298,9 @@ static inline void adjust_range_if_pmd_sharing_possible( { } -static inline struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, - unsigned long address, unsigned int flags) +static inline struct page *hugetlb_follow_page_mask( + struct vm_area_struct *vma, unsigned long address, unsigned int flags, 
+ unsigned int *page_mask) { BUILD_BUG(); /* should never be compiled in if !CONFIG_HUGETLB_PAGE*/ } diff --git a/mm/gup.c b/mm/gup.c index 9c62cfa7e486e0..818d98b34decc7 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -824,7 +824,8 @@ static struct page *follow_page_mask(struct vm_area_struct *vma, * Ordinary GUP uses follow_hugetlb_page for hugetlb processing. */ if (is_vm_hugetlb_page(vma)) - return hugetlb_follow_page_mask(vma, address, flags); + return hugetlb_follow_page_mask(vma, address, flags, + &ctx->page_mask); pgd = pgd_offset(mm, address); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cc87a51ce71a22..ab52214b5a75e9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6454,7 +6454,8 @@ static inline bool __follow_hugetlb_must_fault(struct vm_area_struct *vma, } struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, - unsigned long address, unsigned int flags) + unsigned long address, unsigned int flags, + unsigned int *page_mask) { struct hstate *h = hstate_vma(vma); struct mm_struct *mm = vma->vm_mm; @@ -6504,6 +6505,8 @@ struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, page = ERR_PTR(ret); goto out; } + + *page_mask = (1U << huge_page_order(h)) - 1; } out: spin_unlock(ptl); From ffe1e7861211aafe12977a3ed2f11bb6fe1e77ea Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 28 Jun 2023 17:53:06 -0400 Subject: [PATCH 017/489] mm/gup: cleanup next_page handling The only path that doesn't use generic "**pages" handling is the gate vma. Make it use the same path, meanwhile tune the next_page label upper to cover "**pages" handling. This prepares for THP handling for "**pages". Link: https://lkml.kernel.org/r/20230628215310.73782-5-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: James Houghton Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kirill A . 
Shutemov Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/gup.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 818d98b34decc7..d70f8f0613f4cc 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1207,7 +1207,7 @@ static long __get_user_pages(struct mm_struct *mm, if (!vma && in_gate_area(mm, start)) { ret = get_gate_page(mm, start & PAGE_MASK, gup_flags, &vma, - pages ? &pages[i] : NULL); + pages ? &page : NULL); if (ret) goto out; ctx.page_mask = 0; @@ -1277,19 +1277,18 @@ static long __get_user_pages(struct mm_struct *mm, ret = PTR_ERR(page); goto out; } - - goto next_page; } else if (IS_ERR(page)) { ret = PTR_ERR(page); goto out; } +next_page: if (pages) { pages[i] = page; flush_anon_page(vma, page, start); flush_dcache_page(page); ctx.page_mask = 0; } -next_page: + page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask); if (page_increm > nr_pages) page_increm = nr_pages; From 57edfcfd3419b4799353d8cbd6ce49da075cfdbd Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 28 Jun 2023 17:53:07 -0400 Subject: [PATCH 018/489] mm/gup: accelerate thp gup even for "pages != NULL" The acceleration of THP was done with ctx.page_mask, however it'll be ignored if **pages is non-NULL. The old optimization was introduced in 2013 in 240aadeedc4a ("mm: accelerate mm_populate() treatment of THP pages"). It didn't explain why we can't optimize the **pages non-NULL case. It's possible that at that time the major goal was for mm_populate() which should be enough back then. Optimize thp for all cases, by properly looping over each subpage, doing cache flushes, and boost refcounts / pincounts where needed in one go. 
This can be verified using gup_test below: # chrt -f 1 ./gup_test -m 512 -t -L -n 1024 -r 10 Before: 13992.50 ( +-8.75%) After: 378.50 (+-69.62%) Link: https://lkml.kernel.org/r/20230628215310.73782-6-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Lorenzo Stoakes Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: Hugh Dickins Cc: James Houghton Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kirill A . Shutemov Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/gup.c | 51 ++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 7 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index d70f8f0613f4cc..59e182634ba8cc 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1282,16 +1282,53 @@ static long __get_user_pages(struct mm_struct *mm, goto out; } next_page: - if (pages) { - pages[i] = page; - flush_anon_page(vma, page, start); - flush_dcache_page(page); - ctx.page_mask = 0; - } - page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask); if (page_increm > nr_pages) page_increm = nr_pages; + + if (pages) { + struct page *subpage; + unsigned int j; + + /* + * This must be a large folio (and doesn't need to + * be the whole folio; it can be part of it), do + * the refcount work for all the subpages too. + * + * NOTE: here the page may not be the head page + * e.g. when start addr is not thp-size aligned. + * try_grab_folio() should have taken care of tail + * pages. + */ + if (page_increm > 1) { + struct folio *folio; + + /* + * Since we already hold refcount on the + * large folio, this should never fail. + */ + folio = try_grab_folio(page, page_increm - 1, + foll_flags); + if (WARN_ON_ONCE(!folio)) { + /* + * Release the 1st page ref if the + * folio is problematic, fail hard. 
+ */ + gup_put_folio(page_folio(page), 1, + foll_flags); + ret = -EFAULT; + goto out; + } + } + + for (j = 0; j < page_increm; j++) { + subpage = nth_page(page, j); + pages[i + j] = subpage; + flush_anon_page(vma, subpage, start + j * PAGE_SIZE); + flush_dcache_page(subpage); + } + } + i += page_increm; start += page_increm * PAGE_SIZE; nr_pages -= page_increm; From 4849807114b83e1897381ed3f851632f376a0b7e Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 28 Jun 2023 17:53:08 -0400 Subject: [PATCH 019/489] mm/gup: retire follow_hugetlb_page() Now __get_user_pages() should be well prepared to handle thp completely, as long as hugetlb gup requests even without the hugetlb's special path. Time to retire follow_hugetlb_page(). Tweak misc comments to reflect reality of follow_hugetlb_page()'s removal. Link: https://lkml.kernel.org/r/20230628215310.73782-7-peterx@redhat.com Signed-off-by: Peter Xu Acked-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: James Houghton Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kirill A . Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 2 +- include/linux/hugetlb.h | 12 --- mm/gup.c | 19 ---- mm/hugetlb.c | 224 ---------------------------------------- 4 files changed, 1 insertion(+), 256 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 7cecd49e078b3d..ae711f1d7a8308 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -427,7 +427,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) * * We also don't do userfault handling during * coredumping. 
hugetlbfs has the special - * follow_hugetlb_page() to skip missing pages in the + * hugetlb_follow_page_mask() to skip missing pages in the * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with * the no_page_table() helper in follow_page_mask(), but the * shmem_vm_ops->fault method is invoked even during diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 9f282f370d964f..9bc3c2d71b71b7 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -133,9 +133,6 @@ int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, unsigned long address, unsigned int flags, unsigned int *page_mask); -long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, - struct page **, unsigned long *, unsigned long *, - long, unsigned int, int *); void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long, struct page *, zap_flags_t); @@ -305,15 +302,6 @@ static inline struct page *hugetlb_follow_page_mask( BUILD_BUG(); /* should never be compiled in if !CONFIG_HUGETLB_PAGE*/ } -static inline long follow_hugetlb_page(struct mm_struct *mm, - struct vm_area_struct *vma, struct page **pages, - unsigned long *position, unsigned long *nr_pages, - long i, unsigned int flags, int *nonblocking) -{ - BUG(); - return 0; -} - static inline int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *dst_vma, diff --git a/mm/gup.c b/mm/gup.c index 59e182634ba8cc..2493ffa10f4bf9 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -819,9 +819,6 @@ static struct page *follow_page_mask(struct vm_area_struct *vma, * Call hugetlb_follow_page_mask for hugetlb vmas as it will use * special hugetlb page table walking code. This eliminates the * need to check for hugetlb entries in the general walking code. - * - * hugetlb_follow_page_mask is only for follow_page() handling here. - * Ordinary GUP uses follow_hugetlb_page for hugetlb processing. 
*/ if (is_vm_hugetlb_page(vma)) return hugetlb_follow_page_mask(vma, address, flags, @@ -1221,22 +1218,6 @@ static long __get_user_pages(struct mm_struct *mm, ret = check_vma_flags(vma, gup_flags); if (ret) goto out; - - if (is_vm_hugetlb_page(vma)) { - i = follow_hugetlb_page(mm, vma, pages, - &start, &nr_pages, i, - gup_flags, locked); - if (!*locked) { - /* - * We've got a VM_FAULT_RETRY - * and we've lost mmap_lock. - * We must stop here. - */ - BUG_ON(gup_flags & FOLL_NOWAIT); - goto out; - } - continue; - } } retry: /* diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ab52214b5a75e9..e3839eee465790 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5721,7 +5721,6 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, /* * Return whether there is a pagecache page to back given address within VMA. - * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page. */ static bool hugetlbfs_pagecache_present(struct hstate *h, struct vm_area_struct *vma, unsigned long address) @@ -6422,37 +6421,6 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, } #endif /* CONFIG_USERFAULTFD */ -static void record_subpages(struct page *page, struct vm_area_struct *vma, - int refs, struct page **pages) -{ - int nr; - - for (nr = 0; nr < refs; nr++) { - if (likely(pages)) - pages[nr] = nth_page(page, nr); - } -} - -static inline bool __follow_hugetlb_must_fault(struct vm_area_struct *vma, - unsigned int flags, pte_t *pte, - bool *unshare) -{ - pte_t pteval = huge_ptep_get(pte); - - *unshare = false; - if (is_swap_pte(pteval)) - return true; - if (huge_pte_write(pteval)) - return false; - if (flags & FOLL_WRITE) - return true; - if (gup_must_unshare(vma, flags, pte_page(pteval))) { - *unshare = true; - return true; - } - return false; -} - struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma, unsigned long address, unsigned int flags, unsigned int *page_mask) @@ -6524,198 +6492,6 @@ struct page *hugetlb_follow_page_mask(struct 
vm_area_struct *vma, return page; } -long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, - struct page **pages, unsigned long *position, - unsigned long *nr_pages, long i, unsigned int flags, - int *locked) -{ - unsigned long pfn_offset; - unsigned long vaddr = *position; - unsigned long remainder = *nr_pages; - struct hstate *h = hstate_vma(vma); - int err = -EFAULT, refs; - - while (vaddr < vma->vm_end && remainder) { - pte_t *pte; - spinlock_t *ptl = NULL; - bool unshare = false; - int absent; - struct page *page; - - /* - * If we have a pending SIGKILL, don't keep faulting pages and - * potentially allocating memory. - */ - if (fatal_signal_pending(current)) { - remainder = 0; - break; - } - - hugetlb_vma_lock_read(vma); - /* - * Some archs (sparc64, sh*) have multiple pte_ts to - * each hugepage. We have to make sure we get the - * first, for the page indexing below to work. - * - * Note that page table lock is not held when pte is null. - */ - pte = hugetlb_walk(vma, vaddr & huge_page_mask(h), - huge_page_size(h)); - if (pte) - ptl = huge_pte_lock(h, mm, pte); - absent = !pte || huge_pte_none(huge_ptep_get(pte)); - - /* - * When coredumping, it suits get_dump_page if we just return - * an error where there's an empty slot with no huge pagecache - * to back it. This way, we avoid allocating a hugepage, and - * the sparse dumpfile avoids allocating disk blocks, but its - * huge holes still show up with zeroes where they need to be. - */ - if (absent && (flags & FOLL_DUMP) && - !hugetlbfs_pagecache_present(h, vma, vaddr)) { - if (pte) - spin_unlock(ptl); - hugetlb_vma_unlock_read(vma); - remainder = 0; - break; - } - - /* - * We need call hugetlb_fault for both hugepages under migration - * (in which case hugetlb_fault waits for the migration,) and - * hwpoisoned hugepages (in which case we need to prevent the - * caller from accessing to them.) 
In order to do this, we use - * here is_swap_pte instead of is_hugetlb_entry_migration and - * is_hugetlb_entry_hwpoisoned. This is because it simply covers - * both cases, and because we can't follow correct pages - * directly from any kind of swap entries. - */ - if (absent || - __follow_hugetlb_must_fault(vma, flags, pte, &unshare)) { - vm_fault_t ret; - unsigned int fault_flags = 0; - - if (pte) - spin_unlock(ptl); - hugetlb_vma_unlock_read(vma); - - if (flags & FOLL_WRITE) - fault_flags |= FAULT_FLAG_WRITE; - else if (unshare) - fault_flags |= FAULT_FLAG_UNSHARE; - if (locked) { - fault_flags |= FAULT_FLAG_ALLOW_RETRY | - FAULT_FLAG_KILLABLE; - if (flags & FOLL_INTERRUPTIBLE) - fault_flags |= FAULT_FLAG_INTERRUPTIBLE; - } - if (flags & FOLL_NOWAIT) - fault_flags |= FAULT_FLAG_ALLOW_RETRY | - FAULT_FLAG_RETRY_NOWAIT; - if (flags & FOLL_TRIED) { - /* - * Note: FAULT_FLAG_ALLOW_RETRY and - * FAULT_FLAG_TRIED can co-exist - */ - fault_flags |= FAULT_FLAG_TRIED; - } - ret = hugetlb_fault(mm, vma, vaddr, fault_flags); - if (ret & VM_FAULT_ERROR) { - err = vm_fault_to_errno(ret, flags); - remainder = 0; - break; - } - if (ret & VM_FAULT_RETRY) { - if (locked && - !(fault_flags & FAULT_FLAG_RETRY_NOWAIT)) - *locked = 0; - *nr_pages = 0; - /* - * VM_FAULT_RETRY must not return an - * error, it will return zero - * instead. - * - * No need to update "position" as the - * caller will not check it after - * *nr_pages is set to 0. - */ - return i; - } - continue; - } - - pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT; - page = pte_page(huge_ptep_get(pte)); - - VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) && - !PageAnonExclusive(page), page); - - /* - * If subpage information not requested, update counters - * and skip the same_page loop below. 
- */ - if (!pages && !pfn_offset && - (vaddr + huge_page_size(h) < vma->vm_end) && - (remainder >= pages_per_huge_page(h))) { - vaddr += huge_page_size(h); - remainder -= pages_per_huge_page(h); - i += pages_per_huge_page(h); - spin_unlock(ptl); - hugetlb_vma_unlock_read(vma); - continue; - } - - /* vaddr may not be aligned to PAGE_SIZE */ - refs = min3(pages_per_huge_page(h) - pfn_offset, remainder, - (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT); - - if (pages) - record_subpages(nth_page(page, pfn_offset), - vma, refs, - likely(pages) ? pages + i : NULL); - - if (pages) { - /* - * try_grab_folio() should always succeed here, - * because: a) we hold the ptl lock, and b) we've just - * checked that the huge page is present in the page - * tables. If the huge page is present, then the tail - * pages must also be present. The ptl prevents the - * head page and tail pages from being rearranged in - * any way. As this is hugetlb, the pages will never - * be p2pdma or not longterm pinable. So this page - * must be available at this point, unless the page - * refcount overflowed: - */ - if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs, - flags))) { - spin_unlock(ptl); - hugetlb_vma_unlock_read(vma); - remainder = 0; - err = -ENOMEM; - break; - } - } - - vaddr += (refs << PAGE_SHIFT); - remainder -= refs; - i += refs; - - spin_unlock(ptl); - hugetlb_vma_unlock_read(vma); - } - *nr_pages = remainder; - /* - * setting position is actually required only if remainder is - * not zero but it's faster not to add a "if (remainder)" - * branch. - */ - *position = vaddr; - - return i ? 
i : err; -} - long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot, unsigned long cp_flags) From 2bc48136224592fe4f8f50629d0469d6d0274207 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 28 Jun 2023 17:53:09 -0400 Subject: [PATCH 020/489] selftests/mm: add -a to run_vmtests.sh Allows to specify optional tests in run_vmtests.sh, where we can run time consuming test matrix only when user specified "-a". Link: https://lkml.kernel.org/r/20230628215310.73782-8-peterx@redhat.com Signed-off-by: Peter Xu Acked-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: James Houghton Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kirill A . Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/run_vmtests.sh | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 3f26f6e15b2a49..824e651f62f4d8 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -12,11 +12,14 @@ exitcode=0 usage() { cat <"] +usage: ${BASH_SOURCE[0]:-$0} [ options ] + + -a: run all tests, including extra ones -t: specify specific categories to tests to run -h: display this message -The default behavior is to run all tests. +The default behavior is to run required tests only. If -a is specified, +will run all tests. 
Alternatively, specific groups tests can be run by passing a string to the -t argument containing one or more of the following categories @@ -60,9 +63,11 @@ EOF exit 0 } +RUN_ALL=false -while getopts "ht:" OPT; do +while getopts "aht:" OPT; do case ${OPT} in + "a") RUN_ALL=true ;; "h") usage ;; "t") VM_SELFTEST_ITEMS=${OPTARG} ;; esac From de4ec376df7bd2db4aa887b350a681686ea54064 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Wed, 28 Jun 2023 17:53:10 -0400 Subject: [PATCH 021/489] selftests/mm: add gup test matrix in run_vmtests.sh Add a matrix for testing gup based on the current gup_test. Only run the matrix when -a is specified because it's a bit slow. It covers: - Different types of huge pages: thp, hugetlb, or no huge page - Permissions: Write / Read-only - Fast-gup, with/without - Types of the GUP: pin / gup / longterm pins - Shared / Private memories - GUP size: 1 / 512 / random page sizes Link: https://lkml.kernel.org/r/20230628215310.73782-9-peterx@redhat.com Signed-off-by: Peter Xu Acked-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: James Houghton Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kirill A . 
Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/run_vmtests.sh | 37 ++++++++++++++++++++--- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 824e651f62f4d8..9666c0c171ab86 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -90,6 +90,30 @@ test_selected() { fi } +run_gup_matrix() { + # -t: thp=on, -T: thp=off, -H: hugetlb=on + local hugetlb_mb=$(( needmem_KB / 1024 )) + + for huge in -t -T "-H -m $hugetlb_mb"; do + # -u: gup-fast, -U: gup-basic, -a: pin-fast, -b: pin-basic, -L: pin-longterm + for test_cmd in -u -U -a -b -L; do + # -w: write=1, -W: write=0 + for write in -w -W; do + # -S: shared + for share in -S " "; do + # -n: How many pages to fetch together? 512 is special + # because it's default thp size (or 2M on x86), 123 to + # just test partial gup when hit a huge in whatever form + for num in "-n 1" "-n 512" "-n 123"; do + CATEGORY="gup_test" run_test ./gup_test \ + $huge $test_cmd $write $share $num + done + done + done + done + done +} + # get huge pagesize and freepages from /proc/meminfo while read -r name size unit; do if [ "$name" = "HugePages_Free:" ]; then @@ -194,13 +218,16 @@ fi CATEGORY="mmap" run_test ./map_fixed_noreplace -# get_user_pages_fast() benchmark -CATEGORY="gup_test" run_test ./gup_test -u -# pin_user_pages_fast() benchmark -CATEGORY="gup_test" run_test ./gup_test -a +if $RUN_ALL; then + run_gup_matrix +else + # get_user_pages_fast() benchmark + CATEGORY="gup_test" run_test ./gup_test -u + # pin_user_pages_fast() benchmark + CATEGORY="gup_test" run_test ./gup_test -a +fi # Dump pages 0, 19, and 4096, using pin_user_pages: CATEGORY="gup_test" run_test ./gup_test -ct -F 0x1 0 19 0x1000 - CATEGORY="gup_test" run_test ./gup_longterm 
CATEGORY="userfaultfd" run_test ./uffd-unit-tests From f04d16ee3afc049cdfa99500d95dee8b0eb77cfa Mon Sep 17 00:00:00 2001 From: Haibo Li Date: Wed, 28 Jun 2023 19:02:20 +0800 Subject: [PATCH 022/489] mm/filemap.c: fix update prev_pos after one read request done ra->prev_pos tracks the last visited byte in the previous read request. It is used in ondemand_readahead to check whether a read is sequential, and thus affects the readahead window. After commit 06c0444290ce ("mm/filemap.c: generic_file_buffered_read() now uses find_get_pages_contig"), the update logic of prev_pos changed. It now updates prev_pos after each return from filemap_get_pages(). But the read request from the user may not be fully completed at this point. The updated prev_pos impacts the subsequent readahead window. The real problem is a performance drop of fsck_msdos between linux-5.4 and linux-5.15 (also linux-6.4). Compared to linux-5.4, it spends about 110% of the time and reads 140% of the pages. The read pattern of fsck_msdos is not fully sequential. A simplified read pattern of fsck_msdos looks like this: 1.read at page offset 0xa,size 0x1000 2.read at other page offset like 0x20,size 0x1000 3.read at page offset 0xa,size 0x4000 4.read at page offset 0xe,size 0x1000 Here is the read status on linux-6.4: 1.after read at page offset 0xa,size 0x1000 ->page ofs 0xa go into pagecache 2.after read at page offset 0x20,size 0x1000 ->page ofs 0x20 go into pagecache 3.read at page offset 0xa,size 0x4000 ->filemap_get_pages read ofs 0xa from pagecache and returns ->prev_pos is updated to 0xb and goto next loop ->filemap_get_pages tends to read ofs 0xb,size 0x3000 ->initial_readahead case in ondemand_readahead since prev_pos is the same as request ofs. ->read 8 pages while async size is 5 pages (PageReadahead flag at page 0xe) 4.read at page offset 0xe,size 0x1000 ->hit page 0xe with PageReadahead flag set,double the ra_size.
read 16 pages while async size is 16 pages Now it reads 24 pages while actually uses 5 pages on linux-5.4: 1.the same as 6.4 2.the same as 6.4 3.read at page offset 0xa,size 0x4000 ->read ofs 0xa from pagecache ->read ofs 0xb,size 0x3000 using page_cache_sync_readahead read 3 pages ->prev_pos is updated to 0xd before generic_file_buffered_read returns 4.read at page offset 0xe,size 0x1000 ->initial_readahead case in ondemand_readahead since request ofs-prev_pos==1 ->read 4 pages while async size is 3 pages Now it reads 7 pages while actually uses 5 pages. In above demo, the initial_readahead case is triggered by offset of user request on linux-5.4. While it may be triggered by update logic of prev_pos on linux-6.4. To fix the performance drop, update prev_pos after finishing one read request. Link: https://lkml.kernel.org/r/20230628110220.120134-1-haibo.li@mediatek.com Signed-off-by: Haibo Li Reviewed-by: Jan Kara Cc: AngeloGioacchino Del Regno Cc: Matthew Wilcox Cc: Matthias Brugger Signed-off-by: Andrew Morton --- mm/filemap.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index c5e2c70ea04687..93e495d2d47749 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2632,6 +2632,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, int i, error = 0; bool writably_mapped; loff_t isize, end_offset; + loff_t last_pos = ra->prev_pos; if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes)) return 0; @@ -2682,8 +2683,8 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, * When a read accesses the same folio several times, only * mark it as accessed the first time. 
*/ - if (!pos_same_folio(iocb->ki_pos, ra->prev_pos - 1, - fbatch.folios[0])) + if (!pos_same_folio(iocb->ki_pos, last_pos - 1, + fbatch.folios[0])) folio_mark_accessed(fbatch.folios[0]); for (i = 0; i < folio_batch_count(&fbatch); i++) { @@ -2710,7 +2711,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, already_read += copied; iocb->ki_pos += copied; - ra->prev_pos = iocb->ki_pos; + last_pos = iocb->ki_pos; if (copied < bytes) { error = -EFAULT; @@ -2724,7 +2725,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); file_accessed(filp); - + ra->prev_pos = last_pos; return already_read ? already_read : error; } EXPORT_SYMBOL_GPL(filemap_read); From d6e8d0dc19a3ebea185cd8e99f2e960d81b153ad Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Wed, 28 Jun 2023 15:36:54 +0800 Subject: [PATCH 023/489] maple_tree: add test for mas_wr_modify() fast path Patch series "Optimize the fast path of mas_store()", v4. Add fast paths for mas_wr_append() and mas_wr_slot_store() respectively. The newly added fast path of mas_wr_append() is used in fork() and how much it benefits fork() depends on how many VMAs are duplicated. Thanks Liam for the review. This patch (of 4): Add tests for all cases of mas_wr_append() and mas_wr_slot_store(). Link: https://lkml.kernel.org/r/20230628073657.75314-1-zhangpeng.00@bytedance.com Link: https://lkml.kernel.org/r/20230628073657.75314-2-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. 
Howlett Signed-off-by: Andrew Morton --- lib/test_maple_tree.c | 65 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index 8d4c92cbdd0cfa..3207c21079184a 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -1157,6 +1157,71 @@ static noinline void __init check_ranges(struct maple_tree *mt) MT_BUG_ON(mt, !mt_height(mt)); mtree_destroy(mt); + /* Check in-place modifications */ + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + /* Append to the start of last range */ + mt_set_non_kernel(50); + for (i = 0; i <= 500; i++) { + val = i * 5 + 1; + val2 = val + 4; + check_store_range(mt, val, val2, xa_mk_value(val), 0); + } + + /* Append to the last range without touching any boundaries */ + for (i = 0; i < 10; i++) { + val = val2 + 5; + val2 = val + 4; + check_store_range(mt, val, val2, xa_mk_value(val), 0); + } + + /* Append to the end of last range */ + val = val2; + for (i = 0; i < 10; i++) { + val += 5; + MT_BUG_ON(mt, mtree_test_store_range(mt, val, ULONG_MAX, + xa_mk_value(val)) != 0); + } + + /* Overwriting the range and over a part of the next range */ + for (i = 10; i < 30; i += 2) { + val = i * 5 + 1; + val2 = val + 5; + check_store_range(mt, val, val2, xa_mk_value(val), 0); + } + + /* Overwriting a part of the range and over the next range */ + for (i = 50; i < 70; i += 2) { + val2 = i * 5; + val = val2 - 5; + check_store_range(mt, val, val2, xa_mk_value(val), 0); + } + + /* + * Expand the range, only partially overwriting the previous and + * next ranges + */ + for (i = 100; i < 130; i += 3) { + val = i * 5 - 5; + val2 = i * 5 + 1; + check_store_range(mt, val, val2, xa_mk_value(val), 0); + } + + /* + * Expand the range, only partially overwriting the previous and + * next ranges, in RCU mode + */ + mt_set_in_rcu(mt); + for (i = 150; i < 180; i += 3) { + val = i * 5 - 5; + val2 = i * 5 + 1; + check_store_range(mt, val, val2, xa_mk_value(val), 0); + } + + MT_BUG_ON(mt, 
!mt_height(mt)); + mt_validate(mt); + mt_set_non_kernel(0); + mtree_destroy(mt); + /* Test rebalance gaps */ mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); mt_set_non_kernel(50); From c38d9ff2cc6777f7dc5c02fcb4842402ab5f362a Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Wed, 28 Jun 2023 15:36:55 +0800 Subject: [PATCH 024/489] maple_tree: add test for expanding range in RCU mode Add test for expanding range in RCU mode. If we use the fast path of the slot store to expand range in RCU mode, this test will fail. Link: https://lkml.kernel.org/r/20230628073657.75314-3-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- tools/testing/radix-tree/maple.c | 75 ++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 75ea2081a317e5..9901ae821911a2 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -45,6 +45,13 @@ struct rcu_test_struct2 { unsigned long last[RCU_RANGE_COUNT]; }; +struct rcu_test_struct3 { + struct maple_tree *mt; + unsigned long index; + unsigned long last; + bool stop; +}; + struct rcu_reader_struct { unsigned int id; int mod; @@ -34954,6 +34961,70 @@ void run_check_rcu(struct maple_tree *mt, struct rcu_test_struct *vals) MT_BUG_ON(mt, !vals->seen_entry2); } +static void *rcu_slot_store_reader(void *ptr) +{ + struct rcu_test_struct3 *test = ptr; + MA_STATE(mas, test->mt, test->index, test->index); + + rcu_register_thread(); + + rcu_read_lock(); + while (!test->stop) { + mas_walk(&mas); + /* The length of growth to both sides must be equal. 
*/ + RCU_MT_BUG_ON(test, (test->index - mas.index) != + (mas.last - test->last)); + } + rcu_read_unlock(); + + rcu_unregister_thread(); + return NULL; +} + +static noinline void run_check_rcu_slot_store(struct maple_tree *mt) +{ + pthread_t readers[20]; + int range_cnt = 200, i, limit = 10000; + unsigned long len = ULONG_MAX / range_cnt, start, end; + struct rcu_test_struct3 test = {.stop = false, .mt = mt}; + + start = range_cnt / 2 * len; + end = start + len - 1; + test.index = start; + test.last = end; + + for (i = 0; i < range_cnt; i++) { + mtree_store_range(mt, i * len, i * len + len - 1, + xa_mk_value(i * 100), GFP_KERNEL); + } + + mt_set_in_rcu(mt); + MT_BUG_ON(mt, !mt_in_rcu(mt)); + + for (i = 0; i < ARRAY_SIZE(readers); i++) { + if (pthread_create(&readers[i], NULL, rcu_slot_store_reader, + &test)) { + perror("creating reader thread"); + exit(1); + } + } + + usleep(5); + + while (limit--) { + /* Step by step, expand the most middle range to both sides. */ + mtree_store_range(mt, --start, ++end, xa_mk_value(100), + GFP_KERNEL); + } + + test.stop = true; + + while (i--) + pthread_join(readers[i], NULL); + + mt_validate(mt); +} + static noinline void run_check_rcu_slowread(struct maple_tree *mt, struct rcu_test_struct *vals) { @@ -35206,6 +35277,10 @@ static noinline void __init check_rcu_threaded(struct maple_tree *mt) run_check_rcu(mt, &vals); mtree_destroy(mt); + /* Check expanding range in RCU mode */ + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + run_check_rcu_slot_store(mt); + mtree_destroy(mt); /* Forward writer for rcu stress */ mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); From 23e9dde0b246d47e4a1942ea50bf7fef63e2d41a Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Wed, 28 Jun 2023 15:36:56 +0800 Subject: [PATCH 025/489] maple_tree: optimize mas_wr_append(), also improve duplicating VMAs When the new range can be completely covered by the original last range without touching the boundaries on both sides, two new entries can be appended to the end as a fast 
path. We update the original last pivot at the end, and the newly appended two entries will not be accessed before this, so it is also safe in RCU mode. This is useful for sequential insertion, which is what we do in dup_mmap(). Enabling BENCH_FORK in test_maple_tree and just running bench_forking() gives the following time-consuming numbers: before: after: 17,874.83 msec 15,738.38 msec It shows about a 12% performance improvement for duplicating VMAs. Link: https://lkml.kernel.org/r/20230628073657.75314-4-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index f512bb9766aad1..3e0c91c017813c 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4267,10 +4267,10 @@ static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas) * * Return: True if appended, false otherwise */ -static inline bool mas_wr_append(struct ma_wr_state *wr_mas) +static inline bool mas_wr_append(struct ma_wr_state *wr_mas, + unsigned char new_end) { unsigned char end = wr_mas->node_end; - unsigned char new_end = end + 1; struct ma_state *mas = wr_mas->mas; unsigned char node_pivots = mt_pivots[wr_mas->type]; @@ -4282,16 +4282,27 @@ static inline bool mas_wr_append(struct ma_wr_state *wr_mas) ma_set_meta(wr_mas->node, maple_leaf_64, 0, new_end); } - if (mas->last == wr_mas->r_max) { - /* Append to end of range */ - rcu_assign_pointer(wr_mas->slots[new_end], wr_mas->entry); - wr_mas->pivots[end] = mas->index - 1; - mas->offset = new_end; + if (new_end == wr_mas->node_end + 1) { + if (mas->last == wr_mas->r_max) { + /* Append to end of range */ + rcu_assign_pointer(wr_mas->slots[new_end], + wr_mas->entry); + wr_mas->pivots[end] = mas->index - 1; + mas->offset = new_end; + } else { + /* Append to start of range */ + rcu_assign_pointer(wr_mas->slots[new_end], + 
wr_mas->content); + wr_mas->pivots[end] = mas->last; + rcu_assign_pointer(wr_mas->slots[end], wr_mas->entry); + } } else { - /* Append to start of range */ + /* Append to the range without touching any boundaries. */ rcu_assign_pointer(wr_mas->slots[new_end], wr_mas->content); - wr_mas->pivots[end] = mas->last; - rcu_assign_pointer(wr_mas->slots[end], wr_mas->entry); + wr_mas->pivots[end + 1] = mas->last; + rcu_assign_pointer(wr_mas->slots[end + 1], wr_mas->entry); + wr_mas->pivots[end] = mas->index - 1; + mas->offset = end + 1; } if (!wr_mas->content || !wr_mas->entry) @@ -4338,7 +4349,7 @@ static inline void mas_wr_modify(struct ma_wr_state *wr_mas) goto slow_path; /* Attempt to append */ - if (new_end == wr_mas->node_end + 1 && mas_wr_append(wr_mas)) + if (mas_wr_append(wr_mas, new_end)) return; if (new_end == wr_mas->node_end && mas_wr_slot_store(wr_mas)) From 64891ba3e51fb841b0af70db029038eb93bd5a43 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Wed, 28 Jun 2023 15:36:57 +0800 Subject: [PATCH 026/489] maple_tree: add a fast path case in mas_wr_slot_store() When expanding a range in two directions, only partially overwriting the previous and next ranges, the number of entries will not be increased, so we can just update the pivots as a fast path. However, it may introduce potential risks in RCU mode, because it updates two pivots. We only enable it in non-RCU mode. Link: https://lkml.kernel.org/r/20230628073657.75314-5-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. 
Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 3e0c91c017813c..7fad4a7a0b058c 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4168,23 +4168,35 @@ static inline bool mas_wr_slot_store(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char offset = mas->offset; + void __rcu **slots = wr_mas->slots; bool gap = false; - if (wr_mas->offset_end - offset != 1) - return false; - - gap |= !mt_slot_locked(mas->tree, wr_mas->slots, offset); - gap |= !mt_slot_locked(mas->tree, wr_mas->slots, offset + 1); + gap |= !mt_slot_locked(mas->tree, slots, offset); + gap |= !mt_slot_locked(mas->tree, slots, offset + 1); - if (mas->index == wr_mas->r_min) { - /* Overwriting the range and over a part of the next range. */ - rcu_assign_pointer(wr_mas->slots[offset], wr_mas->entry); - wr_mas->pivots[offset] = mas->last; - } else { - /* Overwriting a part of the range and over the next range */ - rcu_assign_pointer(wr_mas->slots[offset + 1], wr_mas->entry); + if (wr_mas->offset_end - offset == 1) { + if (mas->index == wr_mas->r_min) { + /* Overwriting the range and a part of the next one */ + rcu_assign_pointer(slots[offset], wr_mas->entry); + wr_mas->pivots[offset] = mas->last; + } else { + /* Overwriting a part of the range and the next one */ + rcu_assign_pointer(slots[offset + 1], wr_mas->entry); + wr_mas->pivots[offset] = mas->index - 1; + mas->offset++; /* Keep mas accurate. */ + } + } else if (!mt_in_rcu(mas->tree)) { + /* + * Expand the range, only partially overwriting the previous and + * next ranges + */ + gap |= !mt_slot_locked(mas->tree, slots, offset + 2); + rcu_assign_pointer(slots[offset + 1], wr_mas->entry); wr_mas->pivots[offset] = mas->index - 1; + wr_mas->pivots[offset + 1] = mas->last; mas->offset++; /* Keep mas accurate. 
*/ + } else { + return false; } trace_ma_write(__func__, mas, 0, wr_mas->entry); From b7b618da0edc85280e1c9c8f4f5239571e7c1d3e Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Wed, 28 Jun 2023 09:49:29 +0800 Subject: [PATCH 027/489] mm: memory-failure: remove unneeded page state check in shake_page() Remove unneeded PageLRU(p) and is_free_buddy_page(p) check as slab caches are not shrunk now. This check can be added back when a lightweight range based shrinker is available. Link: https://lkml.kernel.org/r/20230628014929.3441386-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ece5d481b5fff9..1ddb25a1073ea6 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -363,17 +363,14 @@ void shake_page(struct page *p) { if (PageHuge(p)) return; - - if (!PageSlab(p)) { - lru_add_drain_all(); - if (PageLRU(p) || is_free_buddy_page(p)) - return; - } - /* * TODO: Could shrink slab caches here if a lightweight range-based * shrinker will be available. */ + if (PageSlab(p)) + return; + + lru_add_drain_all(); } EXPORT_SYMBOL_GPL(shake_page); From f58d7907a39f8d0381db56af4a590915fd94ee58 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 26 Jun 2023 20:10:53 +0800 Subject: [PATCH 028/489] memory tier: use helper function destroy_memory_type() Use helper function destroy_memory_type() to release memtype instead of open code it to help improve code readability a bit. No functional change intended. 
Link: https://lkml.kernel.org/r/20230626121053.1916447-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Aneesh Kumar K.V Cc: "Huang, Ying" Cc: Wei Xu Signed-off-by: Andrew Morton --- mm/memory-tiers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index a516e303e30435..1719fa3bcf0244 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -586,7 +586,7 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype) */ if (!node_memory_types[node].map_count) { node_memory_types[node].memtype = NULL; - kref_put(&memtype->kref, release_memtype); + destroy_memory_type(memtype); } mutex_unlock(&memory_tier_lock); } From 1a7d018dc38b6851c602b448bdac2e78b46857db Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 26 Jun 2023 19:43:43 +0800 Subject: [PATCH 029/489] mm: memory-failure: remove unneeded 'inline' annotation Remove unneeded 'inline' annotation from num_poisoned_pages_inc() and num_poisoned_pages_sub(). No functional change intended. 
Link: https://lkml.kernel.org/r/20230626114343.1846587-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 1ddb25a1073ea6..9d87f0b8b8057c 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -75,13 +75,13 @@ atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); static bool hw_memory_failure __read_mostly = false; -inline void num_poisoned_pages_inc(unsigned long pfn) +void num_poisoned_pages_inc(unsigned long pfn) { atomic_long_inc(&num_poisoned_pages); memblk_nr_poison_inc(pfn); } -inline void num_poisoned_pages_sub(unsigned long pfn, long i) +void num_poisoned_pages_sub(unsigned long pfn, long i) { atomic_long_sub(i, &num_poisoned_pages); if (pfn != -1UL) From 489b7e72a63cbda11b3a2cd6c895b22917d53065 Mon Sep 17 00:00:00 2001 From: Bean Huo Date: Mon, 26 Jun 2023 07:55:17 +0200 Subject: [PATCH 030/489] fs/buffer: clean up block_commit_write MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Originally inode is used to get blksize, after commit 45bce8f3e343 ("fs/buffer.c: make block-size be per-page and protected by the page lock"), __block_commit_write no longer uses this parameter inode. 
[akpm@linux-foundation.org: remove now-unused local `inode'] Link: https://lkml.kernel.org/r/20230626055518.842392-2-beanhuo@iokpp.de Signed-off-by: Bean Huo Reviewed-by: Christoph Hellwig Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Cc: Al Viro Cc: Andreas Dilger Cc: Christian Brauner Cc: Joel Becker Cc: Joseph Qi Cc: Luís Henriques Cc: Mark Fasheh Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/buffer.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index bd091329026c0f..a23af7826d1c98 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2180,8 +2180,7 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len, } EXPORT_SYMBOL(__block_write_begin); -static int __block_commit_write(struct inode *inode, struct folio *folio, - size_t from, size_t to) +static int __block_commit_write(struct folio *folio, size_t from, size_t to) { size_t block_start, block_end; bool partial = false; @@ -2253,7 +2252,6 @@ int block_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct folio *folio = page_folio(page); - struct inode *inode = mapping->host; size_t start = pos - folio_pos(folio); if (unlikely(copied < len)) { @@ -2277,7 +2275,7 @@ int block_write_end(struct file *file, struct address_space *mapping, flush_dcache_folio(folio); /* This could be a short (even 0-length) commit */ - __block_commit_write(inode, folio, start, start + copied); + __block_commit_write(folio, start, start + copied); return copied; } @@ -2601,8 +2599,7 @@ EXPORT_SYMBOL(cont_write_begin); int block_commit_write(struct page *page, unsigned from, unsigned to) { struct folio *folio = page_folio(page); - struct inode *inode = folio->mapping->host; - __block_commit_write(inode, folio, from, to); + __block_commit_write(folio, from, to); return 0; } EXPORT_SYMBOL(block_commit_write); @@ -2650,7 +2647,7 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, 
ret = __block_write_begin_int(folio, 0, end, get_block, NULL); if (!ret) - ret = __block_commit_write(inode, folio, 0, end); + ret = __block_commit_write(folio, 0, end); if (unlikely(ret < 0)) goto out_unlock; From a524fcfe190da16bbf1311b6636f51d81f35d59a Mon Sep 17 00:00:00 2001 From: Bean Huo Date: Mon, 26 Jun 2023 07:55:18 +0200 Subject: [PATCH 031/489] fs: convert block_commit_write to return void MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit block_commit_write() always returns 0, this patch changes it to return void. Link: https://lkml.kernel.org/r/20230626055518.842392-3-beanhuo@iokpp.de Signed-off-by: Bean Huo Reviewed-by: Jan Kara Acked-by: Theodore Ts'o Reviewed-by: Christoph Hellwig Reviewed-by: Matthew Wilcox (Oracle) Cc: Al Viro Cc: Andreas Dilger Cc: Christian Brauner Cc: Joel Becker Cc: Joseph Qi Cc: Luís Henriques Cc: Mark Fasheh Signed-off-by: Andrew Morton --- fs/buffer.c | 14 ++++++-------- fs/ext4/move_extent.c | 7 ++----- fs/ocfs2/file.c | 7 +------ fs/udf/file.c | 6 +++--- include/linux/buffer_head.h | 2 +- 5 files changed, 13 insertions(+), 23 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index a23af7826d1c98..587e4d4af9deef 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2180,7 +2180,7 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len, } EXPORT_SYMBOL(__block_write_begin); -static int __block_commit_write(struct folio *folio, size_t from, size_t to) +static void __block_commit_write(struct folio *folio, size_t from, size_t to) { size_t block_start, block_end; bool partial = false; @@ -2215,7 +2215,6 @@ static int __block_commit_write(struct folio *folio, size_t from, size_t to) */ if (!partial) folio_mark_uptodate(folio); - return 0; } /* @@ -2596,11 +2595,10 @@ int cont_write_begin(struct file *file, struct address_space *mapping, } EXPORT_SYMBOL(cont_write_begin); -int block_commit_write(struct page *page, unsigned from, unsigned to) +void block_commit_write(struct page 
*page, unsigned from, unsigned to) { struct folio *folio = page_folio(page); __block_commit_write(folio, from, to); - return 0; } EXPORT_SYMBOL(block_commit_write); @@ -2646,11 +2644,11 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, end = size - folio_pos(folio); ret = __block_write_begin_int(folio, 0, end, get_block, NULL); - if (!ret) - ret = __block_commit_write(folio, 0, end); - - if (unlikely(ret < 0)) + if (unlikely(ret)) goto out_unlock; + + __block_commit_write(folio, 0, end); + folio_mark_dirty(folio); folio_wait_stable(folio); return 0; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index b5af2fc03b2f1b..f4b4861a74ee19 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -392,14 +392,11 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, for (i = 0; i < block_len_in_page; i++) { *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0); if (*err < 0) - break; + goto repair_branches; bh = bh->b_this_page; } - if (!*err) - *err = block_commit_write(&folio[0]->page, from, from + replaced_size); - if (unlikely(*err < 0)) - goto repair_branches; + block_commit_write(&folio[0]->page, from, from + replaced_size); /* Even in case of data=writeback it is reasonable to pin * inode to transaction, to prevent unexpected data loss */ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 91a194596552e2..9e417cd4fd16fe 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -808,12 +808,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, /* must not update i_size! 
*/ - ret = block_commit_write(page, block_start + 1, - block_start + 1); - if (ret < 0) - mlog_errno(ret); - else - ret = 0; + block_commit_write(page, block_start + 1, block_start + 1); } /* diff --git a/fs/udf/file.c b/fs/udf/file.c index 243840dc83addf..0292d75e60ccd5 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -63,13 +63,13 @@ static vm_fault_t udf_page_mkwrite(struct vm_fault *vmf) else end = PAGE_SIZE; err = __block_write_begin(page, 0, end, udf_get_block); - if (!err) - err = block_commit_write(page, 0, end); - if (err < 0) { + if (err) { unlock_page(page); ret = block_page_mkwrite_return(err); goto out_unlock; } + + block_commit_write(page, 0, end); out_dirty: set_page_dirty(page); wait_for_stable_page(page); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 6cb3e9af78c9ed..a7377877ff4ed7 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -288,7 +288,7 @@ int cont_write_begin(struct file *, struct address_space *, loff_t, unsigned, struct page **, void **, get_block_t *, loff_t *); int generic_cont_expand_simple(struct inode *inode, loff_t size); -int block_commit_write(struct page *page, unsigned from, unsigned to); +void block_commit_write(struct page *page, unsigned int from, unsigned int to); int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block); /* Convert errno to return value from ->page_mkwrite() call */ From 416ef04fe00c5f2f6fb8e13d8dbe1b5a0a274f83 Mon Sep 17 00:00:00 2001 From: liuq Date: Sun, 25 Jun 2023 11:16:56 +0800 Subject: [PATCH 032/489] mm/page_alloc: fix min_free_kbytes calculation regarding ZONE_MOVABLE The current calculation of min_free_kbytes only uses ZONE_DMA and ZONE_NORMAL pages,but the ZONE_MOVABLE zone->_watermark[WMARK_MIN] will also divide part of min_free_kbytes.This will cause the min watermark of ZONE_NORMAL to be too small in the presence of ZONE_MOVEABLE. 
__GFP_HIGH and PF_MEMALLOC allocations usually don't need movable zone pages, so just like ZONE_HIGHMEM, cap pages_min to a small value in __setup_per_zone_wmarks(). On my testing machine with 16GB of memory (transparent hugepage is turned off by default, and movablecore=12G is configured), the following is comparative test data of watermark_min, given as "no patch / add patch": ZONE_DMA 1 / 8; ZONE_DMA32 151 / 709; ZONE_NORMAL 233 / 1113; ZONE_MOVABLE 1434 / 128; min_free_kbytes 7288 / 7326. Link: https://lkml.kernel.org/r/20230625031656.23941-1-liuq131@chinatelecom.cn Signed-off-by: liuq Reviewed-by: "Huang, Ying" Signed-off-by: Andrew Morton --- mm/page_alloc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7d3460c7a480b5..fe04c4e85c42dc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5694,9 +5694,9 @@ static void __setup_per_zone_wmarks(void) struct zone *zone; unsigned long flags; - /* Calculate total number of !ZONE_HIGHMEM pages */ + /* Calculate total number of !ZONE_HIGHMEM and !ZONE_MOVABLE pages */ for_each_zone(zone) { - if (!is_highmem(zone)) + if (!is_highmem(zone) && zone_idx(zone) != ZONE_MOVABLE) lowmem_pages += zone_managed_pages(zone); } @@ -5706,15 +5706,15 @@ static void __setup_per_zone_wmarks(void) spin_lock_irqsave(&zone->lock, flags); tmp = (u64)pages_min * zone_managed_pages(zone); do_div(tmp, lowmem_pages); - if (is_highmem(zone)) { + if (is_highmem(zone) || zone_idx(zone) == ZONE_MOVABLE) { /* * __GFP_HIGH and PF_MEMALLOC allocations usually don't - * need highmem pages, so cap pages_min to a small - * value here. + * need highmem and movable zones pages, so cap pages_min + * to a small value here. * * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN) * deltas control async page reclaim, and so should - * not be capped for highmem. + * not be capped for highmem and movable zones.
*/ unsigned long min_pages; From 3fade62b62e84dd8dbf6e92d494b0e7eca750c43 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sun, 25 Jun 2023 10:13:23 +0800 Subject: [PATCH 033/489] mm/mm_init.c: remove obsolete macro HASH_SMALL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HASH_SMALL only works when parameter numentries is 0. But the sole caller futex_init() never calls alloc_large_system_hash() with numentries set to 0. So HASH_SMALL is obsolete and remove it. Link: https://lkml.kernel.org/r/20230625021323.849147-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Mike Rapoport (IBM) Cc: André Almeida Cc: Darren Hart Cc: Davidlohr Bueso Cc: Ingo Molnar Cc: Miaohe Lin Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/memblock.h | 4 +--- kernel/futex/core.c | 3 +-- mm/mm_init.c | 10 +--------- 3 files changed, 3 insertions(+), 14 deletions(-) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index f71ff9f0ec81ad..0d031fbfea2537 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -581,9 +581,7 @@ extern void *alloc_large_system_hash(const char *tablename, unsigned long high_limit); #define HASH_EARLY 0x00000001 /* Allocating during early boot? */ -#define HASH_SMALL 0x00000002 /* sub-page allocation allowed, min - * shift passed via *_hash_shift */ -#define HASH_ZERO 0x00000004 /* Zero allocated hash table */ +#define HASH_ZERO 0x00000002 /* Zero allocated hash table */ /* Only NUMA needs hash distribution. 64bit NUMA architectures have * sufficient vmalloc space. diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 514e4582b86341..f10587d1d48170 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -1132,8 +1132,7 @@ static int __init futex_init(void) #endif futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), - futex_hashsize, 0, - futex_hashsize < 256 ? 
HASH_SMALL : 0, + futex_hashsize, 0, 0, &futex_shift, NULL, futex_hashsize, futex_hashsize); futex_hashsize = 1UL << futex_shift; diff --git a/mm/mm_init.c b/mm/mm_init.c index d356ba59ef2a3b..f90db54e2b21d1 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2489,15 +2489,7 @@ void *__init alloc_large_system_hash(const char *tablename, else numentries <<= (PAGE_SHIFT - scale); - /* Make sure we've got at least a 0-order allocation.. */ - if (unlikely(flags & HASH_SMALL)) { - /* Makes no sense without HASH_EARLY */ - WARN_ON(!(flags & HASH_EARLY)); - if (!(numentries >> *_hash_shift)) { - numentries = 1UL << *_hash_shift; - BUG_ON(!numentries); - } - } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) + if (unlikely((numentries * bucketsize) < PAGE_SIZE)) numentries = PAGE_SIZE / bucketsize; } numentries = roundup_pow_of_two(numentries); From df9cd3cbf209b9357f2939ba63afa427026e954b Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Sat, 24 Jun 2023 14:12:14 +0900 Subject: [PATCH 034/489] zsmalloc: do not scan for allocated objects in empty zspage Patch series "zsmalloc: small compaction improvements", v2. A tiny series that can reduce the number of find_alloced_obj() invocations (which perform a linear scan of sub-page) during compaction. Inspired by Alexey Romanov's findings. This patch (of 3): zspage migration can terminate as soon as it moves the last allocated object from the source zspage. Add a simple helper zspage_empty() that tests zspage ->inuse on each migration iteration. 
Link: https://lkml.kernel.org/r/20230624053120.643409-2-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Suggested-by: Alexey Romanov Reviewed-by: Alexey Romanov Acked-by: Minchan Kim Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 3f057970504e3b..5d60eaedc3b794 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1147,6 +1147,11 @@ static bool zspage_full(struct size_class *class, struct zspage *zspage) return get_zspage_inuse(zspage) == class->objs_per_zspage; } +static bool zspage_empty(struct zspage *zspage) +{ + return get_zspage_inuse(zspage) == 0; +} + /** * zs_lookup_class_index() - Returns index of the zsmalloc &size_class * that hold objects of the provided size. @@ -1625,6 +1630,10 @@ static void migrate_zspage(struct zs_pool *pool, struct size_class *class, obj_idx++; record_obj(handle, free_obj); obj_free(class->size, used_obj); + + /* Stop if there are no more objects to migrate */ + if (zspage_empty(get_zspage(s_page))) + break; } /* Remember last position in this iteration */ From 4ce36584da19ff6a5d171a317c6c34c567d4628e Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Sat, 24 Jun 2023 14:12:15 +0900 Subject: [PATCH 035/489] zsmalloc: move migration destination zspage inuse check Destination zspage fullness check need to be done after zs_object_copy() because that's where source and destination zspages fullness change. Checking destination zspage fullness before zs_object_copy() may cause migration to loop through source zspage sub-pages scanning for allocate objects just to find out at the end that the destination zspage is full. 
Link: https://lkml.kernel.org/r/20230624053120.643409-3-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Alexey Romanov Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 5d60eaedc3b794..4a84f787766912 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1620,10 +1620,6 @@ static void migrate_zspage(struct zs_pool *pool, struct size_class *class, continue; } - /* Stop if there is no more space */ - if (zspage_full(class, get_zspage(d_page))) - break; - used_obj = handle_to_obj(handle); free_obj = obj_malloc(pool, get_zspage(d_page), handle); zs_object_copy(class, free_obj, used_obj); @@ -1631,6 +1627,10 @@ static void migrate_zspage(struct zs_pool *pool, struct size_class *class, record_obj(handle, free_obj); obj_free(class->size, used_obj); + /* Stop if there is no more space */ + if (zspage_full(class, get_zspage(d_page))) + break; + /* Stop if there are no more objects to migrate */ if (zspage_empty(get_zspage(s_page))) break; From ada5caed79b313b3046839c9de9bf9048561e4bb Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Sat, 24 Jun 2023 14:12:16 +0900 Subject: [PATCH 036/489] zsmalloc: remove zs_compact_control __zs_compact always putback src_zspage into class list after migrate_zspage. Thus, we don't need to keep last position of src_zspage any more. Let's remove it. 
Link: https://lkml.kernel.org/r/20230624053120.643409-4-senozhatsky@chromium.org Signed-off-by: Minchan Kim Signed-off-by: Sergey Senozhatsky Acked-by: Sergey Senozhatsky Cc: Alexey Romanov Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 37 +++++++++---------------------------- 1 file changed, 9 insertions(+), 28 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 4a84f787766912..84beadc088b833 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1590,25 +1590,14 @@ static unsigned long find_alloced_obj(struct size_class *class, return find_tagged_obj(class, page, obj_idx, OBJ_ALLOCATED_TAG); } -struct zs_compact_control { - /* Source spage for migration which could be a subpage of zspage */ - struct page *s_page; - /* Destination page for migration which should be a first page - * of zspage. */ - struct page *d_page; - /* Starting object index within @s_page which used for live object - * in the subpage. */ - int obj_idx; -}; - -static void migrate_zspage(struct zs_pool *pool, struct size_class *class, - struct zs_compact_control *cc) +static void migrate_zspage(struct zs_pool *pool, struct zspage *src_zspage, + struct zspage *dst_zspage) { unsigned long used_obj, free_obj; unsigned long handle; - struct page *s_page = cc->s_page; - struct page *d_page = cc->d_page; - int obj_idx = cc->obj_idx; + int obj_idx = 0; + struct page *s_page = get_first_page(src_zspage); + struct size_class *class = pool->size_class[src_zspage->class]; while (1) { handle = find_alloced_obj(class, s_page, &obj_idx); @@ -1621,24 +1610,20 @@ static void migrate_zspage(struct zs_pool *pool, struct size_class *class, } used_obj = handle_to_obj(handle); - free_obj = obj_malloc(pool, get_zspage(d_page), handle); + free_obj = obj_malloc(pool, dst_zspage, handle); zs_object_copy(class, free_obj, used_obj); obj_idx++; record_obj(handle, free_obj); obj_free(class->size, used_obj); /* Stop if there is no more space */ - if (zspage_full(class, get_zspage(d_page))) + if (zspage_full(class, 
dst_zspage)) break; /* Stop if there are no more objects to migrate */ - if (zspage_empty(get_zspage(s_page))) + if (zspage_empty(src_zspage)) break; } - - /* Remember last position in this iteration */ - cc->s_page = s_page; - cc->obj_idx = obj_idx; } static struct zspage *isolate_src_zspage(struct size_class *class) @@ -2013,7 +1998,6 @@ static unsigned long zs_can_compact(struct size_class *class) static unsigned long __zs_compact(struct zs_pool *pool, struct size_class *class) { - struct zs_compact_control cc; struct zspage *src_zspage = NULL; struct zspage *dst_zspage = NULL; unsigned long pages_freed = 0; @@ -2031,7 +2015,6 @@ static unsigned long __zs_compact(struct zs_pool *pool, if (!dst_zspage) break; migrate_write_lock(dst_zspage); - cc.d_page = get_first_page(dst_zspage); } src_zspage = isolate_src_zspage(class); @@ -2040,9 +2023,7 @@ static unsigned long __zs_compact(struct zs_pool *pool, migrate_write_lock_nested(src_zspage); - cc.obj_idx = 0; - cc.s_page = get_first_page(src_zspage); - migrate_zspage(pool, class, &cc); + migrate_zspage(pool, src_zspage, dst_zspage); fg = putback_zspage(class, src_zspage); migrate_write_unlock(src_zspage); From a2ebb51575828209b3e9d6f3c6576f7a7c70d0f6 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 23 Jun 2023 22:15:17 +0200 Subject: [PATCH 037/489] mm/page_alloc: use write_seqlock_irqsave() instead write_seqlock() + local_irq_save(). __build_all_zonelists() acquires zonelist_update_seq by first disabling interrupts via local_irq_save() and then acquiring the seqlock with write_seqlock(). This is troublesome and leads to problems on PREEMPT_RT. The problem is that the inner spinlock_t becomes a sleeping lock on PREEMPT_RT and must not be acquired with disabled interrupts. The API provides write_seqlock_irqsave() which does the right thing in one step. printk_deferred_enter() has to be invoked in non-migrate-able context to ensure that deferred printing is enabled and disabled on the same CPU. 
This is the case after zonelist_update_seq has been acquired. There was discussion on the first submission that the order should be: local_irq_disable(); printk_deferred_enter(); write_seqlock(); to avoid pitfalls like having an unaccounted printk() coming from write_seqlock_irqsave() before printk_deferred_enter() is invoked. The only origin of such a printk() can be a lockdep splat because the lockdep annotation happens after the sequence count is incremented. This is exceptional and subject to change. It was also pointed out that PREEMPT_RT can be affected by the printk problem since its write_seqlock_irqsave() does not really disable interrupts. This isn't the case because PREEMPT_RT's printk implementation differs from the mainline implementation in two important aspects: - Printing happens in dedicated threads and not during the invocation of printk(). - In emergency cases where synchronous printing is used, a different driver is used which does not use tty_port::lock. Acquire zonelist_update_seq with write_seqlock_irqsave() and then defer printk output. Link: https://lkml.kernel.org/r/20230623201517.yw286Knb@linutronix.de Fixes: 1007843a91909 ("mm/page_alloc: fix potential deadlock on zonelist_update_seq seqlock") Signed-off-by: Sebastian Andrzej Siewior Acked-by: Michal Hocko Reviewed-by: David Hildenbrand Acked-by: Mel Gorman Cc: Boqun Feng Cc: Ingo Molnar Cc: John Ogness Cc: Luis Claudio R.
Goncalves Cc: Mel Gorman Cc: Peter Zijlstra Cc: Petr Mladek Cc: Tetsuo Handa Cc: Thomas Gleixner Cc: Waiman Long Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/page_alloc.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fe04c4e85c42dc..b51bbc485a2870 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5139,19 +5139,17 @@ static void __build_all_zonelists(void *data) unsigned long flags; /* - * Explicitly disable this CPU's interrupts before taking seqlock - * to prevent any IRQ handler from calling into the page allocator - * (e.g. GFP_ATOMIC) that could hit zonelist_iter_begin and livelock. + * The zonelist_update_seq must be acquired with irqsave because the + * reader can be invoked from IRQ with GFP_ATOMIC. */ - local_irq_save(flags); + write_seqlock_irqsave(&zonelist_update_seq, flags); /* - * Explicitly disable this CPU's synchronous printk() before taking - * seqlock to prevent any printk() from trying to hold port->lock, for + * Also disable synchronous printk() to prevent any printk() from + * trying to hold port->lock, for * tty_insert_flip_string_and_push_buffer() on other CPU might be * calling kmalloc(GFP_ATOMIC | __GFP_NOWARN) with port->lock held. */ printk_deferred_enter(); - write_seqlock(&zonelist_update_seq); #ifdef CONFIG_NUMA memset(node_load, 0, sizeof(node_load)); @@ -5188,9 +5186,8 @@ static void __build_all_zonelists(void *data) #endif } - write_sequnlock(&zonelist_update_seq); printk_deferred_exit(); - local_irq_restore(flags); + write_sequnlock_irqrestore(&zonelist_update_seq, flags); } static noinline void __init From fe3b1bf19bdf122ec3d3bff03329df2044ae0f18 Mon Sep 17 00:00:00 2001 From: Domenico Cerasuolo Date: Wed, 21 Jun 2023 17:35:46 +0200 Subject: [PATCH 038/489] selftests: cgroup: add test_zswap program Patch series "selftests: cgroup: add zswap test program". This series adds 2 zswap related selftests that verify known and fixed issues. 
A new dedicated test program (test_zswap) is proposed since the test cases are specific to zswap and hosts specific helpers. The first patch adds the (empty) test program, while the other 2 add an actual test function each. This patch (of 3): Add empty cgroup-zswap self test scaffold program, test functions to be added in the next commits. Link: https://lkml.kernel.org/r/20230621153548.428093-1-cerasuolodomenico@gmail.com Link: https://lkml.kernel.org/r/20230621153548.428093-2-cerasuolodomenico@gmail.com Signed-off-by: Domenico Cerasuolo Cc: Dan Streetman Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Nhat Pham Cc: Rik van Riel Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Cc: Shuah Khan Cc: Tejun Heo Cc: Vitaly Wool Cc: Zefan Li Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/.gitignore | 1 + tools/testing/selftests/cgroup/Makefile | 2 + tools/testing/selftests/cgroup/test_zswap.c | 62 +++++++++++++++++++++ 3 files changed, 65 insertions(+) create mode 100644 tools/testing/selftests/cgroup/test_zswap.c diff --git a/tools/testing/selftests/cgroup/.gitignore b/tools/testing/selftests/cgroup/.gitignore index c4a57e69f749e3..4d556df4f77b62 100644 --- a/tools/testing/selftests/cgroup/.gitignore +++ b/tools/testing/selftests/cgroup/.gitignore @@ -5,4 +5,5 @@ test_freezer test_kmem test_kill test_cpu +test_zswap wait_inotify diff --git a/tools/testing/selftests/cgroup/Makefile b/tools/testing/selftests/cgroup/Makefile index 3d263747d2ad0b..27dbdd7bb4bb69 100644 --- a/tools/testing/selftests/cgroup/Makefile +++ b/tools/testing/selftests/cgroup/Makefile @@ -12,6 +12,7 @@ TEST_GEN_PROGS += test_core TEST_GEN_PROGS += test_freezer TEST_GEN_PROGS += test_kill TEST_GEN_PROGS += test_cpu +TEST_GEN_PROGS += test_zswap LOCAL_HDRS += $(selfdir)/clone3/clone3_selftests.h $(selfdir)/pidfd/pidfd.h @@ -23,3 +24,4 @@ $(OUTPUT)/test_core: cgroup_util.c $(OUTPUT)/test_freezer: cgroup_util.c $(OUTPUT)/test_kill: cgroup_util.c $(OUTPUT)/test_cpu: 
cgroup_util.c +$(OUTPUT)/test_zswap: cgroup_util.c diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c new file mode 100644 index 00000000000000..00153366732249 --- /dev/null +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE + +#include +#include +#include + +#include "../kselftest.h" +#include "cgroup_util.h" + +#define T(x) { x, #x } +struct zswap_test { + int (*fn)(const char *root); + const char *name; +} tests[] = { +}; +#undef T + +static bool zswap_configured(void) +{ + return access("/sys/module/zswap", F_OK) == 0; +} + +int main(int argc, char **argv) +{ + char root[PATH_MAX]; + int i, ret = EXIT_SUCCESS; + + if (cg_find_unified_root(root, sizeof(root))) + ksft_exit_skip("cgroup v2 isn't mounted\n"); + + if (!zswap_configured()) + ksft_exit_skip("zswap isn't configured\n"); + + /* + * Check that memory controller is available: + * memory is listed in cgroup.controllers + */ + if (cg_read_strstr(root, "cgroup.controllers", "memory")) + ksft_exit_skip("memory controller isn't available\n"); + + if (cg_read_strstr(root, "cgroup.subtree_control", "memory")) + if (cg_write(root, "cgroup.subtree_control", "+memory")) + ksft_exit_skip("Failed to set memory controller\n"); + + for (i = 0; i < ARRAY_SIZE(tests); i++) { + switch (tests[i].fn(root)) { + case KSFT_PASS: + ksft_test_result_pass("%s\n", tests[i].name); + break; + case KSFT_SKIP: + ksft_test_result_skip("%s\n", tests[i].name); + break; + default: + ret = EXIT_FAILURE; + ksft_test_result_fail("%s\n", tests[i].name); + break; + } + } + + return ret; +} From a549f9f31561a569e3c7a4cb6226a0b31d35fc89 Mon Sep 17 00:00:00 2001 From: Domenico Cerasuolo Date: Wed, 21 Jun 2023 17:35:47 +0200 Subject: [PATCH 039/489] selftests: cgroup: add test_zswap with no kmem bypass test Add a cgroup selftest that verifies memcg charging in zswap. 
The original issue was that kmem bypass was applied to pages swapped out to zswap by kswapd, resulting in zswapped memory not being charged. It was fixed by commit cd08d80ecdac("mm: correctly charge compressed memory to its memcg"). Link: https://lkml.kernel.org/r/20230621153548.428093-3-cerasuolodomenico@gmail.com Signed-off-by: Domenico Cerasuolo Cc: Dan Streetman Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Nhat Pham Cc: Rik van Riel Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Cc: Shuah Khan Cc: Tejun Heo Cc: Vitaly Wool Cc: Zefan Li Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_zswap.c | 163 ++++++++++++++++++++ 1 file changed, 163 insertions(+) diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index 00153366732249..e859fecd310b74 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -4,15 +4,178 @@ #include #include #include +#include +#include +#include +#include +#include #include "../kselftest.h" #include "cgroup_util.h" +static int read_int(const char *path, size_t *value) +{ + FILE *file; + int ret = 0; + + file = fopen(path, "r"); + if (!file) + return -1; + if (fscanf(file, "%ld", value) != 1) + ret = -1; + fclose(file); + return ret; +} + +static int set_min_free_kb(size_t value) +{ + FILE *file; + int ret; + + file = fopen("/proc/sys/vm/min_free_kbytes", "w"); + if (!file) + return -1; + ret = fprintf(file, "%ld\n", value); + fclose(file); + return ret; +} + +static int read_min_free_kb(size_t *value) +{ + return read_int("/proc/sys/vm/min_free_kbytes", value); +} + +static int get_zswap_stored_pages(size_t *value) +{ + return read_int("/sys/kernel/debug/zswap/stored_pages", value); +} + +struct no_kmem_bypass_child_args { + size_t target_alloc_bytes; + size_t child_allocated; +}; + +static int no_kmem_bypass_child(const char *cgroup, void *arg) +{ + struct no_kmem_bypass_child_args *values = arg; + 
void *allocation; + + allocation = malloc(values->target_alloc_bytes); + if (!allocation) { + values->child_allocated = true; + return -1; + } + for (long i = 0; i < values->target_alloc_bytes; i += 4095) + ((char *)allocation)[i] = 'a'; + values->child_allocated = true; + pause(); + free(allocation); + return 0; +} + +/* + * When pages owned by a memcg are pushed to zswap by kswapd, they should be + * charged to that cgroup. This wasn't the case before commit + * cd08d80ecdac("mm: correctly charge compressed memory to its memcg"). + * + * The test first allocates memory in a memcg, then raises min_free_kbytes to + * a very high value so that the allocation falls below low wm, then makes + * another allocation to trigger kswapd that should push the memcg-owned pages + * to zswap and verifies that the zswap pages are correctly charged. + * + * To be run on a VM with at most 4G of memory. + */ +static int test_no_kmem_bypass(const char *root) +{ + size_t min_free_kb_high, min_free_kb_low, min_free_kb_original; + struct no_kmem_bypass_child_args *values; + size_t trigger_allocation_size; + int wait_child_iteration = 0; + long stored_pages_threshold; + struct sysinfo sys_info; + int ret = KSFT_FAIL; + int child_status; + char *test_group; + pid_t child_pid; + + /* Read sys info and compute test values accordingly */ + if (sysinfo(&sys_info) != 0) + return KSFT_FAIL; + if (sys_info.totalram > 5000000000) + return KSFT_SKIP; + values = mmap(0, sizeof(struct no_kmem_bypass_child_args), PROT_READ | + PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (values == MAP_FAILED) + return KSFT_FAIL; + if (read_min_free_kb(&min_free_kb_original)) + return KSFT_FAIL; + min_free_kb_high = sys_info.totalram / 2000; + min_free_kb_low = sys_info.totalram / 500000; + values->target_alloc_bytes = (sys_info.totalram - min_free_kb_high * 1000) + + sys_info.totalram * 5 / 100; + stored_pages_threshold = sys_info.totalram / 5 / 4096; + trigger_allocation_size = sys_info.totalram / 20; + + 
/* Set up test memcg */ + if (cg_write(root, "cgroup.subtree_control", "+memory")) + goto out; + test_group = cg_name(root, "kmem_bypass_test"); + if (!test_group) + goto out; + + /* Spawn memcg child and wait for it to allocate */ + set_min_free_kb(min_free_kb_low); + if (cg_create(test_group)) + goto out; + values->child_allocated = false; + child_pid = cg_run_nowait(test_group, no_kmem_bypass_child, values); + if (child_pid < 0) + goto out; + while (!values->child_allocated && wait_child_iteration++ < 10000) + usleep(1000); + + /* Try to wakeup kswapd and let it push child memory to zswap */ + set_min_free_kb(min_free_kb_high); + for (int i = 0; i < 20; i++) { + size_t stored_pages; + char *trigger_allocation = malloc(trigger_allocation_size); + + if (!trigger_allocation) + break; + for (int i = 0; i < trigger_allocation_size; i += 4095) + trigger_allocation[i] = 'b'; + usleep(100000); + free(trigger_allocation); + if (get_zswap_stored_pages(&stored_pages)) + break; + if (stored_pages < 0) + break; + /* If memory was pushed to zswap, verify it belongs to memcg */ + if (stored_pages > stored_pages_threshold) { + int zswapped = cg_read_key_long(test_group, "memory.stat", "zswapped "); + int delta = stored_pages * 4096 - zswapped; + int result_ok = delta < stored_pages * 4096 / 4; + + ret = result_ok ? 
KSFT_PASS : KSFT_FAIL; + break; + } + } + + kill(child_pid, SIGTERM); + waitpid(child_pid, &child_status, 0); +out: + set_min_free_kb(min_free_kb_original); + cg_destroy(test_group); + free(test_group); + return ret; +} + #define T(x) { x, #x } struct zswap_test { int (*fn)(const char *root); const char *name; } tests[] = { + T(test_no_kmem_bypass), }; #undef T From d9cfaf405b8ffe2c716b1ce4c82e0a19d50951da Mon Sep 17 00:00:00 2001 From: Domenico Cerasuolo Date: Wed, 21 Jun 2023 17:35:48 +0200 Subject: [PATCH 040/489] selftests: cgroup: add zswap-memcg unwanted writeback test Add a test to verify that when a memcg hits its limit in zswap, it doesn't trigger an unwanted writeback that would result in pages not owned by that memcg to be sent to disk, even if zswap isn't full. This was fixed by commit 0bdf0efa180a("zswap: do not shrink if cgroup may not zswap"). Link: https://lkml.kernel.org/r/20230621153548.428093-4-cerasuolodomenico@gmail.com Signed-off-by: Domenico Cerasuolo Cc: Dan Streetman Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Nhat Pham Cc: Rik van Riel Cc: Roman Gushchin Cc: Seth Jennings Cc: Shakeel Butt Cc: Shuah Khan Cc: Tejun Heo Cc: Vitaly Wool Cc: Zefan Li Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_zswap.c | 61 +++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/tools/testing/selftests/cgroup/test_zswap.c b/tools/testing/selftests/cgroup/test_zswap.c index e859fecd310b74..49def87a909bdb 100644 --- a/tools/testing/selftests/cgroup/test_zswap.c +++ b/tools/testing/selftests/cgroup/test_zswap.c @@ -50,6 +50,66 @@ static int get_zswap_stored_pages(size_t *value) return read_int("/sys/kernel/debug/zswap/stored_pages", value); } +static int get_zswap_written_back_pages(size_t *value) +{ + return read_int("/sys/kernel/debug/zswap/written_back_pages", value); +} + +static int allocate_bytes(const char *cgroup, void *arg) +{ + size_t size = (size_t)arg; + char *mem = (char *)malloc(size); + + if (!mem) + 
return -1; + for (int i = 0; i < size; i += 4095) + mem[i] = 'a'; + free(mem); + return 0; +} + +/* + * When trying to store a memcg page in zswap, if the memcg hits its memory + * limit in zswap, writeback should not be triggered. + * + * This was fixed with commit 0bdf0efa180a("zswap: do not shrink if cgroup may + * not zswap"). Needs to be revised when a per memcg writeback mechanism is + * implemented. + */ +static int test_no_invasive_cgroup_shrink(const char *root) +{ + size_t written_back_before, written_back_after; + int ret = KSFT_FAIL; + char *test_group; + + /* Set up */ + test_group = cg_name(root, "no_shrink_test"); + if (!test_group) + goto out; + if (cg_create(test_group)) + goto out; + if (cg_write(test_group, "memory.max", "1M")) + goto out; + if (cg_write(test_group, "memory.zswap.max", "10K")) + goto out; + if (get_zswap_written_back_pages(&written_back_before)) + goto out; + + /* Allocate 10x memory.max to push memory into zswap */ + if (cg_run(test_group, allocate_bytes, (void *)MB(10))) + goto out; + + /* Verify that no writeback happened because of the memcg allocation */ + if (get_zswap_written_back_pages(&written_back_after)) + goto out; + if (written_back_after == written_back_before) + ret = KSFT_PASS; +out: + cg_destroy(test_group); + free(test_group); + return ret; +} + struct no_kmem_bypass_child_args { size_t target_alloc_bytes; size_t child_allocated; @@ -176,6 +236,7 @@ struct zswap_test { const char *name; } tests[] = { T(test_no_kmem_bypass), + T(test_no_invasive_cgroup_shrink), }; #undef T From df263d9a7dffee94ca5391120ee3b0587efa07f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Penttil=C3=A4?= Date: Wed, 7 Jun 2023 20:29:44 +0300 Subject: [PATCH 041/489] mm/migrate_device: try to handle swapcache pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Migrating file pages and swapcache pages into device memory is not supported. 
Try to get rid of the swap cache, and if successful, go ahead as with other anonymous pages. Link: https://lkml.kernel.org/r/20230607172944.11713-1-mpenttil@redhat.com Signed-off-by: Mika Penttilä Reviewed-by: "Huang, Ying" Reviewed-by: Alistair Popple Cc: John Hubbard Cc: Ralph Campbell Signed-off-by: Andrew Morton --- mm/migrate_device.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 8365158460ed17..e29626e1329e97 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -727,13 +727,22 @@ static void __migrate_device_pages(unsigned long *src_pfns, if (is_device_private_page(newpage) || is_device_coherent_page(newpage)) { - /* - * For now only support anonymous memory migrating to - * device private or coherent memory. - */ if (mapping) { - src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; - continue; + struct folio *folio; + + folio = page_folio(page); + + /* + * For now only support anonymous memory migrating to + * device private or coherent memory. + * + * Try to get rid of swap cache if possible. + */ + if (!folio_test_anon(folio) || + !folio_free_swap(folio)) { + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; + continue; + } } } else if (is_zone_device_page(newpage)) { /* From 79271476b3362a9e69adae949a520647f8af3559 Mon Sep 17 00:00:00 2001 From: xu xin Date: Tue, 13 Jun 2023 11:09:28 +0800 Subject: [PATCH 042/489] ksm: support unsharing KSM-placed zero pages Patch series "ksm: support tracking KSM-placed zero-pages", v10. The core idea of this patch set is to enable users to perceive the number of any pages merged by KSM, regardless of whether use_zero_page switch has been turned on, so that users can know how much free memory increase is really due to their madvise(MERGEABLE) actions. 
But the problem is, when enabling use_zero_pages, all empty pages will be merged with kernel zero pages instead of with each other (as they are when use_zero_pages is disabled), and then these zero-pages are no longer monitored by KSM. The motivation for this can be seen at: https://lore.kernel.org/lkml/202302100915227721315@zte.com.cn/ In short, we hope to implement the support for KSM-placed zero pages tracking without affecting the feature of use_zero_pages, so that app developers can also benefit from knowing the actual KSM profit by getting KSM-placed zero pages to optimize applications eventually when /sys/kernel/mm/ksm/use_zero_pages is enabled. This patch (of 5): When use_zero_pages of ksm is enabled, madvise(addr, len, MADV_UNMERGEABLE) and other ways (like writing 2 to /sys/kernel/mm/ksm/run) to trigger unsharing will *not* actually unshare the shared zeropage as placed by KSM (which is against the MADV_UNMERGEABLE documentation). As these KSM-placed zero pages are out of the control of KSM, the related counts of ksm pages don't expose how many zero pages are placed by KSM (these special zero pages are different from those initially mapped zero pages, because the zero pages mapped to MADV_UNMERGEABLE areas are expected to be a complete and unshared page). To not blindly unshare all shared zero_pages in applicable VMAs, the patch uses pte_mkdirty (related with architecture) to mark KSM-placed zero pages. Thus, MADV_UNMERGEABLE will only unshare those KSM-placed zero pages. In addition, we'll reuse this mechanism to reliably identify KSM-placed ZeroPages to properly account for them (e.g., calculating the KSM profit that includes zeropages) in later patches. The patch will not degrade the performance of use_zero_pages as it doesn't change the way of merging empty pages in use_zero_pages's feature.
Link: https://lkml.kernel.org/r/202306131104554703428@zte.com.cn Link: https://lkml.kernel.org/r/20230613030928.185882-1-yang.yang29@zte.com.cn Signed-off-by: xu xin Acked-by: David Hildenbrand Cc: Claudio Imbrenda Cc: Xuexin Jiang Reviewed-by: Xiaokai Ran Reviewed-by: Yang Yang Signed-off-by: Andrew Morton --- include/linux/ksm.h | 6 ++++++ mm/ksm.c | 11 ++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 899a314bc48720..98878107244fde 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -26,6 +26,12 @@ int ksm_disable(struct mm_struct *mm); int __ksm_enter(struct mm_struct *mm); void __ksm_exit(struct mm_struct *mm); +/* + * To identify zeropages that were mapped by KSM, we reuse the dirty bit + * in the PTE. If the PTE is dirty, the zeropage was mapped by KSM when + * deduplicating memory. + */ +#define is_ksm_zero_pte(pte) (is_zero_pfn(pte_pfn(pte)) && pte_dirty(pte)) static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { diff --git a/mm/ksm.c b/mm/ksm.c index ba266359da55f1..99519e22a7616a 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -448,7 +448,8 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex if (is_migration_entry(entry)) page = pfn_swap_entry_to_page(entry); } - ret = page && PageKsm(page); + /* return 1 if the page is an normal ksm page or KSM-placed zero page */ + ret = (page && PageKsm(page)) || is_ksm_zero_pte(*pte); pte_unmap_unlock(pte, ptl); return ret; } @@ -1222,8 +1223,12 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, page_add_anon_rmap(kpage, vma, addr, RMAP_NONE); newpte = mk_pte(kpage, vma->vm_page_prot); } else { - newpte = pte_mkspecial(pfn_pte(page_to_pfn(kpage), - vma->vm_page_prot)); + /* + * Use pte_mkdirty to mark the zero page mapped by KSM, and then + * we can easily track all KSM-placed zero pages by checking if + * the dirty bit in zero page's PTE is set. 
+ */ + newpte = pte_mkdirty(pte_mkspecial(pfn_pte(page_to_pfn(kpage), vma->vm_page_prot))); /* * We're replacing an anonymous page with a zero page, which is * not anonymous. We need to do proper accounting otherwise we From e2942062e01df85b4692460fe5b48ab0c90fdb95 Mon Sep 17 00:00:00 2001 From: xu xin Date: Tue, 13 Jun 2023 11:09:34 +0800 Subject: [PATCH 043/489] ksm: count all zero pages placed by KSM As pages_sharing and pages_shared don't include the number of zero pages merged by KSM, we cannot know how many pages are zero pages placed by KSM when enabling use_zero_pages, which leads to KSM not being transparent with all actual merged pages by KSM. In the early days of use_zero_pages, zero-pages was unable to get unshared by the ways like MADV_UNMERGEABLE so it's hard to count how many times one of those zeropages was then unmerged. But now, unsharing KSM-placed zero page accurately has been achieved, so we can easily count both how many times a page full of zeroes was merged with zero-page and how many times one of those pages was then unmerged. and so, it helps to estimate memory demands when each and every shared page could get unshared. So we add ksm_zero_pages under /sys/kernel/mm/ksm/ to show the number of all zero pages placed by KSM. Meanwhile, we update the Documentation. 
Link: https://lkml.kernel.org/r/20230613030934.185944-1-yang.yang29@zte.com.cn Signed-off-by: xu xin Acked-by: David Hildenbrand Cc: Claudio Imbrenda Cc: Xuexin Jiang Reviewed-by: Xiaokai Ran Reviewed-by: Yang Yang Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/ksm.rst | 7 +++++++ include/linux/ksm.h | 12 ++++++++++++ mm/khugepaged.c | 2 ++ mm/ksm.c | 12 ++++++++++++ mm/memory.c | 5 ++++- 5 files changed, 37 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/mm/ksm.rst b/Documentation/admin-guide/mm/ksm.rst index 7626392fe82cb5..6cc919dbfd55b3 100644 --- a/Documentation/admin-guide/mm/ksm.rst +++ b/Documentation/admin-guide/mm/ksm.rst @@ -173,6 +173,13 @@ stable_node_chains the number of KSM pages that hit the ``max_page_sharing`` limit stable_node_dups number of duplicated KSM pages +ksm_zero_pages + how many zero pages that are still mapped into processes were mapped by + KSM when deduplicating. + +When ``use_zero_pages`` is/was enabled, the sum of ``pages_sharing`` + +``ksm_zero_pages`` represents the actual number of pages saved by KSM. +if ``use_zero_pages`` has never been enabled, ``ksm_zero_pages`` is 0. 
A high ratio of ``pages_sharing`` to ``pages_shared`` indicates good sharing, but a high ratio of ``pages_unshared`` to ``pages_sharing`` diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 98878107244fde..e80aa49009b262 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -33,6 +33,14 @@ void __ksm_exit(struct mm_struct *mm); */ #define is_ksm_zero_pte(pte) (is_zero_pfn(pte_pfn(pte)) && pte_dirty(pte)) +extern unsigned long ksm_zero_pages; + +static inline void ksm_might_unmap_zero_page(pte_t pte) +{ + if (is_ksm_zero_pte(pte)) + ksm_zero_pages--; +} + static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) { int ret; @@ -101,6 +109,10 @@ static inline void ksm_exit(struct mm_struct *mm) { } +static inline void ksm_might_unmap_zero_page(pte_t pte) +{ +} + #ifdef CONFIG_MEMORY_FAILURE static inline void collect_procs_ksm(struct page *page, struct list_head *to_kill, int force_early) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 78c8d5d8b62841..419981dcc889db 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -709,6 +710,7 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, spin_lock(ptl); ptep_clear(vma->vm_mm, address, _pte); spin_unlock(ptl); + ksm_might_unmap_zero_page(pteval); } } else { src_page = pte_page(pteval); diff --git a/mm/ksm.c b/mm/ksm.c index 99519e22a7616a..e037d9aad691a0 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -278,6 +278,9 @@ static unsigned int zero_checksum __read_mostly; /* Whether to merge empty (zeroed) pages with actual zero pages */ static bool ksm_use_zero_pages __read_mostly; +/* The number of zero pages which is placed by KSM */ +unsigned long ksm_zero_pages; + #ifdef CONFIG_NUMA /* Zeroed when merging across nodes is not allowed */ static unsigned int ksm_merge_across_nodes = 1; @@ -1229,6 +1232,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, * the dirty bit in zero page's PTE is 
set. */ newpte = pte_mkdirty(pte_mkspecial(pfn_pte(page_to_pfn(kpage), vma->vm_page_prot))); + ksm_zero_pages++; /* * We're replacing an anonymous page with a zero page, which is * not anonymous. We need to do proper accounting otherwise we @@ -3356,6 +3360,13 @@ static ssize_t pages_volatile_show(struct kobject *kobj, } KSM_ATTR_RO(pages_volatile); +static ssize_t ksm_zero_pages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%ld\n", ksm_zero_pages); +} +KSM_ATTR_RO(ksm_zero_pages); + static ssize_t general_profit_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -3423,6 +3434,7 @@ static struct attribute *ksm_attrs[] = { &pages_sharing_attr.attr, &pages_unshared_attr.attr, &pages_volatile_attr.attr, + &ksm_zero_pages_attr.attr, &full_scans_attr.attr, #ifdef CONFIG_NUMA &merge_across_nodes_attr.attr, diff --git a/mm/memory.c b/mm/memory.c index e9f9944c7370cc..c256da05bb5e41 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1433,8 +1433,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, tlb_remove_tlb_entry(tlb, pte, addr); zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent); - if (unlikely(!page)) + if (unlikely(!page)) { + ksm_might_unmap_zero_page(ptent); continue; + } delay_rmap = 0; if (!PageAnon(page)) { @@ -3128,6 +3130,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) inc_mm_counter(mm, MM_ANONPAGES); } } else { + ksm_might_unmap_zero_page(vmf->orig_pte); inc_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); From 6080d19f07043ade61094d0f58b14c05e1694a39 Mon Sep 17 00:00:00 2001 From: xu xin Date: Tue, 13 Jun 2023 11:09:38 +0800 Subject: [PATCH 044/489] ksm: add ksm zero pages for each process As the number of ksm zero pages is not included in ksm_merging_pages per process when enabling use_zero_pages, it's unclear of how many actual pages are merged by KSM. 
To let users accurately estimate their memory demands when unsharing KSM zero-pages, it's necessary to show KSM zero-pages per process. In addition, it helps users to know the actual KSM profit because KSM-placed zero pages also benefit from KSM. Since unsharing zero pages placed by KSM accurately is achieved, then tracking empty pages merging and unmerging is not a difficult thing any longer. Since we already have /proc/<pid>/ksm_stat, just add the information of 'ksm_zero_pages' in it. Link: https://lkml.kernel.org/r/20230613030938.185993-1-yang.yang29@zte.com.cn Signed-off-by: xu xin Acked-by: David Hildenbrand Reviewed-by: Xiaokai Ran Reviewed-by: Yang Yang Cc: Claudio Imbrenda Cc: Xuexin Jiang Signed-off-by: Andrew Morton --- fs/proc/base.c | 1 + include/linux/ksm.h | 8 +++++--- include/linux/mm_types.h | 9 +++++++-- mm/khugepaged.c | 2 +- mm/ksm.c | 1 + mm/memory.c | 4 ++-- 6 files changed, 17 insertions(+), 8 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 05452c3b9872bd..eb2e498e3b8de1 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3207,6 +3207,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, mm = get_task_mm(task); if (mm) { seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items); + seq_printf(m, "ksm_zero_pages %lu\n", mm->ksm_zero_pages); seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages); seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm)); mmput(mm); diff --git a/include/linux/ksm.h b/include/linux/ksm.h index e80aa49009b262..c2dd786a30e1f7 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -35,10 +35,12 @@ void __ksm_exit(struct mm_struct *mm); extern unsigned long ksm_zero_pages; -static inline void ksm_might_unmap_zero_page(pte_t pte) +static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte) { - if (is_ksm_zero_pte(pte)) + if (is_ksm_zero_pte(pte)) { ksm_zero_pages--; + mm->ksm_zero_pages--; + } } static inline int ksm_fork(struct
mm_struct *mm, struct mm_struct *oldmm) @@ -109,7 +111,7 @@ static inline void ksm_exit(struct mm_struct *mm) { } -static inline void ksm_might_unmap_zero_page(pte_t pte) +static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte) { } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5e74ce4a28cd65..51d04c1847c113 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -812,7 +812,7 @@ struct mm_struct { #ifdef CONFIG_KSM /* * Represent how many pages of this process are involved in KSM - * merging. + * merging (not including ksm_zero_pages). */ unsigned long ksm_merging_pages; /* @@ -820,7 +820,12 @@ struct mm_struct { * including merged and not merged. */ unsigned long ksm_rmap_items; -#endif + /* + * Represent how many empty pages are merged with kernel zero + * pages when enabling KSM use_zero_pages. + */ + unsigned long ksm_zero_pages; +#endif /* CONFIG_KSM */ #ifdef CONFIG_LRU_GEN struct { /* this mm_struct is on lru_gen_mm_list */ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 419981dcc889db..4b8b8673d5d9ff 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -710,7 +710,7 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte, spin_lock(ptl); ptep_clear(vma->vm_mm, address, _pte); spin_unlock(ptl); - ksm_might_unmap_zero_page(pteval); + ksm_might_unmap_zero_page(vma->vm_mm, pteval); } } else { src_page = pte_page(pteval); diff --git a/mm/ksm.c b/mm/ksm.c index e037d9aad691a0..e1772081e8cbfa 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1233,6 +1233,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, */ newpte = pte_mkdirty(pte_mkspecial(pfn_pte(page_to_pfn(kpage), vma->vm_page_prot))); ksm_zero_pages++; + mm->ksm_zero_pages++; /* * We're replacing an anonymous page with a zero page, which is * not anonymous. 
We need to do proper accounting otherwise we diff --git a/mm/memory.c b/mm/memory.c index c256da05bb5e41..5f863b1a0edc64 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1434,7 +1434,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent); if (unlikely(!page)) { - ksm_might_unmap_zero_page(ptent); + ksm_might_unmap_zero_page(mm, ptent); continue; } @@ -3130,7 +3130,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) inc_mm_counter(mm, MM_ANONPAGES); } } else { - ksm_might_unmap_zero_page(vmf->orig_pte); + ksm_might_unmap_zero_page(mm, vmf->orig_pte); inc_mm_counter(mm, MM_ANONPAGES); } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); From 1a8e84305783bddbae708f28178c6d0aa6321913 Mon Sep 17 00:00:00 2001 From: xu xin Date: Tue, 13 Jun 2023 11:09:42 +0800 Subject: [PATCH 045/489] ksm: consider KSM-placed zeropages when calculating KSM profit When use_zero_pages is enabled, the calculation of ksm profit is not correct because ksm zero pages is not counted in. So update the calculation of KSM profit including the documentation. Link: https://lkml.kernel.org/r/20230613030942.186041-1-yang.yang29@zte.com.cn Signed-off-by: xu xin Acked-by: David Hildenbrand Cc: Xiaokai Ran Cc: Yang Yang Cc: Jiang Xuexin Cc: Claudio Imbrenda Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/ksm.rst | 18 +++++++++++------- mm/ksm.c | 4 ++-- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/Documentation/admin-guide/mm/ksm.rst b/Documentation/admin-guide/mm/ksm.rst index 6cc919dbfd55b3..5c5be7bd84b817 100644 --- a/Documentation/admin-guide/mm/ksm.rst +++ b/Documentation/admin-guide/mm/ksm.rst @@ -203,21 +203,25 @@ several times, which are unprofitable memory consumed. 1) How to determine whether KSM save memory or consume memory in system-wide range? 
Here is a simple approximate calculation for reference:: - general_profit =~ pages_sharing * sizeof(page) - (all_rmap_items) * + general_profit =~ ksm_saved_pages * sizeof(page) - (all_rmap_items) * sizeof(rmap_item); - where all_rmap_items can be easily obtained by summing ``pages_sharing``, - ``pages_shared``, ``pages_unshared`` and ``pages_volatile``. + where ksm_saved_pages equals to the sum of ``pages_sharing`` + + ``ksm_zero_pages`` of the system, and all_rmap_items can be easily + obtained by summing ``pages_sharing``, ``pages_shared``, ``pages_unshared`` + and ``pages_volatile``. 2) The KSM profit inner a single process can be similarly obtained by the following approximate calculation:: - process_profit =~ ksm_merging_pages * sizeof(page) - + process_profit =~ ksm_saved_pages * sizeof(page) - ksm_rmap_items * sizeof(rmap_item). - where ksm_merging_pages is shown under the directory ``/proc//``, - and ksm_rmap_items is shown in ``/proc//ksm_stat``. The process profit - is also shown in ``/proc//ksm_stat`` as ksm_process_profit. + where ksm_saved_pages equals to the sum of ``ksm_merging_pages`` and + ``ksm_zero_pages``, both of which are shown under the directory + ``/proc//ksm_stat``, and ksm_rmap_items is also shown in + ``/proc//ksm_stat``. The process profit is also shown in + ``/proc//ksm_stat`` as ksm_process_profit. 
From the perspective of application, a high ratio of ``ksm_rmap_items`` to ``ksm_merging_pages`` means a bad madvise-applied policy, so developers or diff --git a/mm/ksm.c b/mm/ksm.c index e1772081e8cbfa..97a9627116fa34 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -3092,7 +3092,7 @@ static void wait_while_offlining(void) #ifdef CONFIG_PROC_FS long ksm_process_profit(struct mm_struct *mm) { - return mm->ksm_merging_pages * PAGE_SIZE - + return (long)(mm->ksm_merging_pages + mm->ksm_zero_pages) * PAGE_SIZE - mm->ksm_rmap_items * sizeof(struct ksm_rmap_item); } #endif /* CONFIG_PROC_FS */ @@ -3373,7 +3373,7 @@ static ssize_t general_profit_show(struct kobject *kobj, { long general_profit; - general_profit = ksm_pages_sharing * PAGE_SIZE - + general_profit = (ksm_pages_sharing + ksm_zero_pages) * PAGE_SIZE - ksm_rmap_items * sizeof(struct ksm_rmap_item); return sysfs_emit(buf, "%ld\n", general_profit); From 3d0745e59c840e64c6b7bf102c43bec29337605d Mon Sep 17 00:00:00 2001 From: xu xin Date: Tue, 13 Jun 2023 11:09:47 +0800 Subject: [PATCH 046/489] selftest: add a testcase of ksm zero pages Add a function test_unmerge_zero_pages() to test the unsharing and counting of KSM-placed zero pages introduced by this patch series. test_unmerge_zero_pages() actually contains four sub-tests: (1) whether the count of ksm zero pages can update correctly after merging; (2) whether the count of ksm zero pages can update correctly after unmerging by madvise(...MADV_UNMERGEABLE); (3) whether the count of ksm zero pages can update correctly after unmerging by triggering write fault; (4) whether ksm zero pages are really unmerged.
Link: https://lkml.kernel.org/r/20230613030947.186089-1-yang.yang29@zte.com.cn Signed-off-by: xu xin Acked-by: David Hildenbrand Reviewed-by: Xiaokai Ran Reviewed-by: Yang Yang Cc: Claudio Imbrenda Cc: Xuexin Jiang Signed-off-by: Andrew Morton --- .../selftests/mm/ksm_functional_tests.c | 98 ++++++++++++++++++- 1 file changed, 97 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c index 26853badae7056..0de9d33cd565d8 100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -29,6 +29,8 @@ static int ksm_fd; static int ksm_full_scans_fd; +static int proc_self_ksm_stat_fd; +static int ksm_use_zero_pages_fd; static int pagemap_fd; static size_t pagesize; @@ -59,6 +61,33 @@ static bool range_maps_duplicates(char *addr, unsigned long size) return false; } +static long get_my_ksm_zero_pages(void) +{ + char buf[200]; + char *substr_ksm_zero; + size_t value_pos; + ssize_t read_size; + unsigned long my_ksm_zero_pages; + + if (!proc_self_ksm_stat_fd) + return 0; + + read_size = pread(proc_self_ksm_stat_fd, buf, sizeof(buf) - 1, 0); + if (read_size < 0) + return -errno; + + buf[read_size] = 0; + + substr_ksm_zero = strstr(buf, "ksm_zero_pages"); + if (!substr_ksm_zero) + return 0; + + value_pos = strcspn(substr_ksm_zero, "0123456789"); + my_ksm_zero_pages = strtol(substr_ksm_zero + value_pos, NULL, 10); + + return my_ksm_zero_pages; +} + static long ksm_get_full_scans(void) { char buf[10]; @@ -159,6 +188,70 @@ static void test_unmerge(void) munmap(map, size); } +static void test_unmerge_zero_pages(void) +{ + const unsigned int size = 2 * MiB; + char *map; + unsigned int offs; + unsigned long pages_expected; + + ksft_print_msg("[RUN] %s\n", __func__); + + if (proc_self_ksm_stat_fd < 0) { + ksft_test_result_skip("open(\"/proc/self/ksm_stat\") failed\n"); + return; + } + if (ksm_use_zero_pages_fd < 0) { + 
ksft_test_result_skip("open \"/sys/kernel/mm/ksm/use_zero_pages\" failed\n"); + return; + } + if (write(ksm_use_zero_pages_fd, "1", 1) != 1) { + ksft_test_result_skip("write \"/sys/kernel/mm/ksm/use_zero_pages\" failed\n"); + return; + } + + /* Let KSM deduplicate zero pages. */ + map = mmap_and_merge_range(0x00, size, false); + if (map == MAP_FAILED) + return; + + /* Check if ksm_zero_pages is updated correctly after KSM merging */ + pages_expected = size / pagesize; + if (pages_expected != get_my_ksm_zero_pages()) { + ksft_test_result_fail("'ksm_zero_pages' updated after merging\n"); + goto unmap; + } + + /* Try to unmerge half of the region */ + if (madvise(map, size / 2, MADV_UNMERGEABLE)) { + ksft_test_result_fail("MADV_UNMERGEABLE failed\n"); + goto unmap; + } + + /* Check if ksm_zero_pages is updated correctly after unmerging */ + pages_expected /= 2; + if (pages_expected != get_my_ksm_zero_pages()) { + ksft_test_result_fail("'ksm_zero_pages' updated after unmerging\n"); + goto unmap; + } + + /* Trigger unmerging of the other half by writing to the pages. */ + for (offs = size / 2; offs < size; offs += pagesize) + *((unsigned int *)&map[offs]) = offs; + + /* Now we should have no zeropages remaining. 
*/ + if (get_my_ksm_zero_pages()) { + ksft_test_result_fail("'ksm_zero_pages' updated after write fault\n"); + goto unmap; + } + + /* Check if ksm zero pages are really unmerged */ + ksft_test_result(!range_maps_duplicates(map, size), + "KSM zero pages were unmerged\n"); +unmap: + munmap(map, size); +} + static void test_unmerge_discarded(void) { const unsigned int size = 2 * MiB; @@ -358,7 +451,7 @@ static void test_prctl_unmerge(void) int main(int argc, char **argv) { - unsigned int tests = 5; + unsigned int tests = 6; int err; #ifdef __NR_userfaultfd @@ -379,8 +472,11 @@ int main(int argc, char **argv) pagemap_fd = open("/proc/self/pagemap", O_RDONLY); if (pagemap_fd < 0) ksft_exit_skip("open(\"/proc/self/pagemap\") failed\n"); + proc_self_ksm_stat_fd = open("/proc/self/ksm_stat", O_RDONLY); + ksm_use_zero_pages_fd = open("/sys/kernel/mm/ksm/use_zero_pages", O_RDWR); test_unmerge(); + test_unmerge_zero_pages(); test_unmerge_discarded(); #ifdef __NR_userfaultfd test_unmerge_uffd_wp(); From 82d9b8c85b7e4bd85c679ac2da26b57224c4999d Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 4 Jul 2023 19:18:23 +0800 Subject: [PATCH 047/489] mm: page_alloc: avoid false page outside zone error info If pfn is outside zone boundaries in the first round, ret will be set to 1. But if pfn is changed to inside the zone boundaries in zone span seqretry path, ret is still set to 1 leading to false page outside zone error info. This is from code inspection. The race window should be really small thus hard to trigger in real world. 
[akpm@linux-foundation.org: code simplification, per Matthew] Link: https://lkml.kernel.org/r/20230704111823.940331-1-linmiaohe@huawei.com Fixes: bdc8cb984576 ("[PATCH] memory hotplug locking: zone span seqlock") Signed-off-by: Miaohe Lin Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/page_alloc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b51bbc485a2870..1eb3864e1dbc70 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -459,7 +459,7 @@ void set_pageblock_migratetype(struct page *page, int migratetype) #ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) { - int ret = 0; + int ret; unsigned seq; unsigned long pfn = page_to_pfn(page); unsigned long sp, start_pfn; @@ -468,8 +468,7 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page) seq = zone_span_seqbegin(zone); start_pfn = zone->zone_start_pfn; sp = zone->spanned_pages; - if (!zone_spans_pfn(zone, pfn)) - ret = 1; + ret = !zone_spans_pfn(zone, pfn); } while (zone_span_seqretry(zone, seq)); if (ret) From 86327e8eb94c52eca4f93cfece2e29d1bf52acbf Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 4 Jul 2023 13:52:40 +0200 Subject: [PATCH 048/489] memcg: drop kmem.limit_in_bytes kmem.limit_in_bytes (v1 way to limit kernel memory usage) has been deprecated since 58056f77502f ("memcg, kmem: further deprecate kmem.limit_in_bytes") merged in 5.16. We haven't heard about any serious users since then but it seems that the mere presence of the file is causing more harm than good. We (SUSE) have had several bug reports from customers where Docker based containers started to fail because a write to kmem.limit_in_bytes has failed. This was unexpected because runc code only expects ENOENT (kmem disabled) or EBUSY (tasks already running within cgroup). So a new error code was unexpected and the whole container startup failed.
This has been later addressed by https://github.com/opencontainers/runc/commit/52390d68040637dfc77f9fda6bbe70952423d380 so current Docker runtimes do not suffer from the problem anymore. There are still older versions of Docker in use, and they are likely hard to get rid of completely. Address this by wiping out the file completely and effectively get back to pre 4.5 era and CONFIG_MEMCG_KMEM=n configuration. I would recommend backporting to stable trees which have picked up 58056f77502f ("memcg, kmem: further deprecate kmem.limit_in_bytes"). [mhocko@suse.com: restore _KMEM switch case] Link: https://lkml.kernel.org/r/ZKe5wxdbvPi5Cwd7@dhcp22.suse.cz Link: https://lkml.kernel.org/r/20230704115240.14672-1-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Roman Gushchin Cc: Muchun Song Cc: Tejun Heo Cc: Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v1/memory.rst | 2 -- mm/memcontrol.c | 10 ---------- 2 files changed, 12 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index fabaad3fd9c212..8d3afeede10e46 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -92,8 +92,6 @@ Brief summary of control files. memory.oom_control set/show oom controls. memory.numa_stat show the number of memory usage per numa node - memory.kmem.limit_in_bytes This knob is deprecated and writing to - it will return -ENOTSUPP.
memory.kmem.usage_in_bytes show current kernel memory allocation memory.kmem.failcnt show the number of kernel memory usage hits limits diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e8ca4bdcb03cc1..ab99503c9ff2d0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3871,10 +3871,6 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, case _MEMSWAP: ret = mem_cgroup_resize_max(memcg, nr_pages, true); break; - case _KMEM: - /* kmem.limit_in_bytes is deprecated. */ - ret = -EOPNOTSUPP; - break; case _TCP: ret = memcg_update_tcp_max(memcg, nr_pages); break; @@ -5085,12 +5081,6 @@ static struct cftype mem_cgroup_legacy_files[] = { .seq_show = memcg_numa_stat_show, }, #endif - { - .name = "kmem.limit_in_bytes", - .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), - .write = mem_cgroup_write, - .read_u64 = mem_cgroup_read_u64, - }, { .name = "kmem.usage_in_bytes", .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), From 8a144612eb8a31b94ecb2d340d07588ac115e819 Mon Sep 17 00:00:00 2001 From: Andrew Yang Date: Fri, 30 Jun 2023 17:22:02 +0800 Subject: [PATCH 049/489] fs: drop_caches: draining pages before dropping caches We expect a file page access after dropping caches should be a major fault, but sometimes it's still a minor fault. That's because a file page can't be dropped if it's in a per-cpu pagevec. Draining all pages from per-cpu pagevec to lru list before trying to drop caches. 
Link: https://lkml.kernel.org/r/20230630092203.16080-1-andrew.yang@mediatek.com Signed-off-by: Andrew Yang Cc: Al Viro Cc: AngeloGioacchino Del Regno Cc: Christian Brauner Cc: Matthias Brugger Signed-off-by: Andrew Morton --- fs/drop_caches.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/drop_caches.c b/fs/drop_caches.c index e619c31b6bd92a..b9575957a7c294 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "internal.h" /* A global variable is a bit ugly, but it keeps the code simple */ @@ -59,6 +60,7 @@ int drop_caches_sysctl_handler(struct ctl_table *table, int write, static int stfu; if (sysctl_drop_caches & 1) { + lru_add_drain_all(); iterate_supers(drop_pagecache_sb, NULL); count_vm_event(DROP_PAGECACHE); } From 72de259130229412ca49871e70ffaf17dc9fba98 Mon Sep 17 00:00:00 2001 From: Jeff Xu Date: Wed, 5 Jul 2023 06:33:14 +0000 Subject: [PATCH 050/489] mm/memfd: sysctl: fix MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED Patch series "mm/memfd: fix sysctl MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED", v2. When sysctl vm.memfd_noexec is 2 (MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED), memfd_create(.., MFD_EXEC) should fail. This complies with how MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED is defined - "memfd_create() without MFD_NOEXEC_SEAL will be rejected" Thanks to Dominique Martinet who reported the bug. see [1] for context. [1] https://lore.kernel.org/linux-mm/CABi2SkXUX_QqTQ10Yx9bBUGpN1wByOi_=gZU6WEy5a8MaQY3Jw@mail.gmail.com/T/ This patch (of 2): When vm.memfd_noexec is 2 (MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED), memfd_create(.., MFD_EXEC) should fail. 
This complies with how MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED is defined - "memfd_create() without MFD_NOEXEC_SEAL will be rejected" Link: https://lkml.kernel.org/r/20230705063315.3680666-1-jeffxu@google.com Link: https://lkml.kernel.org/r/20230705063315.3680666-2-jeffxu@google.com Fixes: 105ff5339f49 ("mm/memfd: add MFD_NOEXEC_SEAL and MFD_EXEC") Reported-by: Dominique Martinet Closes: https://lore.kernel.org/linux-mm/CABi2SkXUX_QqTQ10Yx9bBUGpN1wByOi_=gZU6WEy5a8MaQY3Jw@mail.gmail.com/T/ Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202306301351.kkbSegQW-lkp@intel.com/ Signed-off-by: Jeff Xu Cc: Daniel Verkamp Cc: Dmitry Torokhov Cc: Hugh Dickins Cc: Jann Horn Cc: Jorge Lucangeli Obes Cc: Kees Cook Cc: Shuah Khan Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/memfd.c | 57 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 33 insertions(+), 24 deletions(-) diff --git a/mm/memfd.c b/mm/memfd.c index e763e76f110645..0bdbd2335af751 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -268,6 +268,36 @@ long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg) #define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB | MFD_NOEXEC_SEAL | MFD_EXEC) +static int check_sysctl_memfd_noexec(unsigned int *flags) +{ +#ifdef CONFIG_SYSCTL + char comm[TASK_COMM_LEN]; + int sysctl = MEMFD_NOEXEC_SCOPE_EXEC; + struct pid_namespace *ns; + + ns = task_active_pid_ns(current); + if (ns) + sysctl = ns->memfd_noexec_scope; + + if (!(*flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) { + if (sysctl == MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL) + *flags |= MFD_NOEXEC_SEAL; + else + *flags |= MFD_EXEC; + } + + if (*flags & MFD_EXEC && sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED) { + pr_warn_once( + "memfd_create(): MFD_NOEXEC_SEAL is enforced, pid=%d '%s'\n", + task_pid_nr(current), get_task_comm(comm, current)); + + return -EACCES; + } +#endif + + return 0; +} + SYSCALL_DEFINE2(memfd_create, const char __user *, uname, unsigned int, flags) @@ 
-294,35 +324,14 @@ SYSCALL_DEFINE2(memfd_create, return -EINVAL; if (!(flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) { -#ifdef CONFIG_SYSCTL - int sysctl = MEMFD_NOEXEC_SCOPE_EXEC; - struct pid_namespace *ns; - - ns = task_active_pid_ns(current); - if (ns) - sysctl = ns->memfd_noexec_scope; - - switch (sysctl) { - case MEMFD_NOEXEC_SCOPE_EXEC: - flags |= MFD_EXEC; - break; - case MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL: - flags |= MFD_NOEXEC_SEAL; - break; - default: - pr_warn_once( - "memfd_create(): MFD_NOEXEC_SEAL is enforced, pid=%d '%s'\n", - task_pid_nr(current), get_task_comm(comm, current)); - return -EINVAL; - } -#else - flags |= MFD_EXEC; -#endif pr_warn_once( "memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL, pid=%d '%s'\n", task_pid_nr(current), get_task_comm(comm, current)); } + if (check_sysctl_memfd_noexec(&flags) < 0) + return -EACCES; + /* length includes terminating zero */ len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); if (len <= 0) From badbbcd76545c58eff64bb1548f7f834a30dc52a Mon Sep 17 00:00:00 2001 From: Jeff Xu Date: Wed, 5 Jul 2023 06:33:15 +0000 Subject: [PATCH 051/489] selftests/memfd: sysctl: fix MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED Add selftest for sysctl vm.memfd_noexec is 2 (MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED) memfd_create(.., MFD_EXEC) should fail in this case. 
Link: https://lkml.kernel.org/r/20230705063315.3680666-3-jeffxu@google.com Reported-by: Dominique Martinet Closes: https://lore.kernel.org/linux-mm/CABi2SkXUX_QqTQ10Yx9bBUGpN1wByOi_=gZU6WEy5a8MaQY3Jw@mail.gmail.com/T/ Signed-off-by: Jeff Xu Cc: Daniel Verkamp Cc: Dmitry Torokhov Cc: Hugh Dickins Cc: Jann Horn Cc: Jorge Lucangeli Obes Cc: Kees Cook Cc: kernel test robot Cc: Mike Kravetz Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/memfd/memfd_test.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index dba0e8ba002f88..dbdd9ec5e3973f 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -1147,6 +1147,11 @@ static void test_sysctl_child(void) sysctl_assert_write("2"); mfd_fail_new("kern_memfd_sysctl_2", MFD_CLOEXEC | MFD_ALLOW_SEALING); + mfd_fail_new("kern_memfd_sysctl_2_MFD_EXEC", + MFD_CLOEXEC | MFD_EXEC); + fd = mfd_assert_new("", 0, MFD_NOEXEC_SEAL); + close(fd); + sysctl_fail_write("0"); sysctl_fail_write("1"); } From bded67f81ec47e6054ad24c1c7992a6523a9b2c6 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 6 Jul 2023 14:39:05 +0800 Subject: [PATCH 052/489] memory tier: rename destroy_memory_type() to put_memory_type() It appears that destroy_memory_type() isn't a very good name because we usually will not free the memory_type here. So rename it to a more appropriate name i.e. put_memory_type(). 
Link: https://lkml.kernel.org/r/20230706063905.543800-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Suggested-by: Huang, Ying Reviewed-by: "Huang, Ying" Reviewed-by: Xiao Yang Cc: Dan Williams Cc: Dave Jiang Cc: Vishal Verma Signed-off-by: Andrew Morton --- drivers/dax/kmem.c | 4 ++-- include/linux/memory-tiers.h | 4 ++-- mm/memory-tiers.c | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c index 898ca95057547c..c57acb73e3db2a 100644 --- a/drivers/dax/kmem.c +++ b/drivers/dax/kmem.c @@ -264,7 +264,7 @@ static int __init dax_kmem_init(void) return rc; error_dax_driver: - destroy_memory_type(dax_slowmem_type); + put_memory_type(dax_slowmem_type); err_dax_slowmem_type: kfree_const(kmem_name); return rc; @@ -275,7 +275,7 @@ static void __exit dax_kmem_exit(void) dax_driver_unregister(&device_dax_kmem_driver); if (!any_hotremove_failed) kfree_const(kmem_name); - destroy_memory_type(dax_slowmem_type); + put_memory_type(dax_slowmem_type); } MODULE_AUTHOR("Intel Corporation"); diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h index fc9647b1b4f96a..437441cdf78fb6 100644 --- a/include/linux/memory-tiers.h +++ b/include/linux/memory-tiers.h @@ -33,7 +33,7 @@ struct memory_dev_type { #ifdef CONFIG_NUMA extern bool numa_demotion_enabled; struct memory_dev_type *alloc_memory_type(int adistance); -void destroy_memory_type(struct memory_dev_type *memtype); +void put_memory_type(struct memory_dev_type *memtype); void init_node_memory_type(int node, struct memory_dev_type *default_type); void clear_node_memory_type(int node, struct memory_dev_type *memtype); #ifdef CONFIG_MIGRATION @@ -68,7 +68,7 @@ static inline struct memory_dev_type *alloc_memory_type(int adistance) return NULL; } -static inline void destroy_memory_type(struct memory_dev_type *memtype) +static inline void put_memory_type(struct memory_dev_type *memtype) { } diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index 
1719fa3bcf0244..c49ab03f49b1ed 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -560,11 +560,11 @@ struct memory_dev_type *alloc_memory_type(int adistance) } EXPORT_SYMBOL_GPL(alloc_memory_type); -void destroy_memory_type(struct memory_dev_type *memtype) +void put_memory_type(struct memory_dev_type *memtype) { kref_put(&memtype->kref, release_memtype); } -EXPORT_SYMBOL_GPL(destroy_memory_type); +EXPORT_SYMBOL_GPL(put_memory_type); void init_node_memory_type(int node, struct memory_dev_type *memtype) { @@ -586,7 +586,7 @@ void clear_node_memory_type(int node, struct memory_dev_type *memtype) */ if (!node_memory_types[node].map_count) { node_memory_types[node].memtype = NULL; - destroy_memory_type(memtype); + put_memory_type(memtype); } mutex_unlock(&memory_tier_lock); } From 8f21912a4bf854e51b3ba69298f559f976d63685 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 6 Jul 2023 17:24:41 +0800 Subject: [PATCH 053/489] mm: remove obsolete comment above struct per_cpu_pages Since commit 01b44456a7aa ("mm/page_alloc: replace local_lock with normal spinlock"), per_cpu_pages is protected by normal spinlock. Remove the obsolete comment as it's not that helpful. 
Link: https://lkml.kernel.org/r/20230706092441.1574950-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5e50b78d58ea68..4106fbc5b4b324 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -676,7 +676,6 @@ enum zone_watermarks { #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost) #define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost) -/* Fields and list protected by pagesets local_lock in page_alloc.c */ struct per_cpu_pages { spinlock_t lock; /* Protects lists field */ int count; /* number of pages in the list */ From 35fb4764c8b2cdd91820761eb03c18106d46b8ae Mon Sep 17 00:00:00 2001 From: Pintu Kumar Date: Fri, 7 Jul 2023 00:03:34 +0530 Subject: [PATCH 054/489] mm: cma: print cma name as well in cma_alloc debug CMA allocation can happen either from global cma or from dedicated cma region. Thus it is helpful to print cma name as well during initial debugging to confirm whether cma regions were getting initialized or not.
Link: https://lkml.kernel.org/r/1688668414-12350-1-git-send-email-quic_pintu@quicinc.com Signed-off-by: Pintu Kumar Signed-off-by: Pintu Agarwal Cc: Minchan Kim Signed-off-by: Andrew Morton --- mm/cma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/cma.c b/mm/cma.c index a4cfe995e11e78..4880f72102fa98 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -436,8 +436,8 @@ struct page *cma_alloc(struct cma *cma, unsigned long count, if (!cma || !cma->count || !cma->bitmap) goto out; - pr_debug("%s(cma %p, count %lu, align %d)\n", __func__, (void *)cma, - count, align); + pr_debug("%s(cma %p, name: %s, count %lu, align %d)\n", __func__, + (void *)cma, cma->name, count, align); if (!count) goto out; From dba438bd7663fefab870a6dd4b01ed0923c32d79 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 6 Jul 2023 20:52:51 +0100 Subject: [PATCH 055/489] rmap: pass the folio to __page_check_anon_rmap() The lone caller already has the folio, so pass it in instead of deriving it from the page again. Link: https://lkml.kernel.org/r/20230706195251.2707542-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/rmap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 0c0d8857dfce47..2668f5ea353428 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1175,14 +1175,14 @@ static void __page_set_anon_rmap(struct folio *folio, struct page *page, /** * __page_check_anon_rmap - sanity check anonymous rmap addition - * @page: the page to add the mapping to + * @folio: The folio containing @page. 
+ * @page: the page to check the mapping of * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped */ -static void __page_check_anon_rmap(struct page *page, +static void __page_check_anon_rmap(struct folio *folio, struct page *page, struct vm_area_struct *vma, unsigned long address) { - struct folio *folio = page_folio(page); /* * The page's anon-rmap details (mapping and index) are guaranteed to * be set up correctly at this point. @@ -1262,7 +1262,7 @@ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, __page_set_anon_rmap(folio, page, vma, address, !!(flags & RMAP_EXCLUSIVE)); else - __page_check_anon_rmap(page, vma, address); + __page_check_anon_rmap(folio, page, vma, address); } mlock_vma_folio(folio, vma, compound); From 0201ebf274a306a6ebb95e5dc2d6a0a27c737cac Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 28 Jun 2023 11:48:51 +0100 Subject: [PATCH 056/489] mm: merge folio_has_private()/filemap_release_folio() call pairs Patch series "mm, netfs, fscache: Stop read optimisation when folio removed from pagecache", v7. This fixes an optimisation in fscache whereby we don't read from the cache for a particular file until we know that there's data there that we don't have in the pagecache. The problem is that I'm no longer using PG_fscache (aka PG_private_2) to indicate that the page is cached and so I don't get a notification when a cached page is dropped from the pagecache. The first patch merges some folio_has_private() and filemap_release_folio() pairs and introduces a helper, folio_needs_release(), to indicate if a release is required. The second patch is the actual fix. Following Willy's suggestions[1], it adds an AS_RELEASE_ALWAYS flag to an address_space that will make filemap_release_folio() always call ->release_folio(), even if PG_private/PG_private_2 aren't set. folio_needs_release() is altered to add a check for this. 
This patch (of 2): Make filemap_release_folio() check folio_has_private(). Then, in most cases, where a call to folio_has_private() is immediately followed by a call to filemap_release_folio(), we can get rid of the test in the pair. There are a couple of sites in mm/vmscan.c where this can't so easily be done. In shrink_folio_list(), there are actually three cases (something different is done for incompletely invalidated buffers), but filemap_release_folio() elides two of them. In shrink_active_list(), we don't have the folio lock yet, so the check allows us to avoid locking the page unnecessarily. A wrapper function to check if a folio needs release is provided for those places that still need to do it in the mm/ directory. This will acquire additional parts to the condition in a future patch. After this, the only remaining caller of folio_has_private() outside of mm/ is a check in fuse. Link: https://lkml.kernel.org/r/20230628104852.3391651-1-dhowells@redhat.com Link: https://lkml.kernel.org/r/20230628104852.3391651-2-dhowells@redhat.com Reported-by: Rohith Surabattula Suggested-by: Matthew Wilcox Signed-off-by: David Howells Cc: Matthew Wilcox Cc: Linus Torvalds Cc: Steve French Cc: Shyam Prasad N Cc: Rohith Surabattula Cc: Dave Wysochanski Cc: Dominique Martinet Cc: Ilya Dryomov Cc: "Theodore Ts'o" Cc: Andreas Dilger Cc: Xiubo Li Cc: Jingbo Xu Signed-off-by: Andrew Morton --- fs/ext4/move_extent.c | 12 ++++-------- fs/splice.c | 3 +-- mm/filemap.c | 2 ++ mm/huge_memory.c | 3 +-- mm/internal.h | 8 ++++++++ mm/khugepaged.c | 3 +-- mm/memory-failure.c | 8 +++----- mm/migrate.c | 3 +-- mm/truncate.c | 6 ++---- mm/vmscan.c | 8 ++++---- 10 files changed, 27 insertions(+), 29 deletions(-) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index f4b4861a74ee19..18a9e7c479754b 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -340,10 +340,8 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
ext4_double_up_write_data_sem(orig_inode, donor_inode); goto data_copy; } - if ((folio_has_private(folio[0]) && - !filemap_release_folio(folio[0], 0)) || - (folio_has_private(folio[1]) && - !filemap_release_folio(folio[1], 0))) { + if (!filemap_release_folio(folio[0], 0) || + !filemap_release_folio(folio[1], 0)) { *err = -EBUSY; goto drop_data_sem; } @@ -362,10 +360,8 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, /* At this point all buffers in range are uptodate, old mapping layout * is no longer required, try to drop it now. */ - if ((folio_has_private(folio[0]) && - !filemap_release_folio(folio[0], 0)) || - (folio_has_private(folio[1]) && - !filemap_release_folio(folio[1], 0))) { + if (!filemap_release_folio(folio[0], 0) || + !filemap_release_folio(folio[1], 0)) { *err = -EBUSY; goto unlock_folios; } diff --git a/fs/splice.c b/fs/splice.c index 3e2a31e1ce6a8f..0d3deeb3857e08 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -83,8 +83,7 @@ static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe, */ folio_wait_writeback(folio); - if (folio_has_private(folio) && - !filemap_release_folio(folio, GFP_KERNEL)) + if (!filemap_release_folio(folio, GFP_KERNEL)) goto out_unlock; /* diff --git a/mm/filemap.c b/mm/filemap.c index 93e495d2d47749..dd022b065614ba 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -4073,6 +4073,8 @@ bool filemap_release_folio(struct folio *folio, gfp_t gfp) struct address_space * const mapping = folio->mapping; BUG_ON(!folio_test_locked(folio)); + if (!folio_needs_release(folio)) + return true; if (folio_test_writeback(folio)) return false; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index eb3678360b97ea..9f3109ed7351fb 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2697,8 +2697,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) gfp = current_gfp_context(mapping_gfp_mask(mapping) & GFP_RECLAIM_MASK); - if (folio_test_private(folio) && - !filemap_release_folio(folio, gfp)) 
{ + if (!filemap_release_folio(folio, gfp)) { ret = -EBUSY; goto out; } diff --git a/mm/internal.h b/mm/internal.h index 721ed07d7fd6f1..822b13de378068 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -176,6 +176,14 @@ static inline void set_page_refcounted(struct page *page) set_page_count(page, 1); } +/* + * Return true if a folio needs ->release_folio() calling upon it. + */ +static inline bool folio_needs_release(struct folio *folio) +{ + return folio_has_private(folio); +} + extern unsigned long highest_memmap_pfn; /* diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 4b8b8673d5d9ff..4e707da4a83c78 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2078,8 +2078,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, goto out_unlock; } - if (folio_has_private(folio) && - !filemap_release_folio(folio, GFP_KERNEL)) { + if (!filemap_release_folio(folio, GFP_KERNEL)) { result = SCAN_PAGE_HAS_PRIVATE; folio_putback_lru(folio); goto out_unlock; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9d87f0b8b8057c..76da955bf10fb8 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -936,14 +936,12 @@ static int truncate_error_page(struct page *p, unsigned long pfn, struct folio *folio = page_folio(p); int err = mapping->a_ops->error_remove_page(mapping, p); - if (err != 0) { + if (err != 0) pr_info("%#lx: Failed to punch page: %d\n", pfn, err); - } else if (folio_has_private(folio) && - !filemap_release_folio(folio, GFP_NOIO)) { + else if (!filemap_release_folio(folio, GFP_NOIO)) pr_info("%#lx: failed to release buffers\n", pfn); - } else { + else ret = MF_RECOVERED; - } } else { /* * If the file system doesn't support it just invalidate diff --git a/mm/migrate.c b/mm/migrate.c index 24baad2571e314..e9821e245e7036 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -922,8 +922,7 @@ static int fallback_migrate_folio(struct address_space *mapping, * Buffers may be managed in a filesystem specific way. 
* We must have no buffers or drop them. */ - if (folio_test_private(src) && - !filemap_release_folio(src, GFP_KERNEL)) + if (!filemap_release_folio(src, GFP_KERNEL)) return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY; return migrate_folio(mapping, dst, src, mode); diff --git a/mm/truncate.c b/mm/truncate.c index 2f28cc0e12ef1d..bd4fafd67f95b1 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -19,7 +19,6 @@ #include #include #include -#include /* grr. try_to_release_page */ #include #include #include "internal.h" @@ -276,7 +275,7 @@ static long mapping_evict_folio(struct address_space *mapping, if (folio_ref_count(folio) > folio_nr_pages(folio) + folio_has_private(folio) + 1) return 0; - if (folio_has_private(folio) && !filemap_release_folio(folio, 0)) + if (!filemap_release_folio(folio, 0)) return 0; return remove_mapping(mapping, folio); @@ -573,8 +572,7 @@ static int invalidate_complete_folio2(struct address_space *mapping, if (folio->mapping != mapping) return 0; - if (folio_has_private(folio) && - !filemap_release_folio(folio, GFP_KERNEL)) + if (!filemap_release_folio(folio, GFP_KERNEL)) return 0; spin_lock(&mapping->host->i_lock); diff --git a/mm/vmscan.c b/mm/vmscan.c index 1080209a568bba..4039620d30fe4a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2064,7 +2064,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, * (refcount == 1) it can be freed. Otherwise, leave * the folio on the LRU so it is swappable. 
*/ - if (folio_has_private(folio)) { + if (folio_needs_release(folio)) { if (!filemap_release_folio(folio, sc->gfp_mask)) goto activate_locked; if (!mapping && folio_ref_count(folio) == 1) { @@ -2729,9 +2729,9 @@ static void shrink_active_list(unsigned long nr_to_scan, } if (unlikely(buffer_heads_over_limit)) { - if (folio_test_private(folio) && folio_trylock(folio)) { - if (folio_test_private(folio)) - filemap_release_folio(folio, 0); + if (folio_needs_release(folio) && + folio_trylock(folio)) { + filemap_release_folio(folio, 0); folio_unlock(folio); } } From b4fa966f03b7401ceacd4ffd7227197afb2b8376 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 28 Jun 2023 11:48:52 +0100 Subject: [PATCH 057/489] mm, netfs, fscache: stop read optimisation when folio removed from pagecache Fscache has an optimisation by which reads from the cache are skipped until we know that (a) there's data there to be read and (b) that data isn't entirely covered by pages resident in the netfs pagecache. This is done with two flags manipulated by fscache_note_page_release(): if (... test_bit(FSCACHE_COOKIE_HAVE_DATA, &cookie->flags) && test_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags)) clear_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags); where the NO_DATA_TO_READ flag causes cachefiles_prepare_read() to indicate that netfslib should download from the server or clear the page instead. The fscache_note_page_release() function is intended to be called from ->releasepage() - but that only gets called if PG_private or PG_private_2 is set - and currently the former is at the discretion of the network filesystem and the latter is only set whilst a page is being written to the cache, so sometimes we miss clearing the optimisation. Fix this by following Willy's suggestion[1] and adding an address_space flag, AS_RELEASE_ALWAYS, that causes filemap_release_folio() to always call ->release_folio() if it's set, even if PG_private or PG_private_2 aren't set. 
Note that this would require folio_test_private() and page_has_private() to become more complicated. To avoid that, in the places[*] where these are used to conditionalise calls to filemap_release_folio() and try_to_release_page(), the tests are removed, those functions are just jumped to unconditionally, and the test is performed there. [*] There are some exceptions in vmscan.c where the check guards more than just a call to the releaser. I've added a function, folio_needs_release() to wrap all the checks for that. AS_RELEASE_ALWAYS should be set if a non-NULL cookie is obtained from fscache and cleared in ->evict_inode() before truncate_inode_pages_final() is called. Additionally, the FSCACHE_COOKIE_NO_DATA_TO_READ flag needs to be cleared and the optimisation cancelled if a cachefiles object already contains data when we open it. [dwysocha@redhat.com: call folio_mapping() inside folio_needs_release()] Link: https://github.com/DaveWysochanskiRH/kernel/commit/902c990e311120179fa5de99d68364b2947b79ec Link: https://lkml.kernel.org/r/20230628104852.3391651-3-dhowells@redhat.com Fixes: 1f67e6d0b188 ("fscache: Provide a function to note the release of a page") Fixes: 047487c947e8 ("cachefiles: Implement the I/O routines") Signed-off-by: David Howells Signed-off-by: Dave Wysochanski Reported-by: Rohith Surabattula Suggested-by: Matthew Wilcox Tested-by: SeongJae Park Cc: Daire Byrne Cc: Matthew Wilcox Cc: Linus Torvalds Cc: Steve French Cc: Shyam Prasad N Cc: Rohith Surabattula Cc: Dave Wysochanski Cc: Dominique Martinet Cc: Ilya Dryomov Cc: Andreas Dilger Cc: Jingbo Xu Cc: "Theodore Ts'o" Cc: Xiubo Li Signed-off-by: Andrew Morton --- fs/9p/cache.c | 2 ++ fs/afs/internal.h | 2 ++ fs/cachefiles/namei.c | 2 ++ fs/ceph/cache.c | 2 ++ fs/nfs/fscache.c | 3 +++ fs/smb/client/fscache.c | 2 ++ include/linux/pagemap.h | 16 ++++++++++++++++ mm/internal.h | 5 ++++- 8 files changed, 33 insertions(+), 1 deletion(-) diff --git a/fs/9p/cache.c b/fs/9p/cache.c index
cebba4eaa0b575..12c0ae29f1857c 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -68,6 +68,8 @@ void v9fs_cache_inode_get_cookie(struct inode *inode) &path, sizeof(path), &version, sizeof(version), i_size_read(&v9inode->netfs.inode)); + if (v9inode->netfs.cache) + mapping_set_release_always(inode->i_mapping); p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n", inode, v9fs_inode_cookie(v9inode)); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 9d3d64921106de..da73b97e19a9af 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -681,6 +681,8 @@ static inline void afs_vnode_set_cache(struct afs_vnode *vnode, { #ifdef CONFIG_AFS_FSCACHE vnode->netfs.cache = cookie; + if (cookie) + mapping_set_release_always(vnode->netfs.inode.i_mapping); #endif } diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index d9d22d0ec38ad2..7bf7a5fcc045f8 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -585,6 +585,8 @@ static bool cachefiles_open_file(struct cachefiles_object *object, if (ret < 0) goto check_failed; + clear_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &object->cookie->flags); + object->file = file; /* Always update the atime on an object we've just looked up (this is diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 177d8e8d73fe42..de1dee46d3df72 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -36,6 +36,8 @@ void ceph_fscache_register_inode_cookie(struct inode *inode) &ci->i_vino, sizeof(ci->i_vino), &ci->i_version, sizeof(ci->i_version), i_size_read(inode)); + if (ci->netfs.cache) + mapping_set_release_always(inode->i_mapping); } void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info *ci) diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index 8c35d88a84b194..b05717fe0d4e4f 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -180,6 +180,9 @@ void nfs_fscache_init_inode(struct inode *inode) &auxdata, /* aux_data */ sizeof(auxdata), i_size_read(inode)); + + if (netfs_inode(inode)->cache) + 
mapping_set_release_always(inode->i_mapping); } /* diff --git a/fs/smb/client/fscache.c b/fs/smb/client/fscache.c index 8f6909d633da8a..3677525ee99311 100644 --- a/fs/smb/client/fscache.c +++ b/fs/smb/client/fscache.c @@ -108,6 +108,8 @@ void cifs_fscache_get_inode_cookie(struct inode *inode) &cifsi->uniqueid, sizeof(cifsi->uniqueid), &cd, sizeof(cd), i_size_read(&cifsi->netfs.inode)); + if (cifsi->netfs.cache) + mapping_set_release_always(inode->i_mapping); } void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 716953ee1ebdb2..0ab0f2362b9b7b 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -203,6 +203,7 @@ enum mapping_flags { /* writeback related tags are not used */ AS_NO_WRITEBACK_TAGS = 5, AS_LARGE_FOLIO_SUPPORT = 6, + AS_RELEASE_ALWAYS, /* Call ->release_folio(), even if no private data */ }; /** @@ -273,6 +274,21 @@ static inline int mapping_use_writeback_tags(struct address_space *mapping) return !test_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags); } +static inline bool mapping_release_always(const struct address_space *mapping) +{ + return test_bit(AS_RELEASE_ALWAYS, &mapping->flags); +} + +static inline void mapping_set_release_always(struct address_space *mapping) +{ + set_bit(AS_RELEASE_ALWAYS, &mapping->flags); +} + +static inline void mapping_clear_release_always(struct address_space *mapping) +{ + clear_bit(AS_RELEASE_ALWAYS, &mapping->flags); +} + static inline gfp_t mapping_gfp_mask(struct address_space * mapping) { return mapping->gfp_mask; diff --git a/mm/internal.h b/mm/internal.h index 822b13de378068..483add0bfb289d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -181,7 +181,10 @@ static inline void set_page_refcounted(struct page *page) */ static inline bool folio_needs_release(struct folio *folio) { - return folio_has_private(folio); + struct address_space *mapping = folio_mapping(folio); + + return folio_has_private(folio) || + (mapping 
&& mapping_release_always(mapping)); } extern unsigned long highest_memmap_pfn; From 9651eeab3c5fceb45d06230bf0839337206450ad Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 7 Jul 2023 23:39:53 +0800 Subject: [PATCH 058/489] mm: correct stale comment of function check_pte Commit 2aff7a4755bed ("mm: Convert page_vma_mapped_walk to work on PFNs") replaced page with pfns in page_vma_mapped_walk structure and updated "@pvmw->page" to "@pvmw->pfn" in comment of function page_vma_mapped_walk. This patch updates the stale "page" to "pfn" in the comment of check_pte. Link: https://lkml.kernel.org/r/20230707153953.1380615-1-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/page_vma_mapped.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 49e0d28f037972..e0b368e545ed00 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -73,20 +73,22 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) } /** - * check_pte - check if @pvmw->page is mapped at the @pvmw->pte - * @pvmw: page_vma_mapped_walk struct, includes a pair pte and page for checking + * check_pte - check if [pvmw->pfn, @pvmw->pfn + @pvmw->nr_pages) is + * mapped at the @pvmw->pte + * @pvmw: page_vma_mapped_walk struct, includes a pair pte and pfn range + * for checking * - * page_vma_mapped_walk() found a place where @pvmw->page is *potentially* + * page_vma_mapped_walk() found a place where pfn range is *potentially* * mapped. check_pte() has to validate this. * * pvmw->pte may point to empty PTE, swap PTE or PTE pointing to * arbitrary page. * * If PVMW_MIGRATION flag is set, returns true if @pvmw->pte contains migration - * entry that points to @pvmw->page or any subpage in case of THP.
+ * entry that points to [pvmw->pfn, @pvmw->pfn + @pvmw->nr_pages) * * If PVMW_MIGRATION flag is not set, returns true if pvmw->pte points to - * pvmw->page or any subpage in case of THP. + * [pvmw->pfn, @pvmw->pfn + @pvmw->nr_pages) * * Otherwise, return false. * From 809ef83ccb61fedc951eccf876a327e940bc412a Mon Sep 17 00:00:00 2001 From: Yang Li Date: Fri, 7 Jul 2023 17:00:34 +0800 Subject: [PATCH 059/489] mm: fix some kernel-doc comments Add description of @mm_wr_locked and @mm. to silence the warnings: mm/memory.c:1716: warning: Function parameter or member 'mm_wr_locked' not described in 'unmap_vmas' mm/memory.c:5110: warning: Function parameter or member 'mm' not described in 'mm_account_fault' Link: https://lkml.kernel.org/r/20230707090034.125511-1-yang.lee@linux.alibaba.com Signed-off-by: Yang Li Signed-off-by: Andrew Morton --- mm/memory.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 5f863b1a0edc64..7fb87a9c025af1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1697,6 +1697,7 @@ static void unmap_single_vma(struct mmu_gather *tlb, * @vma: the starting vma * @start_addr: virtual address at which to start unmapping * @end_addr: virtual address at which to end unmapping + * @mm_wr_locked: lock flag * * Unmap all pages in the vma list. * @@ -5084,7 +5085,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, /** * mm_account_fault - Do page fault accounting - * + * @mm: mm from which memcg should be extracted. It can be NULL. * @regs: the pt_regs struct pointer. When set to NULL, will skip accounting * of perf event counters, but we'll still do the per-task accounting to * the task who triggered this page fault. 
From 94ec20035b05f842dc08277a5a90fba757088f39 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 7 Jul 2023 16:51:46 +0800 Subject: [PATCH 060/489] mm: compaction: use the correct type of list for free pages Use the page->buddy_list instead of page->lru to clarify the correct type of list for free pages. Link: https://lkml.kernel.org/r/b21cd8e2e32b9a1d9bc9e43ebf8acaf35e87f8df.1688715750.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Acked-by: David Hildenbrand Cc: Huang, Ying Cc: Mel Gorman Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/compaction.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index dbc9f86b19343f..43358efdbdc200 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1500,7 +1500,7 @@ static void fast_isolate_freepages(struct compact_control *cc) spin_lock_irqsave(&cc->zone->lock, flags); freelist = &area->free_list[MIGRATE_MOVABLE]; - list_for_each_entry_reverse(freepage, freelist, lru) { + list_for_each_entry_reverse(freepage, freelist, buddy_list) { unsigned long pfn; order_scanned++; @@ -1883,7 +1883,7 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) spin_lock_irqsave(&cc->zone->lock, flags); freelist = &area->free_list[MIGRATE_MOVABLE]; - list_for_each_entry(freepage, freelist, lru) { + list_for_each_entry(freepage, freelist, buddy_list) { unsigned long free_pfn; if (nr_scanned++ >= limit) { From e6e0c7673012f42c6fb8d89af71cd7607c93e0a5 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 7 Jul 2023 16:51:47 +0800 Subject: [PATCH 061/489] mm: compaction: skip the memory hole rapidly when isolating free pages Just like commit 9721fd82351d ("mm: compaction: skip memory hole rapidly when isolating migratable pages"), I can see it will also take more time to skip the larger memory hole (range: 0x1000000000 - 0x1800000000) when isolating free pages on my machine with below memory layout. 
So like commit 9721fd82351d, adding a new helper to skip the memory hole rapidly, which can reduce the time consumed from about 70us to less than 1us. [ 0.000000] Zone ranges: [ 0.000000] DMA [mem 0x0000000040000000-0x00000000ffffffff] [ 0.000000] DMA32 empty [ 0.000000] Normal [mem 0x0000000100000000-0x0000001fa7ffffff] [ 0.000000] Movable zone start for each node [ 0.000000] Early memory node ranges [ 0.000000] node 0: [mem 0x0000000040000000-0x0000000fffffffff] [ 0.000000] node 0: [mem 0x0000001800000000-0x0000001fa3c7ffff] [ 0.000000] node 0: [mem 0x0000001fa3c80000-0x0000001fa3ffffff] [ 0.000000] node 0: [mem 0x0000001fa4000000-0x0000001fa402ffff] [ 0.000000] node 0: [mem 0x0000001fa4030000-0x0000001fa40effff] [ 0.000000] node 0: [mem 0x0000001fa40f0000-0x0000001fa73cffff] [ 0.000000] node 0: [mem 0x0000001fa73d0000-0x0000001fa745ffff] [ 0.000000] node 0: [mem 0x0000001fa7460000-0x0000001fa746ffff] [ 0.000000] node 0: [mem 0x0000001fa7470000-0x0000001fa758ffff] [ 0.000000] node 0: [mem 0x0000001fa7590000-0x0000001fa7ffffff] [shikemeng@huaweicloud.com: avoid missing last page block in section after skip offline sections] Link: https://lkml.kernel.org/r/20230804110454.2935878-1-shikemeng@huaweicloud.com Link: https://lkml.kernel.org/r/20230804110454.2935878-2-shikemeng@huaweicloud.com Link: https://lkml.kernel.org/r/d2ba7e41ee566309b594311207ffca736375fc16.1688715750.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Signed-off-by: Kemeng Shi Reviewed-by: David Hildenbrand Reviewed-by: "Huang, Ying" Cc: Mel Gorman Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/compaction.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index 43358efdbdc200..02239abed6ce40 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -249,11 +249,36 @@ static unsigned long skip_offline_sections(unsigned long start_pfn) return 0; } + +/* + * If the PFN falls into an offline section, 
return the end PFN of the + * next online section in reverse. If the PFN falls into an online section + * or if there is no next online section in reverse, return 0. + */ +static unsigned long skip_offline_sections_reverse(unsigned long start_pfn) +{ + unsigned long start_nr = pfn_to_section_nr(start_pfn); + + if (!start_nr || online_section_nr(start_nr)) + return 0; + + while (start_nr-- > 0) { + if (online_section_nr(start_nr)) + return section_nr_to_pfn(start_nr) + PAGES_PER_SECTION; + } + + return 0; +} #else static unsigned long skip_offline_sections(unsigned long start_pfn) { return 0; } + +static unsigned long skip_offline_sections_reverse(unsigned long start_pfn) +{ + return 0; +} #endif /* @@ -1668,8 +1693,15 @@ static void isolate_freepages(struct compact_control *cc) page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn, zone); - if (!page) + if (!page) { + unsigned long next_pfn; + + next_pfn = skip_offline_sections_reverse(block_start_pfn); + if (next_pfn) + block_start_pfn = max(next_pfn, low_pfn); + continue; + } /* Check the block is suitable for migration */ if (!suitable_migration_target(cc, page)) From c200a7119bc7dc9430e8287563e5343b154ff9d0 Mon Sep 17 00:00:00 2001 From: liuq Date: Fri, 7 Jul 2023 14:05:01 +0800 Subject: [PATCH 062/489] mm/sparse: remove redundant judgments from macro for_each_present_section_nr next_present_section_nr() has already ensured that 'section_nr<=__highest_present_section_nr', so this check is removed. 
Link: https://lkml.kernel.org/r/20230707060501.29184-1-liuq131@chinatelecom.cn Signed-off-by: liuq Signed-off-by: Andrew Morton --- mm/sparse.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/sparse.c b/mm/sparse.c index 297a8b772e8dca..77d91e565045ca 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -172,8 +172,7 @@ static void __section_mark_present(struct mem_section *ms, #define for_each_present_section_nr(start, section_nr) \ for (section_nr = next_present_section_nr(start-1); \ - ((section_nr != -1) && \ - (section_nr <= __highest_present_section_nr)); \ + section_nr != -1; \ section_nr = next_present_section_nr(section_nr)) static inline unsigned long first_present_section_nr(void) From 3ce2c24cb68f228590a053d6058a5901cd31af61 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 7 Jul 2023 11:38:59 +0800 Subject: [PATCH 063/489] mm: hugetlb_vmemmap: fix a race between vmemmap pmd split The local variable @page in __split_vmemmap_huge_pmd() to obtain a pmd page without holding page_table_lock may possibly get the page table page instead of a huge pmd page. The effect may be in set_pte_at() since we may pass an invalid page struct, if set_pte_at() wants to access the page struct (e.g. CONFIG_PAGE_TABLE_CHECK is enabled), it may crash the kernel. So fix it. And inline __split_vmemmap_huge_pmd() since it only has one user.
Link: https://lkml.kernel.org/r/20230707033859.16148-1-songmuchun@bytedance.com Fixes: d8d55f5616cf ("mm: sparsemem: use page table lock to protect kernel pmd operations") Signed-off-by: Muchun Song Cc: Mike Kravetz Cc: Signed-off-by: Andrew Morton --- mm/hugetlb_vmemmap.c | 34 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index c2007ef5e9b056..4b9734777f698f 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -36,14 +36,22 @@ struct vmemmap_remap_walk { struct list_head *vmemmap_pages; }; -static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) +static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) { pmd_t __pmd; int i; unsigned long addr = start; - struct page *page = pmd_page(*pmd); - pte_t *pgtable = pte_alloc_one_kernel(&init_mm); + struct page *head; + pte_t *pgtable; + + spin_lock(&init_mm.page_table_lock); + head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL; + spin_unlock(&init_mm.page_table_lock); + if (!head) + return 0; + + pgtable = pte_alloc_one_kernel(&init_mm); if (!pgtable) return -ENOMEM; @@ -53,7 +61,7 @@ static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) pte_t entry, *pte; pgprot_t pgprot = PAGE_KERNEL; - entry = mk_pte(page + i, pgprot); + entry = mk_pte(head + i, pgprot); pte = pte_offset_kernel(&__pmd, addr); set_pte_at(&init_mm, addr, pte, entry); } @@ -65,8 +73,8 @@ static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) * be treated as indepdenent small pages (as they can be freed * individually). */ - if (!PageReserved(page)) - split_page(page, get_order(PMD_SIZE)); + if (!PageReserved(head)) + split_page(head, get_order(PMD_SIZE)); /* Make pte visible before pmd. See comment in pmd_install(). 
*/ smp_wmb(); @@ -80,20 +88,6 @@ static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) return 0; } -static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) -{ - int leaf; - - spin_lock(&init_mm.page_table_lock); - leaf = pmd_leaf(*pmd); - spin_unlock(&init_mm.page_table_lock); - - if (!leaf) - return 0; - - return __split_vmemmap_huge_pmd(pmd, start); -} - static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct vmemmap_remap_walk *walk) From 3d243659d94fd6d521c4573ec467bacef911ccb3 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 6 Jul 2023 09:38:44 -0700 Subject: [PATCH 064/489] mm/memory: convert do_page_mkwrite() to use folios Saves one implicit call to compound_head(). Link: https://lkml.kernel.org/r/20230706163847.403202-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: ZhangPeng Signed-off-by: Andrew Morton --- mm/memory.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 7fb87a9c025af1..5209f3d80948ad 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2933,7 +2933,7 @@ static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) static vm_fault_t do_page_mkwrite(struct vm_fault *vmf) { vm_fault_t ret; - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); unsigned int old_flags = vmf->flags; vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; @@ -2948,14 +2948,14 @@ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf) if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) return ret; if (unlikely(!(ret & VM_FAULT_LOCKED))) { - lock_page(page); - if (!page->mapping) { - unlock_page(page); + folio_lock(folio); + if (!folio->mapping) { + folio_unlock(folio); return 0; /* retry */ } ret |= VM_FAULT_LOCKED; } else - VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); return ret; } From 
5a97858b51658ccb1a20a3273eb9fedf8fcef6a5 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 6 Jul 2023 09:38:45 -0700 Subject: [PATCH 065/489] mm/memory: convert wp_page_shared() to use folios Saves six implicit calls to compound_head(). Link: https://lkml.kernel.org/r/20230706163847.403202-2-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: ZhangPeng Signed-off-by: Andrew Morton --- mm/memory.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 5209f3d80948ad..a88e57d927bd25 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3283,13 +3283,13 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf) return 0; } -static vm_fault_t wp_page_shared(struct vm_fault *vmf) +static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret = 0; - get_page(vmf->page); + folio_get(folio); if (vma->vm_ops && vma->vm_ops->page_mkwrite) { vm_fault_t tmp; @@ -3298,21 +3298,21 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf) tmp = do_page_mkwrite(vmf); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { - put_page(vmf->page); + folio_put(folio); return tmp; } tmp = finish_mkwrite_fault(vmf); if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { - unlock_page(vmf->page); - put_page(vmf->page); + folio_unlock(folio); + folio_put(folio); return tmp; } } else { wp_page_reuse(vmf); - lock_page(vmf->page); + folio_lock(folio); } ret |= fault_dirty_shared_page(vmf); - put_page(vmf->page); + folio_put(folio); return ret; } @@ -3363,6 +3363,9 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte); + if (vmf->page) + folio = page_folio(vmf->page); + /* * Shared mapping: we are guaranteed to have VM_WRITE and * FAULT_FLAG_WRITE set at this point. 
@@ -3377,12 +3380,9 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) */ if (!vmf->page) return wp_pfn_shared(vmf); - return wp_page_shared(vmf); + return wp_page_shared(vmf, folio); } - if (vmf->page) - folio = page_folio(vmf->page); - /* * Private mapping: create an exclusive anonymous page copy if reuse * is impossible. We might miss VM_WRITE for FOLL_FORCE handling. From 6f609b7e37dff1e8b2261e93da8e2e9848d5513c Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 6 Jul 2023 09:38:46 -0700 Subject: [PATCH 066/489] mm/memory: convert do_shared_fault() to folios Saves three implicit calls to compound_head(). Link: https://lkml.kernel.org/r/20230706163847.403202-3-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: ZhangPeng Signed-off-by: Andrew Morton --- mm/memory.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index a88e57d927bd25..7bebd6909199ae 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4602,21 +4602,24 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret, tmp; + struct folio *folio; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; + folio = page_folio(vmf->page); + /* * Check if the backing address space wants to know that the page is * about to become writable */ if (vma->vm_ops->page_mkwrite) { - unlock_page(vmf->page); + folio_unlock(folio); tmp = do_page_mkwrite(vmf); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { - put_page(vmf->page); + folio_put(folio); return tmp; } } @@ -4624,8 +4627,8 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) ret |= finish_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) { - unlock_page(vmf->page); - put_page(vmf->page); + folio_unlock(folio); + folio_put(folio); return ret; } From 
22d1e68f5a23f8b068da77af6d037bc73748c6e3 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 6 Jul 2023 09:38:47 -0700 Subject: [PATCH 067/489] mm/memory: convert do_read_fault() to use folios Saves one implicit call to compound_head(). Link: https://lkml.kernel.org/r/20230706163847.403202-4-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: ZhangPeng Signed-off-by: Andrew Morton --- mm/memory.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 7bebd6909199ae..ff19719da03261 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4536,6 +4536,7 @@ static inline bool should_fault_around(struct vm_fault *vmf) static vm_fault_t do_read_fault(struct vm_fault *vmf) { vm_fault_t ret = 0; + struct folio *folio; /* * Let's call ->map_pages() first and use ->fault() as fallback @@ -4553,9 +4554,10 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf) return ret; ret |= finish_fault(vmf); - unlock_page(vmf->page); + folio = page_folio(vmf->page); + folio_unlock(folio); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) - put_page(vmf->page); + folio_put(folio); return ret; } From 60b1e24ce8c3334d9204d6229356b750632136be Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 8 Jul 2023 10:33:04 +0800 Subject: [PATCH 068/489] mm/memcg: minor cleanup for MEM_CGROUP_ID_MAX MEM_CGROUP_ID_MAX is only used when CONFIG_MEMCG is configured. So remove unneeded !CONFIG_MEMCG variant. Also it's only used in mem_cgroup_alloc(), so move it from memcontrol.h to memcontrol.c. And further define it as: #define MEM_CGROUP_ID_MAX ((1UL << MEM_CGROUP_ID_SHIFT) - 1) so if someone changes MEM_CGROUP_ID_SHIFT in the future, then MEM_CGROUP_ID_MAX will be updated accordingly, as suggested by Muchun. 
Link: https://lkml.kernel.org/r/20230708023304.1184111-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Muchun Song Cc: Michal Hocko Cc: Roman Gushchin Cc: Johannes Weiner Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 2 -- mm/memcontrol.c | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5818af8eca5a53..58eb7ca6569952 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -61,7 +61,6 @@ struct mem_cgroup_reclaim_cookie { #ifdef CONFIG_MEMCG #define MEM_CGROUP_ID_SHIFT 16 -#define MEM_CGROUP_ID_MAX USHRT_MAX struct mem_cgroup_id { int id; @@ -1158,7 +1157,6 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, #else /* CONFIG_MEMCG */ #define MEM_CGROUP_ID_SHIFT 0 -#define MEM_CGROUP_ID_MAX 0 static inline struct mem_cgroup *folio_memcg(struct folio *folio) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ab99503c9ff2d0..3eaeb69ef9f5f9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5155,6 +5155,7 @@ static struct cftype mem_cgroup_legacy_files[] = { * those references are manageable from userspace. */ +#define MEM_CGROUP_ID_MAX ((1UL << MEM_CGROUP_ID_SHIFT) - 1) static DEFINE_IDR(mem_cgroup_idr); static void mem_cgroup_id_remove(struct mem_cgroup *memcg) From af19487f00f34ff8643921d7909dbb3fedc7e329 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Fri, 7 Jul 2023 14:55:33 -0700 Subject: [PATCH 069/489] mm: make PTE_MARKER_SWAPIN_ERROR more general Patch series "add UFFDIO_POISON to simulate memory poisoning with UFFD", v4. This series adds a new userfaultfd feature, UFFDIO_POISON. See commit 4 for a detailed description of the feature. This patch (of 8): Future patches will reuse PTE_MARKER_SWAPIN_ERROR to implement UFFDIO_POISON, so make some various preparations for that: First, rename it to just PTE_MARKER_POISONED. 
The "SWAPIN" can be confusing since we're going to re-use it for something not really related to swap. This can be particularly confusing for things like hugetlbfs, which doesn't support swap whatsoever. Also rename some various helper functions. Next, fix pte marker copying for hugetlbfs. Previously, it would WARN on seeing a PTE_MARKER_SWAPIN_ERROR, since hugetlbfs doesn't support swap. But, since we're going to re-use it, we want it to go ahead and copy it just like non-hugetlbfs memory does today. Since the code to do this is more complicated now, pull it out into a helper which can be re-used in both places. While we're at it, also make it slightly more explicit in its handling of e.g. uffd wp markers. For non-hugetlbfs page faults, instead of returning VM_FAULT_SIGBUS for an error entry, return VM_FAULT_HWPOISON. For most cases this change doesn't matter, e.g. a userspace program would receive a SIGBUS either way. But for UFFDIO_POISON, this change will let KVM guests get an MCE out of the box, instead of giving a SIGBUS to the hypervisor and requiring it to somehow inject an MCE. Finally, for hugetlbfs faults, handle PTE_MARKER_POISONED, and return VM_FAULT_HWPOISON_LARGE in such cases. Note that this can't happen today because the lack of swap support means we'll never end up with such a PTE anyway, but this behavior will be needed once such entries *can* show up via UFFDIO_POISON. Link: https://lkml.kernel.org/r/20230707215540.2324998-1-axelrasmussen@google.com Link: https://lkml.kernel.org/r/20230707215540.2324998-2-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Acked-by: Peter Xu Cc: Al Viro Cc: Brian Geffon Cc: Christian Brauner Cc: David Hildenbrand Cc: Gaosheng Cui Cc: Huang, Ying Cc: Hugh Dickins Cc: James Houghton Cc: Jan Alexander Steffens (heftig) Cc: Jiaqi Yan Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Liam R. 
Howlett Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Nadav Amit Cc: Naoya Horiguchi Cc: Ryan Roberts Cc: Shuah Khan Cc: Suleiman Souhlal Cc: Suren Baghdasaryan Cc: T.J. Alumbaugh Cc: Yu Zhao Cc: ZhangPeng Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 19 +++++++++++++++++++ include/linux/swapops.h | 15 ++++++++++----- mm/hugetlb.c | 32 +++++++++++++++++++++----------- mm/madvise.c | 2 +- mm/memory.c | 15 +++++++++------ mm/mprotect.c | 4 ++-- mm/shmem.c | 4 ++-- mm/swapfile.c | 2 +- 8 files changed, 65 insertions(+), 28 deletions(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 21d6c72bcc71e3..a86c8460078776 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -523,6 +523,25 @@ static inline bool mm_tlb_flush_nested(struct mm_struct *mm) return atomic_read(&mm->tlb_flush_pending) > 1; } +/* + * Computes the pte marker to copy from the given source entry into dst_vma. + * If no marker should be copied, returns 0. + * The caller should insert a new pte created with make_pte_marker(). + */ +static inline pte_marker copy_pte_marker( + swp_entry_t entry, struct vm_area_struct *dst_vma) +{ + pte_marker srcm = pte_marker_get(entry); + /* Always copy error entries. */ + pte_marker dstm = srcm & PTE_MARKER_POISONED; + + /* Only copy PTE markers if UFFD register matches. */ + if ((srcm & PTE_MARKER_UFFD_WP) && userfaultfd_wp(dst_vma)) + dstm |= PTE_MARKER_UFFD_WP; + + return dstm; +} + /* * If this pte is wr-protected by uffd-wp in any form, arm the special pte to * replace a none pte. NOTE! 
This should only be called when *pte is already diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 4c932cb45e0b41..bff1e8d97de0e0 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -393,7 +393,12 @@ static inline bool is_migration_entry_dirty(swp_entry_t entry) typedef unsigned long pte_marker; #define PTE_MARKER_UFFD_WP BIT(0) -#define PTE_MARKER_SWAPIN_ERROR BIT(1) +/* + * "Poisoned" here is meant in the very general sense of "future accesses are + * invalid", instead of referring very specifically to hardware memory errors. + * This marker is meant to represent any of various different causes of this. + */ +#define PTE_MARKER_POISONED BIT(1) #define PTE_MARKER_MASK (BIT(2) - 1) static inline swp_entry_t make_pte_marker_entry(pte_marker marker) @@ -421,15 +426,15 @@ static inline pte_t make_pte_marker(pte_marker marker) return swp_entry_to_pte(make_pte_marker_entry(marker)); } -static inline swp_entry_t make_swapin_error_entry(void) +static inline swp_entry_t make_poisoned_swp_entry(void) { - return make_pte_marker_entry(PTE_MARKER_SWAPIN_ERROR); + return make_pte_marker_entry(PTE_MARKER_POISONED); } -static inline int is_swapin_error_entry(swp_entry_t entry) +static inline int is_poisoned_swp_entry(swp_entry_t entry) { return is_pte_marker_entry(entry) && - (pte_marker_get(entry) & PTE_MARKER_SWAPIN_ERROR); + (pte_marker_get(entry) & PTE_MARKER_POISONED); } /* diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e3839eee465790..ffee2978dfed4b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -5101,15 +5102,12 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, entry = huge_pte_clear_uffd_wp(entry); set_huge_pte_at(dst, addr, dst_pte, entry); } else if (unlikely(is_pte_marker(entry))) { - /* No swap on hugetlb */ - WARN_ON_ONCE( - is_swapin_error_entry(pte_to_swp_entry(entry))); - /* - * We copy the pte marker only if the dst vma has - * 
uffd-wp enabled. - */ - if (userfaultfd_wp(dst_vma)) - set_huge_pte_at(dst, addr, dst_pte, entry); + pte_marker marker = copy_pte_marker( + pte_to_swp_entry(entry), dst_vma); + + if (marker) + set_huge_pte_at(dst, addr, dst_pte, + make_pte_marker(marker)); } else { entry = huge_ptep_get(src_pte); pte_folio = page_folio(pte_page(entry)); @@ -6089,14 +6087,26 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } entry = huge_ptep_get(ptep); - /* PTE markers should be handled the same way as none pte */ - if (huge_pte_none_mostly(entry)) + if (huge_pte_none_mostly(entry)) { + if (is_pte_marker(entry)) { + pte_marker marker = + pte_marker_get(pte_to_swp_entry(entry)); + + if (marker & PTE_MARKER_POISONED) { + ret = VM_FAULT_HWPOISON_LARGE; + goto out_mutex; + } + } + /* + * Other PTE markers should be handled the same way as none PTE. + * * hugetlb_no_page will drop vma lock and hugetlb fault * mutex internally, which make us return immediately. */ return hugetlb_no_page(mm, vma, mapping, idx, address, ptep, entry, flags); + } ret = 0; diff --git a/mm/madvise.c b/mm/madvise.c index 05f97038eac3da..da65f8bd9ac33b 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -664,7 +664,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, free_swap_and_cache(entry); pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } else if (is_hwpoison_entry(entry) || - is_swapin_error_entry(entry)) { + is_poisoned_swp_entry(entry)) { pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } continue; diff --git a/mm/memory.c b/mm/memory.c index ff19719da03261..36b164ee9ffb0b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -860,8 +860,11 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, return -EBUSY; return -ENOENT; } else if (is_pte_marker_entry(entry)) { - if (is_swapin_error_entry(entry) || userfaultfd_wp(dst_vma)) - set_pte_at(dst_mm, addr, dst_pte, pte); + pte_marker marker = copy_pte_marker(entry, dst_vma); + + if 
(marker) + set_pte_at(dst_mm, addr, dst_pte, + make_pte_marker(marker)); return 0; } if (!userfaultfd_wp(dst_vma)) @@ -1502,7 +1505,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, !zap_drop_file_uffd_wp(details)) continue; } else if (is_hwpoison_entry(entry) || - is_swapin_error_entry(entry)) { + is_poisoned_swp_entry(entry)) { if (!should_zap_cows(details)) continue; } else { @@ -3651,7 +3654,7 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf) * none pte. Otherwise it means the pte could have changed, so retry. * * This should also cover the case where e.g. the pte changed - * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_SWAPIN_ERROR. + * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_POISONED. * So is_pte_marker() check is not enough to safely drop the pte. */ if (pte_same(vmf->orig_pte, ptep_get(vmf->pte))) @@ -3697,8 +3700,8 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf) return VM_FAULT_SIGBUS; /* Higher priority than uffd-wp when data corrupted */ - if (marker & PTE_MARKER_SWAPIN_ERROR) - return VM_FAULT_SIGBUS; + if (marker & PTE_MARKER_POISONED) + return VM_FAULT_HWPOISON; if (pte_marker_entry_uffd_wp(entry)) return pte_marker_handle_uffd_wp(vmf); diff --git a/mm/mprotect.c b/mm/mprotect.c index 6f658d48370478..5c3112d9246648 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -230,10 +230,10 @@ static long change_pte_range(struct mmu_gather *tlb, newpte = pte_swp_mkuffd_wp(newpte); } else if (is_pte_marker_entry(entry)) { /* - * Ignore swapin errors unconditionally, + * Ignore error swap entries unconditionally, * because any access should sigbus anyway. 
*/ - if (is_swapin_error_entry(entry)) + if (is_poisoned_swp_entry(entry)) continue; /* * If this is uffd-wp pte marker and we'd like diff --git a/mm/shmem.c b/mm/shmem.c index 8dfd72bdc86ab8..235f2b2fd20251 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1707,7 +1707,7 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, swp_entry_t swapin_error; void *old; - swapin_error = make_swapin_error_entry(); + swapin_error = make_poisoned_swp_entry(); old = xa_cmpxchg_irq(&mapping->i_pages, index, swp_to_radix_entry(swap), swp_to_radix_entry(swapin_error), 0); @@ -1752,7 +1752,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, swap = radix_to_swp_entry(*foliop); *foliop = NULL; - if (is_swapin_error_entry(swap)) + if (is_poisoned_swp_entry(swap)) return -EIO; si = get_swap_device(swap); diff --git a/mm/swapfile.c b/mm/swapfile.c index d996c335fc3c2a..346e22b8ae970c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1771,7 +1771,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, swp_entry = make_hwpoison_entry(swapcache); page = swapcache; } else { - swp_entry = make_swapin_error_entry(); + swp_entry = make_poisoned_swp_entry(); } new_pte = swp_entry_to_pte(swp_entry); ret = 0; From f92cedfa39ef208c9685d2fdd8215bb58178571b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 8 Jul 2023 18:03:23 -0700 Subject: [PATCH 070/489] mm-make-pte_marker_swapin_error-more-general-fix fix CONFIG_MMU=n build Cc: Axel Rasmussen Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index a86c8460078776..8148b30a9df108 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -523,6 +523,7 @@ static inline bool mm_tlb_flush_nested(struct mm_struct *mm) return atomic_read(&mm->tlb_flush_pending) > 1; } +#ifdef CONFIG_MMU /* * Computes the pte marker to copy from the given source entry into 
dst_vma. * If no marker should be copied, returns 0. @@ -541,6 +542,7 @@ static inline pte_marker copy_pte_marker( return dstm; } +#endif /* * If this pte is wr-protected by uffd-wp in any form, arm the special pte to From 2ef5d7245d9cb86c96c2a881b000834aa929a915 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Fri, 7 Jul 2023 14:55:34 -0700 Subject: [PATCH 071/489] mm: userfaultfd: check for start + len overflow in validate_range Most userfaultfd ioctls take a `start + len` range as an argument. We have the validate_range helper to check that such ranges are valid. However, some (but not all!) ioctls *also* check that `start + len` doesn't wrap around (overflow). Just check for this in validate_range. This saves some repetitive code, and adds the check to some ioctls which weren't bothering to check for it before. [axelrasmussen@google.com: call validate_range() on the src range too] Link: https://lkml.kernel.org/r/20230714182932.2608735-1-axelrasmussen@google.com [axelrasmussen@google.com: fix src/dst validation] Link: https://lkml.kernel.org/r/20230810192128.1855570-1-axelrasmussen@google.com Link: https://lkml.kernel.org/r/20230707215540.2324998-3-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Reviewed-by: Peter Xu Cc: Al Viro Cc: Brian Geffon Cc: Christian Brauner Cc: David Hildenbrand Cc: Gaosheng Cui Cc: Huang, Ying Cc: Hugh Dickins Cc: James Houghton Cc: Jan Alexander Steffens (heftig) Cc: Jiaqi Yan Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Liam R. Howlett Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Nadav Amit Cc: Naoya Horiguchi Cc: Ryan Roberts Cc: Shuah Khan Cc: Suleiman Souhlal Cc: Suren Baghdasaryan Cc: T.J. 
Alumbaugh Cc: Yu Zhao Cc: ZhangPeng Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index ae711f1d7a8308..c2ed7dcf494ee0 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1289,13 +1289,11 @@ static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx, __wake_userfault(ctx, range); } -static __always_inline int validate_range(struct mm_struct *mm, - __u64 start, __u64 len) +static __always_inline int validate_unaligned_range( + struct mm_struct *mm, __u64 start, __u64 len) { __u64 task_size = mm->task_size; - if (start & ~PAGE_MASK) - return -EINVAL; if (len & ~PAGE_MASK) return -EINVAL; if (!len) @@ -1306,9 +1304,20 @@ static __always_inline int validate_range(struct mm_struct *mm, return -EINVAL; if (len > task_size - start) return -EINVAL; + if (start + len <= start) + return -EINVAL; return 0; } +static __always_inline int validate_range(struct mm_struct *mm, + __u64 start, __u64 len) +{ + if (start & ~PAGE_MASK) + return -EINVAL; + + return validate_unaligned_range(mm, start, len); +} + static int userfaultfd_register(struct userfaultfd_ctx *ctx, unsigned long arg) { @@ -1757,17 +1766,15 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, sizeof(uffdio_copy)-sizeof(__s64))) goto out; + ret = validate_unaligned_range(ctx->mm, uffdio_copy.src, + uffdio_copy.len); + if (ret) + goto out; ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len); if (ret) goto out; - /* - * double check for wraparound just in case. copy_from_user() - * will later check uffdio_copy.src + uffdio_copy.len to fit - * in the userland range. 
- */ + ret = -EINVAL; - if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src) - goto out; if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP)) goto out; if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP) @@ -1927,11 +1934,6 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) goto out; ret = -EINVAL; - /* double check for wraparound just in case. */ - if (uffdio_continue.range.start + uffdio_continue.range.len <= - uffdio_continue.range.start) { - goto out; - } if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE | UFFDIO_CONTINUE_MODE_WP)) goto out; From 435cdb41a76fcfa5d6af7e0e39bb8ab5ef4b7a64 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Fri, 7 Jul 2023 14:55:35 -0700 Subject: [PATCH 072/489] mm: userfaultfd: extract file size check out into a helper This code is already duplicated twice, and UFFDIO_POISON will do the same check a third time. So, it's worth extracting into a helper to save repetitive lines of code. Link: https://lkml.kernel.org/r/20230707215540.2324998-4-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Reviewed-by: Peter Xu Cc: Al Viro Cc: Brian Geffon Cc: Christian Brauner Cc: David Hildenbrand Cc: Gaosheng Cui Cc: Huang, Ying Cc: Hugh Dickins Cc: James Houghton Cc: Jan Alexander Steffens (heftig) Cc: Jiaqi Yan Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Liam R. Howlett Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Nadav Amit Cc: Naoya Horiguchi Cc: Ryan Roberts Cc: Shuah Khan Cc: Suleiman Souhlal Cc: Suren Baghdasaryan Cc: T.J. 
Alumbaugh Cc: Yu Zhao Cc: ZhangPeng Signed-off-by: Andrew Morton --- mm/userfaultfd.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index a2bf37ee276d6b..4244ca7ee903a9 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -45,6 +45,22 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm, return dst_vma; } +/* Check if dst_addr is outside of file's size. Must be called with ptl held. */ +static bool mfill_file_over_size(struct vm_area_struct *dst_vma, + unsigned long dst_addr) +{ + struct inode *inode; + pgoff_t offset, max_off; + + if (!dst_vma->vm_file) + return false; + + inode = dst_vma->vm_file->f_inode; + offset = linear_page_index(dst_vma, dst_addr); + max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + return offset >= max_off; +} + /* * Install PTEs, to map dst_addr (within dst_vma) to page. * @@ -64,8 +80,6 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, bool page_in_cache = page_mapping(page); spinlock_t *ptl; struct folio *folio; - struct inode *inode; - pgoff_t offset, max_off; _dst_pte = mk_pte(page, dst_vma->vm_page_prot); _dst_pte = pte_mkdirty(_dst_pte); @@ -81,14 +95,9 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd, if (!dst_pte) goto out; - if (vma_is_shmem(dst_vma)) { - /* serialize against truncate with the page table lock */ - inode = dst_vma->vm_file->f_inode; - offset = linear_page_index(dst_vma, dst_addr); - max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + if (mfill_file_over_size(dst_vma, dst_addr)) { ret = -EFAULT; - if (unlikely(offset >= max_off)) - goto out_unlock; + goto out_unlock; } ret = -EEXIST; @@ -211,8 +220,6 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd, pte_t _dst_pte, *dst_pte; spinlock_t *ptl; int ret; - pgoff_t offset, max_off; - struct inode *inode; _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr), dst_vma->vm_page_prot)); @@ -220,14 +227,9 @@ static int mfill_atomic_pte_zeropage(pmd_t 
*dst_pmd, dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl); if (!dst_pte) goto out; - if (dst_vma->vm_file) { - /* the shmem MAP_PRIVATE case requires checking the i_size */ - inode = dst_vma->vm_file->f_inode; - offset = linear_page_index(dst_vma, dst_addr); - max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + if (mfill_file_over_size(dst_vma, dst_addr)) { ret = -EFAULT; - if (unlikely(offset >= max_off)) - goto out_unlock; + goto out_unlock; } ret = -EEXIST; if (!pte_none(ptep_get(dst_pte))) From fc71884a5f599a603fcc3c2b28b3872c09d19c18 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Fri, 7 Jul 2023 14:55:36 -0700 Subject: [PATCH 073/489] mm: userfaultfd: add new UFFDIO_POISON ioctl The basic idea here is to "simulate" memory poisoning for VMs. A VM running on some host might encounter a memory error, after which some page(s) are poisoned (i.e., future accesses SIGBUS). They expect that once poisoned, pages can never become "un-poisoned". So, when we live migrate the VM, we need to preserve the poisoned status of these pages. When live migrating, we try to get the guest running on its new host as quickly as possible. So, we start it running before all memory has been copied, and before we're certain which pages should be poisoned or not. So the basic way to use this new feature is: - On the new host, the guest's memory is registered with userfaultfd, in either MISSING or MINOR mode (doesn't really matter for this purpose). - On any first access, we get a userfaultfd event. At this point we can communicate with the old host to find out if the page was poisoned. - If so, we can respond with a UFFDIO_POISON - this places a swap marker so any future accesses will SIGBUS. Because the pte is now "present", future accesses won't generate more userfaultfd events, they'll just SIGBUS directly. UFFDIO_POISON does not handle unmapping previously-present PTEs. 
This isn't needed, because during live migration we want to intercept all accesses with userfaultfd (not just writes, so WP mode isn't useful for this). So whether minor or missing mode is being used (or both), the PTE won't be present in any case, so handling that case isn't needed. Similarly, UFFDIO_POISON won't replace existing PTE markers. This might be okay to do, but it seems to be safer to just refuse to overwrite any existing entry (like a UFFD_WP PTE marker). Link: https://lkml.kernel.org/r/20230707215540.2324998-5-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Acked-by: Peter Xu Cc: Al Viro Cc: Brian Geffon Cc: Christian Brauner Cc: David Hildenbrand Cc: Gaosheng Cui Cc: Huang, Ying Cc: Hugh Dickins Cc: James Houghton Cc: Jan Alexander Steffens (heftig) Cc: Jiaqi Yan Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Liam R. Howlett Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Nadav Amit Cc: Naoya Horiguchi Cc: Ryan Roberts Cc: Shuah Khan Cc: Suleiman Souhlal Cc: Suren Baghdasaryan Cc: T.J. 
Alumbaugh Cc: Yu Zhao Cc: ZhangPeng Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 58 ++++++++++++++++++++++++++++++++ include/linux/userfaultfd_k.h | 4 +++ include/uapi/linux/userfaultfd.h | 16 +++++++++ mm/userfaultfd.c | 48 +++++++++++++++++++++++++- 4 files changed, 125 insertions(+), 1 deletion(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index c2ed7dcf494ee0..9854d44ae18ecc 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1967,6 +1967,61 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) return ret; } +static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long arg) +{ + __s64 ret; + struct uffdio_poison uffdio_poison; + struct uffdio_poison __user *user_uffdio_poison; + struct userfaultfd_wake_range range; + + user_uffdio_poison = (struct uffdio_poison __user *)arg; + + ret = -EAGAIN; + if (atomic_read(&ctx->mmap_changing)) + goto out; + + ret = -EFAULT; + if (copy_from_user(&uffdio_poison, user_uffdio_poison, + /* don't copy the output fields */ + sizeof(uffdio_poison) - (sizeof(__s64)))) + goto out; + + ret = validate_range(ctx->mm, uffdio_poison.range.start, + uffdio_poison.range.len); + if (ret) + goto out; + + ret = -EINVAL; + if (uffdio_poison.mode & ~UFFDIO_POISON_MODE_DONTWAKE) + goto out; + + if (mmget_not_zero(ctx->mm)) { + ret = mfill_atomic_poison(ctx->mm, uffdio_poison.range.start, + uffdio_poison.range.len, + &ctx->mmap_changing, 0); + mmput(ctx->mm); + } else { + return -ESRCH; + } + + if (unlikely(put_user(ret, &user_uffdio_poison->updated))) + return -EFAULT; + if (ret < 0) + goto out; + + /* len == 0 would wake all */ + BUG_ON(!ret); + range.len = ret; + if (!(uffdio_poison.mode & UFFDIO_POISON_MODE_DONTWAKE)) { + range.start = uffdio_poison.range.start; + wake_userfault(ctx, &range); + } + ret = range.len == uffdio_poison.range.len ? 
0 : -EAGAIN; + +out: + return ret; +} + static inline unsigned int uffd_ctx_features(__u64 user_features) { /* @@ -2068,6 +2123,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd, case UFFDIO_CONTINUE: ret = userfaultfd_continue(ctx, arg); break; + case UFFDIO_POISON: + ret = userfaultfd_poison(ctx, arg); + break; } return ret; } diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index ac7b0c96d351e4..ac8c6854097cd7 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -46,6 +46,7 @@ enum mfill_atomic_mode { MFILL_ATOMIC_COPY, MFILL_ATOMIC_ZEROPAGE, MFILL_ATOMIC_CONTINUE, + MFILL_ATOMIC_POISON, NR_MFILL_ATOMIC_MODES, }; @@ -83,6 +84,9 @@ extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long len, atomic_t *mmap_changing, uffd_flags_t flags); +extern ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start, + unsigned long len, atomic_t *mmap_changing, + uffd_flags_t flags); extern int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool enable_wp, atomic_t *mmap_changing); diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 66dd4cd277bd6a..b5f07eacc697ff 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -71,6 +71,7 @@ #define _UFFDIO_ZEROPAGE (0x04) #define _UFFDIO_WRITEPROTECT (0x06) #define _UFFDIO_CONTINUE (0x07) +#define _UFFDIO_POISON (0x08) #define _UFFDIO_API (0x3F) /* userfaultfd ioctl ids */ @@ -91,6 +92,8 @@ struct uffdio_writeprotect) #define UFFDIO_CONTINUE _IOWR(UFFDIO, _UFFDIO_CONTINUE, \ struct uffdio_continue) +#define UFFDIO_POISON _IOWR(UFFDIO, _UFFDIO_POISON, \ + struct uffdio_poison) /* read() structure */ struct uffd_msg { @@ -225,6 +228,7 @@ struct uffdio_api { #define UFFD_FEATURE_EXACT_ADDRESS (1<<11) #define UFFD_FEATURE_WP_HUGETLBFS_SHMEM 
(1<<12) #define UFFD_FEATURE_WP_UNPOPULATED (1<<13) +#define UFFD_FEATURE_POISON (1<<14) __u64 features; __u64 ioctls; @@ -321,6 +325,18 @@ struct uffdio_continue { __s64 mapped; }; +struct uffdio_poison { + struct uffdio_range range; +#define UFFDIO_POISON_MODE_DONTWAKE ((__u64)1<<0) + __u64 mode; + + /* + * Fields below here are written by the ioctl and must be at the end: + * the copy_from_user will not read past here. + */ + __s64 updated; +}; + /* * Flags for the userfaultfd(2) system call itself. */ diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 4244ca7ee903a9..68157359dc3440 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -288,6 +288,40 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd, goto out; } +/* Handles UFFDIO_POISON for all non-hugetlb VMAs. */ +static int mfill_atomic_pte_poison(pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + uffd_flags_t flags) +{ + int ret; + struct mm_struct *dst_mm = dst_vma->vm_mm; + pte_t _dst_pte, *dst_pte; + spinlock_t *ptl; + + _dst_pte = make_pte_marker(PTE_MARKER_POISONED); + dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); + + if (mfill_file_over_size(dst_vma, dst_addr)) { + ret = -EFAULT; + goto out_unlock; + } + + ret = -EEXIST; + /* Refuse to overwrite any PTE, even a PTE marker (e.g. UFFD WP). */ + if (!pte_none(*dst_pte)) + goto out_unlock; + + set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(dst_vma, dst_addr, dst_pte); + ret = 0; +out_unlock: + pte_unmap_unlock(dst_pte, ptl); + return ret; +} + static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address) { pgd_t *pgd; @@ -339,7 +373,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb( * by THP. Since we can not reliably insert a zero page, this * feature is not supported. 
*/ - if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) { + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE) || + uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) { mmap_read_unlock(dst_mm); return -EINVAL; } @@ -483,6 +518,9 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd, if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) { return mfill_atomic_pte_continue(dst_pmd, dst_vma, dst_addr, flags); + } else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) { + return mfill_atomic_pte_poison(dst_pmd, dst_vma, + dst_addr, flags); } /* @@ -704,6 +742,14 @@ ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start, uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE)); } +ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start, + unsigned long len, atomic_t *mmap_changing, + uffd_flags_t flags) +{ + return mfill_atomic(dst_mm, start, 0, len, mmap_changing, + uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON)); +} + long uffd_wp_range(struct vm_area_struct *dst_vma, unsigned long start, unsigned long len, bool enable_wp) { From 597425df4fecd272ca48f73feca7833433c16e12 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jul 2023 18:27:17 -0700 Subject: [PATCH 074/489] mm: userfaultfd: add new UFFDIO_POISON ioctl: fix Smatch has observed that pte_offset_map_lock() is now allowed to fail, and then ptl should not be unlocked. Use -EAGAIN here like elsewhere. 
Link: https://lkml.kernel.org/r/bc7bba61-d34f-ad3a-ccf1-c191585ef851@google.com Signed-off-by: Hugh Dickins Reviewed-by: Axel Rasmussen Cc: Dan Carpenter Cc: Peter Xu Signed-off-by: Andrew Morton --- mm/userfaultfd.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 68157359dc3440..dd167184575e2a 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -300,7 +300,10 @@ static int mfill_atomic_pte_poison(pmd_t *dst_pmd, spinlock_t *ptl; _dst_pte = make_pte_marker(PTE_MARKER_POISONED); + ret = -EAGAIN; dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); + if (!dst_pte) + goto out; if (mfill_file_over_size(dst_vma, dst_addr)) { ret = -EFAULT; @@ -319,6 +322,7 @@ static int mfill_atomic_pte_poison(pmd_t *dst_pmd, ret = 0; out_unlock: pte_unmap_unlock(dst_pte, ptl); +out: return ret; } From 8a13897fb0daa8f56821f263f0c63661e1c6acae Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Fri, 7 Jul 2023 14:55:37 -0700 Subject: [PATCH 075/489] mm: userfaultfd: support UFFDIO_POISON for hugetlbfs The behavior here is the same as it is for anon/shmem. This is done separately because hugetlb pte marker handling is a bit different. Link: https://lkml.kernel.org/r/20230707215540.2324998-6-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Acked-by: Peter Xu Cc: Al Viro Cc: Brian Geffon Cc: Christian Brauner Cc: David Hildenbrand Cc: Gaosheng Cui Cc: Huang, Ying Cc: Hugh Dickins Cc: James Houghton Cc: Jan Alexander Steffens (heftig) Cc: Jiaqi Yan Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Liam R. Howlett Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Nadav Amit Cc: Naoya Horiguchi Cc: Ryan Roberts Cc: Shuah Khan Cc: Suleiman Souhlal Cc: Suren Baghdasaryan Cc: T.J. 
Alumbaugh Cc: Yu Zhao Cc: ZhangPeng Signed-off-by: Andrew Morton --- mm/hugetlb.c | 19 +++++++++++++++++++ mm/userfaultfd.c | 3 +-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ffee2978dfed4b..7b076eb07a290e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6262,6 +6262,25 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, int writable; bool folio_in_pagecache = false; + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) { + ptl = huge_pte_lock(h, dst_mm, dst_pte); + + /* Don't overwrite any existing PTEs (even markers) */ + if (!huge_pte_none(huge_ptep_get(dst_pte))) { + spin_unlock(ptl); + return -EEXIST; + } + + _dst_pte = make_pte_marker(PTE_MARKER_POISONED); + set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(dst_vma, dst_addr, dst_pte); + + spin_unlock(ptl); + return 0; + } + if (is_continue) { ret = -EFAULT; folio = filemap_lock_folio(mapping, idx); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index dd167184575e2a..0fc69efa4f1ff8 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -377,8 +377,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb( * by THP. Since we can not reliably insert a zero page, this * feature is not supported. */ - if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE) || - uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) { + if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) { mmap_read_unlock(dst_mm); return -EINVAL; } From f442ab50f5fb581804e4c4b6f8ead394fe975589 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Fri, 7 Jul 2023 14:55:38 -0700 Subject: [PATCH 076/489] mm: userfaultfd: document and enable new UFFDIO_POISON feature Update the userfaultfd API to advertise this feature as part of feature flags and supported ioctls (returned upon registration). Add basic documentation describing the new feature. 
Link: https://lkml.kernel.org/r/20230707215540.2324998-7-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Acked-by: Peter Xu Cc: Al Viro Cc: Brian Geffon Cc: Christian Brauner Cc: David Hildenbrand Cc: Gaosheng Cui Cc: Huang, Ying Cc: Hugh Dickins Cc: James Houghton Cc: Jan Alexander Steffens (heftig) Cc: Jiaqi Yan Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Liam R. Howlett Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Nadav Amit Cc: Naoya Horiguchi Cc: Ryan Roberts Cc: Shuah Khan Cc: Suleiman Souhlal Cc: Suren Baghdasaryan Cc: T.J. Alumbaugh Cc: Yu Zhao Cc: ZhangPeng Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/userfaultfd.rst | 15 +++++++++++++++ include/uapi/linux/userfaultfd.h | 9 ++++++--- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst index 7c304e43220504..4349a8c2b97832 100644 --- a/Documentation/admin-guide/mm/userfaultfd.rst +++ b/Documentation/admin-guide/mm/userfaultfd.rst @@ -244,6 +244,21 @@ write-protected (so future writes will also result in a WP fault). These ioctls support a mode flag (``UFFDIO_COPY_MODE_WP`` or ``UFFDIO_CONTINUE_MODE_WP`` respectively) to configure the mapping this way. +Memory Poisioning Emulation +--------------------------- + +In response to a fault (either missing or minor), an action userspace can +take to "resolve" it is to issue a ``UFFDIO_POISON``. This will cause any +future faulters to either get a SIGBUS, or in KVM's case the guest will +receive an MCE as if there were hardware memory poisoning. + +This is used to emulate hardware memory poisoning. Imagine a VM running on a +machine which experiences a real hardware memory error. Later, we live migrate +the VM to another physical machine. 
Since we want the migration to be +transparent to the guest, we want that same address range to act as if it was +still poisoned, even though it's on a new physical host which ostensibly +doesn't have a memory error in the exact same spot. + QEMU/KVM ======== diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index b5f07eacc697ff..62151706c5a38a 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -39,7 +39,8 @@ UFFD_FEATURE_MINOR_SHMEM | \ UFFD_FEATURE_EXACT_ADDRESS | \ UFFD_FEATURE_WP_HUGETLBFS_SHMEM | \ - UFFD_FEATURE_WP_UNPOPULATED) + UFFD_FEATURE_WP_UNPOPULATED | \ + UFFD_FEATURE_POISON) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -49,12 +50,14 @@ (__u64)1 << _UFFDIO_COPY | \ (__u64)1 << _UFFDIO_ZEROPAGE | \ (__u64)1 << _UFFDIO_WRITEPROTECT | \ - (__u64)1 << _UFFDIO_CONTINUE) + (__u64)1 << _UFFDIO_CONTINUE | \ + (__u64)1 << _UFFDIO_POISON) #define UFFD_API_RANGE_IOCTLS_BASIC \ ((__u64)1 << _UFFDIO_WAKE | \ (__u64)1 << _UFFDIO_COPY | \ + (__u64)1 << _UFFDIO_WRITEPROTECT | \ (__u64)1 << _UFFDIO_CONTINUE | \ - (__u64)1 << _UFFDIO_WRITEPROTECT) + (__u64)1 << _UFFDIO_POISON) /* * Valid ioctl command number range with this API is from 0x00 to From 7cf0f9e83769cb7862dff0221a3ace67d9b2ed9f Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Fri, 7 Jul 2023 14:55:39 -0700 Subject: [PATCH 077/489] selftests/mm: refactor uffd_poll_thread to allow custom fault handlers Previously, we had "one fault handler to rule them all", which used several branches to deal with all of the scenarios required by all of the various tests. In upcoming patches, I plan to add a new test, which has its own slightly different fault handling logic. Instead of continuing to add cruft to the existing fault handler, let's allow tests to define custom ones, separate from other tests. 
Link: https://lkml.kernel.org/r/20230707215540.2324998-8-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Acked-by: Peter Xu Cc: Al Viro Cc: Brian Geffon Cc: Christian Brauner Cc: David Hildenbrand Cc: Gaosheng Cui Cc: Huang, Ying Cc: Hugh Dickins Cc: James Houghton Cc: Jan Alexander Steffens (heftig) Cc: Jiaqi Yan Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Liam R. Howlett Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Nadav Amit Cc: Naoya Horiguchi Cc: Ryan Roberts Cc: Shuah Khan Cc: Suleiman Souhlal Cc: Suren Baghdasaryan Cc: T.J. Alumbaugh Cc: Yu Zhao Cc: ZhangPeng Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/uffd-common.c | 5 ++++- tools/testing/selftests/mm/uffd-common.h | 3 +++ tools/testing/selftests/mm/uffd-stress.c | 8 ++++---- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/mm/uffd-common.c b/tools/testing/selftests/mm/uffd-common.c index ba20d750402261..02b89860e193d8 100644 --- a/tools/testing/selftests/mm/uffd-common.c +++ b/tools/testing/selftests/mm/uffd-common.c @@ -499,6 +499,9 @@ void *uffd_poll_thread(void *arg) int ret; char tmp_chr; + if (!args->handle_fault) + args->handle_fault = uffd_handle_page_fault; + pollfd[0].fd = uffd; pollfd[0].events = POLLIN; pollfd[1].fd = pipefd[cpu*2]; @@ -527,7 +530,7 @@ void *uffd_poll_thread(void *arg) err("unexpected msg event %u\n", msg.event); break; case UFFD_EVENT_PAGEFAULT: - uffd_handle_page_fault(&msg, args); + args->handle_fault(&msg, args); break; case UFFD_EVENT_FORK: close(uffd); diff --git a/tools/testing/selftests/mm/uffd-common.h b/tools/testing/selftests/mm/uffd-common.h index 197f5262fe0d17..7c4fa964c3b088 100644 --- a/tools/testing/selftests/mm/uffd-common.h +++ b/tools/testing/selftests/mm/uffd-common.h @@ -77,6 +77,9 @@ struct uffd_args { unsigned long missing_faults; unsigned long wp_faults; unsigned long minor_faults; + + /* A custom fault handler; defaults to uffd_handle_page_fault. 
*/ + void (*handle_fault)(struct uffd_msg *msg, struct uffd_args *args); }; struct uffd_test_ops { diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c index 995ff13e74c780..73ebb97c70264a 100644 --- a/tools/testing/selftests/mm/uffd-stress.c +++ b/tools/testing/selftests/mm/uffd-stress.c @@ -189,10 +189,8 @@ static int stress(struct uffd_args *args) locking_thread, (void *)cpu)) return 1; if (bounces & BOUNCE_POLL) { - if (pthread_create(&uffd_threads[cpu], &attr, - uffd_poll_thread, - (void *)&args[cpu])) - return 1; + if (pthread_create(&uffd_threads[cpu], &attr, uffd_poll_thread, &args[cpu])) + err("uffd_poll_thread create"); } else { if (pthread_create(&uffd_threads[cpu], &attr, uffd_read_thread, @@ -250,6 +248,8 @@ static int userfaultfd_stress(void) struct uffd_args args[nr_cpus]; uint64_t mem_size = nr_pages * page_size; + memset(args, 0, sizeof(struct uffd_args) * nr_cpus); + if (uffd_test_ctx_init(UFFD_FEATURE_WP_UNPOPULATED, NULL)) err("context init failed"); From 99aa77215ad0254bf15f84e58fe9f3f1d942ff0f Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Fri, 7 Jul 2023 14:55:40 -0700 Subject: [PATCH 078/489] selftests/mm: add uffd unit test for UFFDIO_POISON The test is pretty basic, and exercises UFFDIO_POISON straightforwardly. We register a region with userfaultfd, in missing fault mode. For each fault, we either UFFDIO_COPY a zeroed page (odd pages) or UFFDIO_POISON (even pages). We do this mix to test "something like a real use case", where guest memory would be some mix of poisoned and non-poisoned pages. We read each page in the region, and assert that the odd pages are zeroed as expected, and the even pages yield a SIGBUS as expected. Why UFFDIO_COPY instead of UFFDIO_ZEROPAGE? Because hugetlb doesn't support UFFDIO_ZEROPAGE, and we don't want to have special case code. 
Link: https://lkml.kernel.org/r/20230707215540.2324998-9-axelrasmussen@google.com Signed-off-by: Axel Rasmussen Acked-by: Peter Xu Cc: Al Viro Cc: Brian Geffon Cc: Christian Brauner Cc: David Hildenbrand Cc: Gaosheng Cui Cc: Huang, Ying Cc: Hugh Dickins Cc: James Houghton Cc: Jan Alexander Steffens (heftig) Cc: Jiaqi Yan Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Liam R. Howlett Cc: Miaohe Lin Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Muchun Song Cc: Nadav Amit Cc: Naoya Horiguchi Cc: Ryan Roberts Cc: Shuah Khan Cc: Suleiman Souhlal Cc: Suren Baghdasaryan Cc: T.J. Alumbaugh Cc: Yu Zhao Cc: ZhangPeng Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/uffd-unit-tests.c | 117 +++++++++++++++++++ 1 file changed, 117 insertions(+) diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c index 04d91f144d1cba..2709a34a39c52d 100644 --- a/tools/testing/selftests/mm/uffd-unit-tests.c +++ b/tools/testing/selftests/mm/uffd-unit-tests.c @@ -951,6 +951,117 @@ static void uffd_zeropage_test(uffd_test_args_t *args) uffd_test_pass(); } +static void uffd_register_poison(int uffd, void *addr, uint64_t len) +{ + uint64_t ioctls = 0; + uint64_t expected = (1 << _UFFDIO_COPY) | (1 << _UFFDIO_POISON); + + if (uffd_register_with_ioctls(uffd, addr, len, true, + false, false, &ioctls)) + err("poison register fail"); + + if ((ioctls & expected) != expected) + err("registered area doesn't support COPY and POISON ioctls"); +} + +static void do_uffdio_poison(int uffd, unsigned long offset) +{ + struct uffdio_poison uffdio_poison = { 0 }; + int ret; + __s64 res; + + uffdio_poison.range.start = (unsigned long) area_dst + offset; + uffdio_poison.range.len = page_size; + uffdio_poison.mode = 0; + ret = ioctl(uffd, UFFDIO_POISON, &uffdio_poison); + res = uffdio_poison.updated; + + if (ret) + err("UFFDIO_POISON error: %"PRId64, (int64_t)res); + else if (res != page_size) + err("UFFDIO_POISON unexpected size: %"PRId64, (int64_t)res); +} + 
+static void uffd_poison_handle_fault( + struct uffd_msg *msg, struct uffd_args *args) +{ + unsigned long offset; + + if (msg->event != UFFD_EVENT_PAGEFAULT) + err("unexpected msg event %u", msg->event); + + if (msg->arg.pagefault.flags & + (UFFD_PAGEFAULT_FLAG_WP | UFFD_PAGEFAULT_FLAG_MINOR)) + err("unexpected fault type %llu", msg->arg.pagefault.flags); + + offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst; + offset &= ~(page_size-1); + + /* Odd pages -> copy zeroed page; even pages -> poison. */ + if (offset & page_size) + copy_page(uffd, offset, false); + else + do_uffdio_poison(uffd, offset); +} + +static void uffd_poison_test(uffd_test_args_t *targs) +{ + pthread_t uffd_mon; + char c; + struct uffd_args args = { 0 }; + struct sigaction act = { 0 }; + unsigned long nr_sigbus = 0; + unsigned long nr; + + fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); + + uffd_register_poison(uffd, area_dst, nr_pages * page_size); + memset(area_src, 0, nr_pages * page_size); + + args.handle_fault = uffd_poison_handle_fault; + if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &args)) + err("uffd_poll_thread create"); + + sigbuf = &jbuf; + act.sa_sigaction = sighndl; + act.sa_flags = SA_SIGINFO; + if (sigaction(SIGBUS, &act, 0)) + err("sigaction"); + + for (nr = 0; nr < nr_pages; ++nr) { + unsigned long offset = nr * page_size; + const char *bytes = (const char *) area_dst + offset; + const char *i; + + if (sigsetjmp(*sigbuf, 1)) { + /* + * Access below triggered a SIGBUS, which was caught by + * sighndl, which then jumped here. Count this SIGBUS, + * and move on to next page. 
+ */ + ++nr_sigbus; + continue; + } + + for (i = bytes; i < bytes + page_size; ++i) { + if (*i) + err("nonzero byte in area_dst (%p) at %p: %u", + area_dst, i, *i); + } + } + + if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) + err("pipe write"); + if (pthread_join(uffd_mon, NULL)) + err("pthread_join()"); + + if (nr_sigbus != nr_pages / 2) + err("expected to receive %lu SIGBUS, actually received %lu", + nr_pages / 2, nr_sigbus); + + uffd_test_pass(); +} + /* * Test the returned uffdio_register.ioctls with different register modes. * Note that _UFFDIO_ZEROPAGE is tested separately in the zeropage test. @@ -1126,6 +1237,12 @@ uffd_test_case_t uffd_tests[] = { UFFD_FEATURE_PAGEFAULT_FLAG_WP | UFFD_FEATURE_WP_HUGETLBFS_SHMEM, }, + { + .name = "poison", + .uffd_fn = uffd_poison_test, + .mem_targets = MEM_ALL, + .uffd_feature_required = UFFD_FEATURE_POISON, + }, }; static void usage(const char *prog) From f9044f170c5e89a12116066d1b9b932e934b9efb Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Sun, 9 Jul 2023 11:56:26 +0900 Subject: [PATCH 079/489] zsmalloc: remove obj_tagged() obj_tagged() is not needed at this point, because objects can only have one tag: OBJ_ALLOCATED_TAG. We needed obj_tagged() for the zsmalloc LRU implementation, which has now been removed. Simplify zsmalloc code and revert to the previous implementation that was in place before the zsmalloc LRU series. 
Link: https://lkml.kernel.org/r/20230709025817.3842416-1-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Acked-by: Nhat Pham Cc: Minchan Kim Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 84beadc088b833..32f5bc4074df3c 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -795,8 +795,8 @@ static unsigned long handle_to_obj(unsigned long handle) return *(unsigned long *)handle; } -static bool obj_tagged(struct page *page, void *obj, unsigned long *phandle, - int tag) +static inline bool obj_allocated(struct page *page, void *obj, + unsigned long *phandle) { unsigned long handle; struct zspage *zspage = get_zspage(page); @@ -807,7 +807,7 @@ static bool obj_tagged(struct page *page, void *obj, unsigned long *phandle, } else handle = *(unsigned long *)obj; - if (!(handle & tag)) + if (!(handle & OBJ_ALLOCATED_TAG)) return false; /* Clear all tags before returning the handle */ @@ -815,11 +815,6 @@ static bool obj_tagged(struct page *page, void *obj, unsigned long *phandle, return true; } -static inline bool obj_allocated(struct page *page, void *obj, unsigned long *phandle) -{ - return obj_tagged(page, obj, phandle, OBJ_ALLOCATED_TAG); -} - static void reset_page(struct page *page) { __ClearPageMovable(page); @@ -1551,11 +1546,11 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, } /* - * Find object with a certain tag in zspage from index object and + * Find alloced object in zspage from index object and * return handle. 
*/ -static unsigned long find_tagged_obj(struct size_class *class, - struct page *page, int *obj_idx, int tag) +static unsigned long find_alloced_obj(struct size_class *class, + struct page *page, int *obj_idx) { unsigned int offset; int index = *obj_idx; @@ -1566,7 +1561,7 @@ static unsigned long find_tagged_obj(struct size_class *class, offset += class->size * index; while (offset < PAGE_SIZE) { - if (obj_tagged(page, addr + offset, &handle, tag)) + if (obj_allocated(page, addr + offset, &handle)) break; offset += class->size; @@ -1580,16 +1575,6 @@ static unsigned long find_tagged_obj(struct size_class *class, return handle; } -/* - * Find alloced object in zspage from index object and - * return handle. - */ -static unsigned long find_alloced_obj(struct size_class *class, - struct page *page, int *obj_idx) -{ - return find_tagged_obj(class, page, obj_idx, OBJ_ALLOCATED_TAG); -} - static void migrate_zspage(struct zs_pool *pool, struct zspage *src_zspage, struct zspage *dst_zspage) { From b894da0468640f610d47624e872dc11f2ae5bb4b Mon Sep 17 00:00:00 2001 From: Haifeng Xu Date: Mon, 10 Jul 2023 09:37:50 +0000 Subject: [PATCH 080/489] mm/mm_init.c: mark check_for_memory() as __init The only caller of check_for_memory() is free_area_init(), which is annotated with __init, so it should be safe to also mark the former as __init. Link: https://lkml.kernel.org/r/20230710093750.1294-1-haifeng.xu@shopee.com Signed-off-by: Haifeng Xu Reviewed-by: Mike Rapoport (IBM) Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton --- mm/mm_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mm_init.c b/mm/mm_init.c index f90db54e2b21d1..2daae1dd575553 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1736,7 +1736,7 @@ static void __init free_area_init_node(int nid) } /* Any regular or high memory on that node ? 
*/ -static void check_for_memory(pg_data_t *pgdat) +static void __init check_for_memory(pg_data_t *pgdat) { enum zone_type zone_type; From d03668803bf03bd1353a902edfbc139a6a6c7bc1 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 9 Jul 2023 22:22:23 -0700 Subject: [PATCH 081/489] HWPOISON: offline support: fix spelling in Documentation/ABI/ Correct spelling problems as identified by codespell. Link: https://lkml.kernel.org/r/20230710052223.18254-1-rdunlap@infradead.org Fixes: facb6011f399 ("HWPOISON: Add soft page offline support") Signed-off-by: Randy Dunlap Cc: Andi Kleen Cc: Andi Kleen Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-memory-page-offline | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-memory-page-offline b/Documentation/ABI/testing/sysfs-memory-page-offline index e14703f12fdf38..00f4e35f916f01 100644 --- a/Documentation/ABI/testing/sysfs-memory-page-offline +++ b/Documentation/ABI/testing/sysfs-memory-page-offline @@ -10,7 +10,7 @@ Description: dropping it if possible. The kernel will then be placed on the bad page list and never be reused. - The offlining is done in kernel specific granuality. + The offlining is done in kernel specific granularity. Normally it's the base page size of the kernel, but this might change. @@ -35,7 +35,7 @@ Description: to access this page assuming it's poisoned by the hardware. - The offlining is done in kernel specific granuality. + The offlining is done in kernel specific granularity. Normally it's the base page size of the kernel, but this might change. 
From de7cb03db05a4b460edefff266bbaead70a11634 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 11 Jul 2023 19:40:50 +0200 Subject: [PATCH 082/489] mm/memory_hotplug: document the signal_pending() check in offline_pages() Let's update the documentation that any signal is sufficient, and add a comment that checking for any signal, rather than only for fatal signals, is historical baggage: changing it now could break existing user space, although that is unlikely. For example, when an app provides a custom SIGALRM handler and triggers memory offlining, the timeout cmd would no longer stop memory offlining, because SIGALRM would no longer be considered a fatal signal. Note that using signal_pending() instead of fatal_signal_pending() is an anti-pattern, but slowly deprecating that behavior to eventually change it in the far future is probably not worth the effort. If this ever becomes relevant for user-space, we might want to rethink. Link: https://lkml.kernel.org/r/20230711174050.603820-1-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michal Hocko Cc: Oscar Salvador Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/memory-hotplug.rst | 2 +- mm/memory_hotplug.c | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst index 1b02fe5807cc6c..bd77841041af04 100644 --- a/Documentation/admin-guide/mm/memory-hotplug.rst +++ b/Documentation/admin-guide/mm/memory-hotplug.rst @@ -669,7 +669,7 @@ when still encountering permanently unmovable pages within ZONE_MOVABLE (-> BUG), memory offlining will keep retrying until it eventually succeeds. When offlining is triggered from user space, the offlining context can be -terminated by sending a fatal signal. A timeout based offlining can easily be +terminated by sending a signal.
A timeout based offlining can easily be implemented via:: % timeout $TIMEOUT offline_block | failure_handling diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 3f231cf1b4106e..7cfd13c91568a2 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1843,6 +1843,11 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, do { pfn = start_pfn; do { + /* + * Historically we always checked for any signal and + * can't limit it to fatal signals without eventually + * breaking user space. + */ if (signal_pending(current)) { ret = -EINTR; reason = "signal backoff"; From dbe70dbb41ab45a9ea2fa537c9e6c9817477dfff Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 11 Jul 2023 13:50:09 +0800 Subject: [PATCH 083/489] mm: memory-failure: remove unneeded PageHuge() check Patch series "A few fixup and cleanup patches for memory-failure", v2. This series contains a few fixup patches to fix inaccurate mf_stats, fix race window when trying to get hugetlb folio and so on. Also there is minor cleanup for comments and codestyle. More details can be found in the respective changelogs. This patch (of 8): PageHuge() check in me_huge_page() is just for potential problems. Remove it as it's actually dead code and won't catch anything. 
Link: https://lkml.kernel.org/r/20230711055016.2286677-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20230711055016.2286677-2-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 76da955bf10fb8..9ed2bd25826909 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1187,9 +1187,6 @@ static int me_huge_page(struct page_state *ps, struct page *p) struct address_space *mapping; bool extra_pins = false; - if (!PageHuge(hpage)) - return MF_DELAYED; - mapping = page_mapping(hpage); if (mapping) { res = truncate_error_page(hpage, page_to_pfn(p), mapping); From 92a025a790f82c278cc39b0997e9b3b6f3b69ee0 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 11 Jul 2023 13:50:10 +0800 Subject: [PATCH 084/489] mm: memory-failure: ensure moving HWPoison flag to the raw error pages If hugetlb_vmemmap_optimized is enabled, folio_clear_hugetlb_hwpoison() called from try_memory_failure_hugetlb() won't transfer HWPoison flag to subpages while folio's HWPoison flag is cleared. So when trying to free this hugetlb page into buddy, folio_clear_hugetlb_hwpoison() is not called to move HWPoison flag from head page to the raw error pages even if now hugetlb_vmemmap_optimized is cleared. This will result in the HWPoisoned page being used again and a raw_hwp_page leak.
Link: https://lkml.kernel.org/r/20230711055016.2286677-3-linmiaohe@huawei.com Fixes: ac5fcde0a96a ("mm, hwpoison: make unpoison aware of raw error info in hwpoisoned hugepage") Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9ed2bd25826909..71b4bb691c47e6 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1908,6 +1908,8 @@ void folio_clear_hugetlb_hwpoison(struct folio *folio) { if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return; + if (folio_test_hugetlb_vmemmap_optimized(folio)) + return; folio_clear_hwpoison(folio); folio_free_raw_hwp(folio, true); } From 80ee7cb271b52e5861eda3c67731c95fd55a2627 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 11 Jul 2023 13:50:11 +0800 Subject: [PATCH 085/489] mm: memory-failure: don't account hwpoison_filter() filtered pages mf_generic_kill_procs() will return -EOPNOTSUPP when hwpoison_filter() filters the dax page. In that case, action_result() isn't expected to be called to update mf_stats. Currently it is called anyway, which results in inaccurate but benign memory failure handling statistics. Link: https://lkml.kernel.org/r/20230711055016.2286677-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 71b4bb691c47e6..cac5413c5cc3a9 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2098,7 +2098,8 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, out: /* drop pgmap ref acquired in caller */ put_dev_pagemap(pgmap); - action_result(pfn, MF_MSG_DAX, rc ? MF_FAILED : MF_RECOVERED); + if (rc != -EOPNOTSUPP) + action_result(pfn, MF_MSG_DAX, rc ?
MF_FAILED : MF_RECOVERED); return rc; } From 55c7ac4527086d52dedc5da4ee3d676bcc9a7691 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 11 Jul 2023 13:50:12 +0800 Subject: [PATCH 086/489] mm: memory-failure: use local variable huge to check hugetlb page Use local variable huge to check whether page is hugetlb page to avoid calling PageHuge() multiple times to save cpu cycles. PageHuge() will be stable while extra page refcnt is held. Link: https://lkml.kernel.org/r/20230711055016.2286677-5-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index cac5413c5cc3a9..3b734d51e6dedb 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2617,7 +2617,7 @@ static int soft_offline_in_use_page(struct page *page) } lock_page(page); - if (!PageHuge(page)) + if (!huge) wait_on_page_writeback(page); if (PageHWPoison(page)) { unlock_page(page); @@ -2626,7 +2626,7 @@ static int soft_offline_in_use_page(struct page *page) return 0; } - if (!PageHuge(page) && PageLRU(page) && !PageSwapCache(page)) + if (!huge && PageLRU(page) && !PageSwapCache(page)) /* * Try to invalidate first. This should work for * non dirty unmapped page cache pages. From e9c36f7aca7efee8318b12930b846464b9b5c7a3 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 11 Jul 2023 13:50:13 +0800 Subject: [PATCH 087/489] mm: memory-failure: remove unneeded header files Remove some unneeded header files. No functional change intended. 
Link: https://lkml.kernel.org/r/20230711055016.2286677-6-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 3b734d51e6dedb..44a0ce3849f380 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -39,7 +39,6 @@ #include #include #include -#include #include #include #include @@ -50,7 +49,6 @@ #include #include #include -#include #include #include #include @@ -59,7 +57,6 @@ #include #include #include -#include #include #include #include From 5885c6a62533cbda19e9eceab619bde317de0c0d Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 11 Jul 2023 13:50:14 +0800 Subject: [PATCH 088/489] mm: memory-failure: minor cleanup for comments and codestyle Fix some wrong function names and grammar error in comments. Also remove unneeded space after for_each_process. No functional change intended. Link: https://lkml.kernel.org/r/20230711055016.2286677-7-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 44a0ce3849f380..36529f3c6554a2 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -608,7 +608,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, pgoff = page_to_pgoff(page); read_lock(&tasklist_lock); - for_each_process (tsk) { + for_each_process(tsk) { struct anon_vma_chain *vmac; struct task_struct *t = task_early_kill(tsk, force_early); @@ -652,7 +652,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, /* * Send early kill signal to tasks where a vma covers * the page but the corrupted page is not necessarily - * mapped it in its pte. + * mapped in its pte. 
* Assume applications who requested early kill want * to be informed of all such data corruptions. */ @@ -2117,7 +2117,7 @@ static DEFINE_MUTEX(mf_mutex); * detected by a background scrubber) * * Must run in process context (e.g. a work queue) with interrupts - * enabled and no spinlocks hold. + * enabled and no spinlocks held. * * Return: 0 for successfully handled the memory error, * -EOPNOTSUPP for hwpoison_filter() filtered the error event, @@ -2221,7 +2221,7 @@ int memory_failure(unsigned long pfn, int flags) * otherwise it may race with THP split. * And the flag can't be set in get_hwpoison_page() since * it is called by soft offline too and it is just called - * for !MF_COUNT_INCREASE. So here seems to be the best + * for !MF_COUNT_INCREASED. So here seems to be the best * place. * * Don't need care about the above error handling paths for @@ -2578,10 +2578,10 @@ static bool isolate_page(struct page *page, struct list_head *pagelist) /* * If we succeed to isolate the page, we grabbed another refcount on - * the page, so we can safely drop the one we got from get_any_pages(). + * the page, so we can safely drop the one we got from get_any_page(). * If we failed to isolate the page, it means that we cannot go further * and we will return an error, so drop the reference we got from - * get_any_pages() as well. + * get_any_page() as well. */ put_page(page); return isolated; From a363d1224b5add67a7cafab9fdb9f19d569fbe98 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 11 Jul 2023 13:50:15 +0800 Subject: [PATCH 089/489] mm: memory-failure: fetch compound head after extra page refcnt is held Page might become thp, huge page or be split after compound head is fetched but before page refcnt is bumped. So hpage might be a tail page leading to VM_BUG_ON_PAGE(PageTail(page)) in PageTransHuge().
Link: https://lkml.kernel.org/r/20230711055016.2286677-8-linmiaohe@huawei.com Fixes: 415c64c1453a ("mm/memory-failure: split thp earlier in memory error handling") Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 36529f3c6554a2..133737580a7ecd 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2175,8 +2175,6 @@ int memory_failure(unsigned long pfn, int flags) goto unlock_mutex; } - hpage = compound_head(p); - /* * We need/can do nothing about count=0 pages. * 1) it's a free page, and therefore in safe hand: @@ -2215,6 +2213,7 @@ int memory_failure(unsigned long pfn, int flags) } } + hpage = compound_head(p); if (PageTransHuge(hpage)) { /* * The flag must be set after the refcount is bumped From d31155b8f29ce380f7816e54dee161db6d752909 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 11 Jul 2023 13:50:16 +0800 Subject: [PATCH 090/489] mm: memory-failure: fix race window when trying to get hugetlb folio page_folio() is fetched before calling get_hwpoison_hugetlb_folio() without hugetlb_lock being held. So hugetlb page could be demoted before get_hwpoison_hugetlb_folio() holding hugetlb_lock but after page_folio() is fetched. So get_hwpoison_hugetlb_folio() will hold unexpected extra refcnt of hugetlb folio while leaving demoted page un-refcnted. 
Link: https://lkml.kernel.org/r/20230711055016.2286677-9-linmiaohe@huawei.com Fixes: 25182f05ffed ("mm,hwpoison: fix race with hugetlb page allocation") Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 133737580a7ecd..70f44180ef8040 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1383,8 +1383,15 @@ static int __get_hwpoison_page(struct page *page, unsigned long flags) bool hugetlb = false; ret = get_hwpoison_hugetlb_folio(folio, &hugetlb, false); - if (hugetlb) - return ret; + if (hugetlb) { + /* Make sure hugetlb demotion did not happen from under us. */ + if (folio == page_folio(page)) + return ret; + if (ret > 0) { + folio_put(folio); + folio = page_folio(page); + } + } /* * This check prevents from calling folio_try_get() for any @@ -1473,8 +1480,13 @@ static int __get_unpoison_page(struct page *page) bool hugetlb = false; ret = get_hwpoison_hugetlb_folio(folio, &hugetlb, true); - if (hugetlb) - return ret; + if (hugetlb) { + /* Make sure hugetlb demotion did not happen from under us. */ + if (folio == page_folio(page)) + return ret; + if (ret > 0) + folio_put(folio); + } /* * PageHWPoisonTakenOff pages are not only marked as PG_hwpoison, From 86aa6998ad00af823de81d12d41d7063c14298a0 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Mon, 10 Jul 2023 22:35:44 -0700 Subject: [PATCH 091/489] mm/memory: pass folio into do_page_mkwrite() Saves one implicit call to compound_head(). I'm not sure if I should change the name of the function to do_folio_mkwrite() and update the description comment to reference a folio as the vm_op is still called page_mkwrite. 
Link: https://lkml.kernel.org/r/20230711053544.156617-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Suggested-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/memory.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 36b164ee9ffb0b..44d11812a88f2e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2933,10 +2933,9 @@ static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) * * We do this without the lock held, so that it can sleep if it needs to. */ -static vm_fault_t do_page_mkwrite(struct vm_fault *vmf) +static vm_fault_t do_page_mkwrite(struct vm_fault *vmf, struct folio *folio) { vm_fault_t ret; - struct folio *folio = page_folio(vmf->page); unsigned int old_flags = vmf->flags; vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; @@ -3298,7 +3297,7 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio) vm_fault_t tmp; pte_unmap_unlock(vmf->pte, vmf->ptl); - tmp = do_page_mkwrite(vmf); + tmp = do_page_mkwrite(vmf, folio); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { folio_put(folio); @@ -4621,7 +4620,7 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) */ if (vma->vm_ops->page_mkwrite) { folio_unlock(folio); - tmp = do_page_mkwrite(vmf); + tmp = do_page_mkwrite(vmf, folio); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { folio_put(folio); From d695c30a8ca07ac7e2138435b461b36289d5656e Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Tue, 11 Jul 2023 11:54:38 +0800 Subject: [PATCH 092/489] maple_tree: don't use MAPLE_ARANGE64_META_MAX to indicate no gap Patch series "Improve the validation for maple tree and some cleanup", v2. This patch (of 7): Do not use a special offset to indicate that there is no gap. When there is no gap, offset can point to any valid slots because its gap is 0. 
Link: https://lkml.kernel.org/r/20230711035444.526-1-zhangpeng.00@bytedance.com Link: https://lkml.kernel.org/r/20230711035444.526-3-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Tested-by: Geert Uytterhoeven Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 2 -- lib/maple_tree.c | 13 ++----------- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 6e5bd2c9875d64..7769270b85e89e 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -29,14 +29,12 @@ #define MAPLE_NODE_SLOTS 31 /* 256 bytes including ->parent */ #define MAPLE_RANGE64_SLOTS 16 /* 256 bytes */ #define MAPLE_ARANGE64_SLOTS 10 /* 240 bytes */ -#define MAPLE_ARANGE64_META_MAX 15 /* Out of range for metadata */ #define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 1) #else /* 32bit sizes */ #define MAPLE_NODE_SLOTS 63 /* 256 bytes including ->parent */ #define MAPLE_RANGE64_SLOTS 32 /* 256 bytes */ #define MAPLE_ARANGE64_SLOTS 21 /* 240 bytes */ -#define MAPLE_ARANGE64_META_MAX 31 /* Out of range for metadata */ #define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 2) #endif /* defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) */ diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 7fad4a7a0b058c..da2c8542ad2fce 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1610,8 +1610,6 @@ ma_max_gap(struct maple_node *node, unsigned long *gaps, enum maple_type mt, * mas_max_gap() - find the largest gap in a non-leaf node and set the slot. * @mas: The maple state. * - * If the metadata gap is set to MAPLE_ARANGE64_META_MAX, there is no gap. - * * Return: The gap value. 
*/ static inline unsigned long mas_max_gap(struct ma_state *mas) @@ -1628,9 +1626,6 @@ static inline unsigned long mas_max_gap(struct ma_state *mas) node = mas_mn(mas); MAS_BUG_ON(mas, mt != maple_arange_64); offset = ma_meta_gap(node, mt); - if (offset == MAPLE_ARANGE64_META_MAX) - return 0; - gaps = ma_gaps(node, mt); return gaps[offset]; } @@ -1662,10 +1657,7 @@ static inline void mas_parent_gap(struct ma_state *mas, unsigned char offset, ascend: MAS_BUG_ON(mas, pmt != maple_arange_64); meta_offset = ma_meta_gap(pnode, pmt); - if (meta_offset == MAPLE_ARANGE64_META_MAX) - meta_gap = 0; - else - meta_gap = pgaps[meta_offset]; + meta_gap = pgaps[meta_offset]; pgaps[offset] = new; @@ -1678,7 +1670,6 @@ static inline void mas_parent_gap(struct ma_state *mas, unsigned char offset, ma_set_meta_gap(pnode, pmt, offset); } else if (new < meta_gap) { - meta_offset = 15; new = ma_max_gap(pnode, pgaps, pmt, &meta_offset); ma_set_meta_gap(pnode, pmt, meta_offset); } @@ -2076,7 +2067,7 @@ static inline void mab_mas_cp(struct maple_big_node *b_node, end = j - 1; if (likely(!ma_is_leaf(mt) && mt_is_alloc(mas->tree))) { unsigned long max_gap = 0; - unsigned char offset = 15; + unsigned char offset = 0; gaps = ma_gaps(node, mt); do { From f8e5eac8abe3d26106e5470c735058f04f60f61e Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Tue, 11 Jul 2023 11:54:39 +0800 Subject: [PATCH 093/489] maple_tree: make mas_validate_gaps() to check metadata Make mas_validate_gaps() check whether the offset in the metadata points to the largest gap. By the way, simplify this function. Add the verification that gaps beyond the node limit are zero. Link: https://lkml.kernel.org/r/20230711035444.526-4-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Tested-by: Geert Uytterhoeven Reviewed-by: Liam R. 
Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 78 ++++++++++++++++++++++++++---------------------- 1 file changed, 42 insertions(+), 36 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index da2c8542ad2fce..9ce78e5e608403 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -6957,15 +6957,16 @@ EXPORT_SYMBOL_GPL(mt_dump); static void mas_validate_gaps(struct ma_state *mas) { struct maple_enode *mte = mas->node; - struct maple_node *p_mn; + struct maple_node *p_mn, *node = mte_to_node(mte); + enum maple_type mt = mte_node_type(mas->node); unsigned long gap = 0, max_gap = 0; unsigned long p_end, p_start = mas->min; - unsigned char p_slot; + unsigned char p_slot, offset; unsigned long *gaps = NULL; - unsigned long *pivots = ma_pivots(mte_to_node(mte), mte_node_type(mte)); - int i; + unsigned long *pivots = ma_pivots(node, mt); + unsigned int i; - if (ma_is_dense(mte_node_type(mte))) { + if (ma_is_dense(mt)) { for (i = 0; i < mt_slot_count(mte); i++) { if (mas_get_slot(mas, i)) { if (gap > max_gap) @@ -6978,52 +6979,59 @@ static void mas_validate_gaps(struct ma_state *mas) goto counted; } - gaps = ma_gaps(mte_to_node(mte), mte_node_type(mte)); + gaps = ma_gaps(node, mt); for (i = 0; i < mt_slot_count(mte); i++) { - p_end = mas_logical_pivot(mas, pivots, i, mte_node_type(mte)); + p_end = mas_logical_pivot(mas, pivots, i, mt); if (!gaps) { - if (mas_get_slot(mas, i)) { - gap = 0; - goto not_empty; - } - - gap += p_end - p_start + 1; + if (!mas_get_slot(mas, i)) + gap = p_end - p_start + 1; } else { void *entry = mas_get_slot(mas, i); gap = gaps[i]; - if (!entry) { - if (gap != p_end - p_start + 1) { - pr_err("%p[%u] -> %p %lu != %lu - %lu + 1\n", - mas_mn(mas), i, - mas_get_slot(mas, i), gap, - p_end, p_start); - mt_dump(mas->tree, mt_dump_hex); - - MT_BUG_ON(mas->tree, - gap != p_end - p_start + 1); - } - } else { - if (gap > p_end - p_start + 1) { - pr_err("%p[%u] %lu >= %lu - %lu + 1 (%lu)\n", - mas_mn(mas), i, gap, p_end, p_start, - p_end 
- p_start + 1); - MT_BUG_ON(mas->tree, - gap > p_end - p_start + 1); - } + MT_BUG_ON(mas->tree, !entry); + + if (gap > p_end - p_start + 1) { + pr_err("%p[%u] %lu >= %lu - %lu + 1 (%lu)\n", + mas_mn(mas), i, gap, p_end, p_start, + p_end - p_start + 1); + MT_BUG_ON(mas->tree, gap > p_end - p_start + 1); } } if (gap > max_gap) max_gap = gap; -not_empty: + p_start = p_end + 1; if (p_end >= mas->max) break; } counted: + if (mt == maple_arange_64) { + offset = ma_meta_gap(node, mt); + if (offset > i) { + pr_err("gap offset %p[%u] is invalid\n", node, offset); + MT_BUG_ON(mas->tree, 1); + } + + if (gaps[offset] != max_gap) { + pr_err("gap %p[%u] is not the largest gap %lu\n", + node, offset, max_gap); + MT_BUG_ON(mas->tree, 1); + } + + MT_BUG_ON(mas->tree, !gaps); + for (i++ ; i < mt_slot_count(mte); i++) { + if (gaps[i] != 0) { + pr_err("gap %p[%u] beyond node limit != 0\n", + node, i); + MT_BUG_ON(mas->tree, 1); + } + } + } + if (mte_is_root(mte)) return; @@ -7033,10 +7041,8 @@ static void mas_validate_gaps(struct ma_state *mas) if (ma_gaps(p_mn, mas_parent_type(mas, mte))[p_slot] != max_gap) { pr_err("gap %p[%u] != %lu\n", p_mn, p_slot, max_gap); mt_dump(mas->tree, mt_dump_hex); + MT_BUG_ON(mas->tree, 1); } - - MT_BUG_ON(mas->tree, - ma_gaps(p_mn, mas_parent_type(mas, mte))[p_slot] != max_gap); } static void mas_validate_parent_slot(struct ma_state *mas) From e93fda5a1ab7a0c6143ae8a6f231c9f5f3c417b1 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Tue, 11 Jul 2023 11:54:40 +0800 Subject: [PATCH 094/489] maple_tree: fix mas_validate_child_slot() to check last missed slot Don't break the loop before checking the last slot. Also here check if non-leaf nodes are missing children. Link: https://lkml.kernel.org/r/20230711035444.526-5-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. 
Howlett Tested-by: Geert Uytterhoeven Signed-off-by: Andrew Morton --- lib/maple_tree.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 9ce78e5e608403..af8fb75ad68866 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -7093,11 +7093,12 @@ static void mas_validate_child_slot(struct ma_state *mas) for (i = 0; i < mt_slots[type]; i++) { child = mas_slot(mas, slots, i); - if (!pivots[i] || pivots[i] == mas->max) - break; - if (!child) - break; + if (!child) { + pr_err("Non-leaf node lacks child at %p[%u]\n", + mas_mn(mas), i); + MT_BUG_ON(mas->tree, 1); + } if (mte_parent_slot(child) != i) { pr_err("Slot error at %p[%u]: child %p has pslot %u\n", @@ -7112,6 +7113,9 @@ static void mas_validate_child_slot(struct ma_state *mas) mte_to_node(mas->node)); MT_BUG_ON(mas->tree, 1); } + + if (i < mt_pivots[type] && pivots[i] == mas->max) + break; } } From 33af39d0244ce4944ab16728f7b04df9dfc6d365 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Tue, 11 Jul 2023 11:54:41 +0800 Subject: [PATCH 095/489] maple_tree: make mas_validate_limits() check root node and node limit Update mas_validate_limits() to check root node, check node limit pivot if there is enough room for it to exist and check data_end. Remove the check for child existence as it is done in mas_validate_child_slot(). Link: https://lkml.kernel.org/r/20230711035444.526-6-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Tested-by: Geert Uytterhoeven Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index af8fb75ad68866..31ac4f2c442644 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -7120,7 +7120,9 @@ static void mas_validate_child_slot(struct ma_state *mas) } /* - * Validate all pivots are within mas->min and mas->max. 
+ * Validate all pivots are within mas->min and mas->max, check metadata ends + * where the maximum ends and ensure there is no slots or pivots set outside of + * the end of the data. */ static void mas_validate_limits(struct ma_state *mas) { @@ -7130,26 +7132,15 @@ static void mas_validate_limits(struct ma_state *mas) void __rcu **slots = ma_slots(mte_to_node(mas->node), type); unsigned long *pivots = ma_pivots(mas_mn(mas), type); - /* all limits are fine here. */ - if (mte_is_root(mas->node)) - return; - for (i = 0; i < mt_slots[type]; i++) { unsigned long piv; piv = mas_safe_pivot(mas, pivots, i, type); - if (!piv && (i != 0)) - break; - - if (!mte_is_leaf(mas->node)) { - void *entry = mas_slot(mas, slots, i); - - if (!entry) - pr_err("%p[%u] cannot be null\n", - mas_mn(mas), i); - - MT_BUG_ON(mas->tree, !entry); + if (!piv && (i != 0)) { + pr_err("Missing node limit pivot at %p[%u]", + mas_mn(mas), i); + MAS_WARN_ON(mas, 1); } if (prev_piv > piv) { @@ -7172,6 +7163,13 @@ static void mas_validate_limits(struct ma_state *mas) if (piv == mas->max) break; } + + if (mas_data_end(mas) != i) { + pr_err("node%p: data_end %u != the last slot offset %u\n", + mas_mn(mas), mas_data_end(mas), i); + MT_BUG_ON(mas->tree, 1); + } + for (i += 1; i < mt_slots[type]; i++) { void *entry = mas_slot(mas, slots, i); From a489539e33c29b469bcd023a32c99078c2597c7c Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Tue, 11 Jul 2023 11:54:42 +0800 Subject: [PATCH 096/489] maple_tree: update mt_validate() Instead of using mas_first_entry() to find the leftmost leaf, use a simple loop instead. Remove an unneeded check for root node. To make the error message more accurate, check pivots first and then slots, because checking slots depend on the node limit pivot to break the loop. Link: https://lkml.kernel.org/r/20230711035444.526-7-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Tested-by: Geert Uytterhoeven Reviewed-by: Liam R. 
Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 31ac4f2c442644..e08ef44926c6f1 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -7244,21 +7244,20 @@ void mt_validate(struct maple_tree *mt) if (!mas_searchable(&mas)) goto done; - mas_first_entry(&mas, mas_mn(&mas), ULONG_MAX, mte_node_type(mas.node)); + while (!mte_is_leaf(mas.node)) + mas_descend(&mas); + while (!mas_is_none(&mas)) { MAS_WARN_ON(&mas, mte_dead_node(mas.node)); - if (!mte_is_root(mas.node)) { - end = mas_data_end(&mas); - if (MAS_WARN_ON(&mas, - (end < mt_min_slot_count(mas.node)) && - (mas.max != ULONG_MAX))) { - pr_err("Invalid size %u of %p\n", end, - mas_mn(&mas)); - } + end = mas_data_end(&mas); + if (MAS_WARN_ON(&mas, (end < mt_min_slot_count(mas.node)) && + (mas.max != ULONG_MAX))) { + pr_err("Invalid size %u of %p\n", end, mas_mn(&mas)); } + mas_validate_parent_slot(&mas); - mas_validate_child_slot(&mas); mas_validate_limits(&mas); + mas_validate_child_slot(&mas); if (mt_is_alloc(mt)) mas_validate_gaps(&mas); mas_dfs_postorder(&mas, ULONG_MAX); From 29b2681f1aa95cff6ec0afdeac0b2cab659a5564 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Tue, 11 Jul 2023 11:54:43 +0800 Subject: [PATCH 097/489] maple_tree: replace mas_logical_pivot() with mas_safe_pivot() Replace mas_logical_pivot() with mas_safe_pivot() and drop mas_logical_pivot() since it won't be used anymore. We can do this since now all nodes will have node limit pivot (if it is not full node). Link: https://lkml.kernel.org/r/20230711035444.526-8-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. 
Howlett Tested-by: Geert Uytterhoeven Signed-off-by: Andrew Morton --- lib/maple_tree.c | 33 +++------------------------------ 1 file changed, 3 insertions(+), 30 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index e08ef44926c6f1..4f3209ca0e3b0b 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -728,33 +728,6 @@ mas_safe_min(struct ma_state *mas, unsigned long *pivots, unsigned char offset) return mas->min; } -/* - * mas_logical_pivot() - Get the logical pivot of a given offset. - * @mas: The maple state - * @pivots: The pointer to the maple node pivots - * @offset: The offset into the pivot array - * @type: The maple node type - * - * When there is no value at a pivot (beyond the end of the data), then the - * pivot is actually @mas->max. - * - * Return: the logical pivot of a given @offset. - */ -static inline unsigned long -mas_logical_pivot(struct ma_state *mas, unsigned long *pivots, - unsigned char offset, enum maple_type type) -{ - unsigned long lpiv = mas_safe_pivot(mas, pivots, offset, type); - - if (likely(lpiv)) - return lpiv; - - if (likely(offset)) - return mas->max; - - return lpiv; -} - /* * mte_set_pivot() - Set a pivot to a value in an encoded maple node. 
* @mn: The encoded maple node @@ -2202,7 +2175,7 @@ static noinline_for_kasan void mas_store_b_node(struct ma_wr_state *wr_mas, goto b_end; /* Handle new range ending before old range ends */ - piv = mas_logical_pivot(mas, wr_mas->pivots, offset_end, wr_mas->type); + piv = mas_safe_pivot(mas, wr_mas->pivots, offset_end, wr_mas->type); if (piv > mas->last) { if (piv == ULONG_MAX) mas_bulk_rebalance(mas, b_node->b_end, wr_mas->type); @@ -4935,7 +4908,7 @@ static inline bool mas_anode_descend(struct ma_state *mas, unsigned long size) min = mas_safe_min(mas, pivots, offset); data_end = ma_data_end(node, type, pivots, mas->max); for (; offset <= data_end; offset++) { - pivot = mas_logical_pivot(mas, pivots, offset, type); + pivot = mas_safe_pivot(mas, pivots, offset, type); /* Not within lower bounds */ if (mas->index > pivot) @@ -6981,7 +6954,7 @@ static void mas_validate_gaps(struct ma_state *mas) gaps = ma_gaps(node, mt); for (i = 0; i < mt_slot_count(mte); i++) { - p_end = mas_logical_pivot(mas, pivots, i, mt); + p_end = mas_safe_pivot(mas, pivots, i, mt); if (!gaps) { if (!mas_get_slot(mas, i)) From 6783bd4b5f72b483cf492dc09500548b495670b5 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Tue, 11 Jul 2023 11:54:44 +0800 Subject: [PATCH 098/489] maple_tree: drop mas_first_entry() The internal function mas_first_entry() is no longer used, so drop it. Link: https://lkml.kernel.org/r/20230711035444.526-9-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Tested-by: Geert Uytterhoeven Signed-off-by: Andrew Morton --- lib/maple_tree.c | 72 ------------------------------------------------ 1 file changed, 72 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 4f3209ca0e3b0b..cef47ce8edddf0 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -6636,78 +6636,6 @@ static inline struct maple_enode *mas_get_slot(struct ma_state *mas, offset); } - -/* - * mas_first_entry() - Go the first leaf and find the first entry. 
- * @mas: the maple state. - * @limit: the maximum index to check. - * @*r_start: Pointer to set to the range start. - * - * Sets mas->offset to the offset of the entry, r_start to the range minimum. - * - * Return: The first entry or MAS_NONE. - */ -static inline void *mas_first_entry(struct ma_state *mas, struct maple_node *mn, - unsigned long limit, enum maple_type mt) - -{ - unsigned long max; - unsigned long *pivots; - void __rcu **slots; - void *entry = NULL; - - mas->index = mas->min; - if (mas->index > limit) - goto none; - - max = mas->max; - mas->offset = 0; - while (likely(!ma_is_leaf(mt))) { - MAS_WARN_ON(mas, mte_dead_node(mas->node)); - slots = ma_slots(mn, mt); - entry = mas_slot(mas, slots, 0); - pivots = ma_pivots(mn, mt); - if (unlikely(ma_dead_node(mn))) - return NULL; - max = pivots[0]; - mas->node = entry; - mn = mas_mn(mas); - mt = mte_node_type(mas->node); - } - MAS_WARN_ON(mas, mte_dead_node(mas->node)); - - mas->max = max; - slots = ma_slots(mn, mt); - entry = mas_slot(mas, slots, 0); - if (unlikely(ma_dead_node(mn))) - return NULL; - - /* Slot 0 or 1 must be set */ - if (mas->index > limit) - goto none; - - if (likely(entry)) - return entry; - - mas->offset = 1; - entry = mas_slot(mas, slots, 1); - pivots = ma_pivots(mn, mt); - if (unlikely(ma_dead_node(mn))) - return NULL; - - mas->index = pivots[0] + 1; - if (mas->index > limit) - goto none; - - if (likely(entry)) - return entry; - -none: - if (likely(!ma_dead_node(mn))) - mas->node = MAS_NONE; - return NULL; -} - /* Depth first search, post-order */ static void mas_dfs_postorder(struct ma_state *mas, unsigned long max) { From a349d72fd9efc87c8fd1d16d3164752d84a7275b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jul 2023 21:30:40 -0700 Subject: [PATCH 099/489] mm/pgtable: add rcu_read_lock() and rcu_read_unlock()s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: free retracted page table by RCU", v3. 
Some mmap_lock avoidance i.e. latency reduction. Initially just for the case of collapsing shmem or file pages to THPs: the usefulness of MADV_COLLAPSE on shmem is being limited by that mmap_write_lock it currently requires. Likely to be relied upon later in other contexts e.g. freeing of empty page tables (but that's not work I'm doing). mmap_write_lock avoidance when collapsing to anon THPs? Perhaps, but again that's not work I've done: a quick attempt was not as easy as the shmem/file case. These changes (though of course not these exact patches) have been in Google's data centre kernel for three years now: we do rely upon them. This patch (of 13): Before putting them to use (several commits later), add rcu_read_lock() to pte_offset_map(), and rcu_read_unlock() to pte_unmap(). Make this a separate commit, since it risks exposing imbalances: prior commits have fixed all the known imbalances, but we may find some have been missed. Link: https://lkml.kernel.org/r/7cd843a9-aa80-14f-5eb2-33427363c20@google.com Link: https://lkml.kernel.org/r/d3b01da5-2a6-833c-6681-67a3e024a16f@google.com Signed-off-by: Hugh Dickins Cc: Alexander Gordeev Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huang, Ying Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Kirill A. 
Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Ellerman Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Russell King Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 4 ++-- mm/pgtable-generic.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 5063b482e34f6c..5134edcec6687e 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -99,7 +99,7 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) ((pte_t *)kmap_local_page(pmd_page(*(pmd))) + pte_index((address))) #define pte_unmap(pte) do { \ kunmap_local((pte)); \ - /* rcu_read_unlock() to be added later */ \ + rcu_read_unlock(); \ } while (0) #else static inline pte_t *__pte_map(pmd_t *pmd, unsigned long address) @@ -108,7 +108,7 @@ static inline pte_t *__pte_map(pmd_t *pmd, unsigned long address) } static inline void pte_unmap(pte_t *pte) { - /* rcu_read_unlock() to be added later */ + rcu_read_unlock(); } #endif diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 4d454953046f10..400e5a045848ee 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -236,7 +236,7 @@ pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) { pmd_t pmdval; - /* rcu_read_lock() to be added later */ + rcu_read_lock(); pmdval = pmdp_get_lockless(pmd); if (pmdvalp) *pmdvalp = pmdval; @@ -250,7 +250,7 @@ pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) } return __pte_map(&pmdval, addr); nomap: - /* rcu_read_unlock() to be added later */ + rcu_read_unlock(); return NULL; } From 
146b42e07494e45f7c7bcf2cbf7afd1424afd78e Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jul 2023 21:32:05 -0700 Subject: [PATCH 100/489] mm/pgtable: add PAE safety to __pte_offset_map() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a faint risk that __pte_offset_map(), on a 32-bit architecture with a 64-bit pmd_t e.g. x86-32 with CONFIG_X86_PAE=y, would succeed on a pmdval assembled from a pmd_low and a pmd_high which never belonged together: their combination not pointing to a page table at all, perhaps not even a valid pfn. pmdp_get_lockless() is not enough to prevent that. Guard against that (on such configs) by local_irq_save() blocking TLB flush between present updates, as linux/pgtable.h suggests. It's only needed around the pmdp_get_lockless() in __pte_offset_map(): a race when __pte_offset_map_lock() repeats the pmdp_get_lockless() after getting the lock, would just send it back to __pte_offset_map() again. Complement this pmdp_get_lockless_start() and pmdp_get_lockless_end(), used only locally in __pte_offset_map(), with a pmdp_get_lockless_sync() synonym for tlb_remove_table_sync_one(): to send the necessary interrupt at the right moment on those configs which do not already send it. CONFIG_GUP_GET_PXX_LOW_HIGH is enabled when required by mips, sh and x86. It is not enabled by arm-32 CONFIG_ARM_LPAE: my understanding is that Will Deacon's 2020 enhancements to READ_ONCE() are sufficient for arm. It is not enabled by arc, but its pmd_t is 32-bit even when pte_t 64-bit. Limit the IRQ disablement to CONFIG_HIGHPTE? Perhaps, but would need a little more work, to retry if pmd_low good for page table, but pmd_high non-zero from THP (and that might be making x86-specific assumptions). 
Link: https://lkml.kernel.org/r/3adcd8f-9191-2df1-d7ea-c4877698aad@google.com Signed-off-by: Hugh Dickins Cc: Alexander Gordeev Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huang, Ying Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Ellerman Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Russell King Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 4 ++++ mm/pgtable-generic.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 5134edcec6687e..7f2db400f65342 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -390,6 +390,7 @@ static inline pmd_t pmdp_get_lockless(pmd_t *pmdp) return pmd; } #define pmdp_get_lockless pmdp_get_lockless +#define pmdp_get_lockless_sync() tlb_remove_table_sync_one() #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #endif /* CONFIG_GUP_GET_PXX_LOW_HIGH */ @@ -408,6 +409,9 @@ static inline pmd_t pmdp_get_lockless(pmd_t *pmdp) { return pmdp_get(pmdp); } +static inline void pmdp_get_lockless_sync(void) +{ +} #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 400e5a045848ee..b9a0c2137cc135 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -232,12 +232,41 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned 
long address, #endif #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#if defined(CONFIG_GUP_GET_PXX_LOW_HIGH) && \ + (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RCU)) +/* + * See the comment above ptep_get_lockless() in include/linux/pgtable.h: + * the barriers in pmdp_get_lockless() cannot guarantee that the value in + * pmd_high actually belongs with the value in pmd_low; but holding interrupts + * off blocks the TLB flush between present updates, which guarantees that a + * successful __pte_offset_map() points to a page from matched halves. + */ +static unsigned long pmdp_get_lockless_start(void) +{ + unsigned long irqflags; + + local_irq_save(irqflags); + return irqflags; +} +static void pmdp_get_lockless_end(unsigned long irqflags) +{ + local_irq_restore(irqflags); +} +#else +static unsigned long pmdp_get_lockless_start(void) { return 0; } +static void pmdp_get_lockless_end(unsigned long irqflags) { } +#endif + pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) { + unsigned long irqflags; pmd_t pmdval; rcu_read_lock(); + irqflags = pmdp_get_lockless_start(); pmdval = pmdp_get_lockless(pmd); + pmdp_get_lockless_end(irqflags); + if (pmdvalp) *pmdvalp = pmdval; if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval))) From de2e4626c70605b7ff5ab32b75336547663d465f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jul 2023 21:33:08 -0700 Subject: [PATCH 101/489] arm: adjust_pte() use pte_offset_map_nolock() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of pte_lockptr(), use the recently added pte_offset_map_nolock() in adjust_pte(): because it gives the not-locked ptl for precisely that pte, which the caller can then safely lock; whereas pte_lockptr() is not so tightly coupled, because it dereferences the pmd pointer again. 
Link: https://lkml.kernel.org/r/4d5258bd-ffa0-018-253a-25f2c9b783f7@google.com Signed-off-by: Hugh Dickins Cc: Alexander Gordeev Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huang, Ying Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Ellerman Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Russell King Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/arm/mm/fault-armv.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c index ca5302b0b7eef9..7cb12549797621 100644 --- a/arch/arm/mm/fault-armv.c +++ b/arch/arm/mm/fault-armv.c @@ -117,11 +117,10 @@ static int adjust_pte(struct vm_area_struct *vma, unsigned long address, * must use the nested version. This also means we need to * open-code the spin-locking. 
*/ - pte = pte_offset_map(pmd, address); + pte = pte_offset_map_nolock(vma->vm_mm, pmd, address, &ptl); if (!pte) return 0; - ptl = pte_lockptr(vma->vm_mm, pmd); do_pte_lock(ptl); ret = do_adjust_pte(vma, address, pfn, pte); From 3d140215a6aec37f112aec1606c6a76f7e4443d3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jul 2023 21:34:25 -0700 Subject: [PATCH 102/489] powerpc: assert_pte_locked() use pte_offset_map_nolock() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of pte_lockptr(), use the recently added pte_offset_map_nolock() in assert_pte_locked(). BUG if pte_offset_map_nolock() fails. This mod might cause new crashes: which either expose my ignorance, or indicate issues to be fixed, or limit the usage of assert_pte_locked(). [hughd@google.com: assert_pte_locked() still needs the pmd_none() check] Link: https://lkml.kernel.org/r/c73d1543-532c-3da2-8cf2-a95363a14116@google.com Link: https://lkml.kernel.org/r/e8d56c95-c132-a82e-5f5f-7bb1b738b057@google.com Signed-off-by: Hugh Dickins Cc: Alexander Gordeev Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huang, Ying Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Kirill A. 
Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Ellerman Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Russell King Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/powerpc/mm/pgtable.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index cb2dcdb18f8ec4..a3dcdb2d5b4b6a 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -311,6 +311,8 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr) p4d_t *p4d; pud_t *pud; pmd_t *pmd; + pte_t *pte; + spinlock_t *ptl; if (mm == &init_mm) return; @@ -329,8 +331,10 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr) */ if (pmd_none(*pmd)) return; - BUG_ON(!pmd_present(*pmd)); - assert_spin_locked(pte_lockptr(mm, pmd)); + pte = pte_offset_map_nolock(mm, pmd, addr, &ptl); + BUG_ON(!pte); + assert_spin_locked(ptl); + pte_unmap(pte); } #endif /* CONFIG_DEBUG_VM */ From 32cc0b7c9d508efde8946a82eb3c4acfa8dfed15 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jul 2023 21:35:59 -0700 Subject: [PATCH 103/489] powerpc: add pte_free_defer() for pgtables sharing page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add powerpc-specific pte_free_defer(), to free table page via call_rcu(). pte_free_defer() will be called inside khugepaged's retract_page_tables() loop, where allocating extra memory cannot be relied upon. This precedes the generic version to avoid build breakage from incompatible pgtable_t. 
This is awkward because the struct page contains only one rcu_head, but that page may be shared between PTE_FRAG_NR pagetables, each wanting to use the rcu_head at the same time. But powerpc never reuses a fragment once it has been freed: so mark the page Active in pte_free_defer(), before calling pte_fragment_free() directly; and there call_rcu() to pte_free_now() when last fragment is freed and the page is PageActive. Link: https://lkml.kernel.org/r/6e3ca5f1-334d-4b14-b92d-fc8e99914fcb@google.com Suggested-by: Jason Gunthorpe Signed-off-by: Hugh Dickins Cc: Alexander Gordeev Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huang, Ying Cc: Ira Weiny Cc: Jann Horn Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Ellerman Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Russell King Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/pgalloc.h | 4 ++++ arch/powerpc/mm/pgtable-frag.c | 29 ++++++++++++++++++++++++++--- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h index 3360cad78acead..3a971e2a8c73df 100644 --- a/arch/powerpc/include/asm/pgalloc.h +++ b/arch/powerpc/include/asm/pgalloc.h @@ -45,6 +45,10 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t ptepage) pte_fragment_free((unsigned long *)ptepage, 0); } +/* arch use pte_free_defer() implementation in 
arch/powerpc/mm/pgtable-frag.c */ +#define pte_free_defer pte_free_defer +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable); + /* * Functions that deal with pagetables that could be at any level of * the table need to be passed an "index_size" so they know how to diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c index 20652daa1d7e3a..0c6b681300256c 100644 --- a/arch/powerpc/mm/pgtable-frag.c +++ b/arch/powerpc/mm/pgtable-frag.c @@ -106,6 +106,15 @@ pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel) return __alloc_for_ptecache(mm, kernel); } +static void pte_free_now(struct rcu_head *head) +{ + struct page *page; + + page = container_of(head, struct page, rcu_head); + pgtable_pte_page_dtor(page); + __free_page(page); +} + void pte_fragment_free(unsigned long *table, int kernel) { struct page *page = virt_to_page(table); @@ -115,8 +124,22 @@ void pte_fragment_free(unsigned long *table, int kernel) BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0); if (atomic_dec_and_test(&page->pt_frag_refcount)) { - if (!kernel) - pgtable_pte_page_dtor(page); - __free_page(page); + if (kernel) + __free_page(page); + else if (TestClearPageActive(page)) + call_rcu(&page->rcu_head, pte_free_now); + else + pte_free_now(&page->rcu_head); } } + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) +{ + struct page *page; + + page = virt_to_page(pgtable); + SetPageActive(page); + pte_fragment_free((unsigned long *)pgtable, 0); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ From ad1ac8d94cde7709e3ea5360963ff70df2c0b4aa Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jul 2023 21:37:24 -0700 Subject: [PATCH 104/489] sparc: add pte_free_defer() for pte_t *pgtable_t MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add sparc-specific pte_free_defer(), to call pte_free() via call_rcu(). 
pte_free_defer() will be called inside khugepaged's retract_page_tables() loop, where allocating extra memory cannot be relied upon. This precedes the generic version to avoid build breakage from incompatible pgtable_t. sparc32 supports pagetables sharing a page, but does not support THP; sparc64 supports THP, but does not support pagetables sharing a page. So the sparc-specific pte_free_defer() is as simple as the generic one, except for converting between pte_t *pgtable_t and struct page *. Link: https://lkml.kernel.org/r/dc4f318d-a66a-5622-dc44-9018ea814b37@google.com Signed-off-by: Hugh Dickins Cc: Alexander Gordeev Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huang, Ying Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Ellerman Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Russell King Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/sparc/include/asm/pgalloc_64.h | 4 ++++ arch/sparc/mm/init_64.c | 16 ++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/arch/sparc/include/asm/pgalloc_64.h b/arch/sparc/include/asm/pgalloc_64.h index 7b5561d17ab14c..caa7632be4c2ae 100644 --- a/arch/sparc/include/asm/pgalloc_64.h +++ b/arch/sparc/include/asm/pgalloc_64.h @@ -65,6 +65,10 @@ pgtable_t pte_alloc_one(struct mm_struct *mm); void pte_free_kernel(struct mm_struct *mm, pte_t *pte); void pte_free(struct mm_struct *mm, 
pgtable_t ptepage); +/* arch use pte_free_defer() implementation in arch/sparc/mm/init_64.c */ +#define pte_free_defer pte_free_defer +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable); + #define pmd_populate_kernel(MM, PMD, PTE) pmd_set(MM, PMD, PTE) #define pmd_populate(MM, PMD, PTE) pmd_set(MM, PMD, PTE) diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 04f9db0c311179..0d7fd793924c85 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2930,6 +2930,22 @@ void pgtable_free(void *table, bool is_page) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void pte_free_now(struct rcu_head *head) +{ + struct page *page; + + page = container_of(head, struct page, rcu_head); + __pte_free((pgtable_t)page_address(page)); +} + +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) +{ + struct page *page; + + page = virt_to_page(pgtable); + call_rcu(&page->rcu_head, pte_free_now); +} + void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd) { From 8211dad6279817a8966ff6b74c2c588dd4166f45 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jul 2023 21:38:35 -0700 Subject: [PATCH 105/489] s390: add pte_free_defer() for pgtables sharing page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add s390-specific pte_free_defer(), to free table page via call_rcu(). pte_free_defer() will be called inside khugepaged's retract_page_tables() loop, where allocating extra memory cannot be relied upon. This precedes the generic version to avoid build breakage from incompatible pgtable_t. This version is more complicated than others: because s390 fits two 2K page tables into one 4K page (so page->rcu_head must be shared between both halves), and already uses page->lru (which page->rcu_head overlays) to list any free halves; with clever management by page->_refcount bits. 
Build upon the existing management, adjusted to follow a new rule: that a page is never on the free list if pte_free_defer() was used on either half (marked by PageActive). And for simplicity, delay calling RCU until both halves are freed. Not adding back unallocated fragments to the list in pte_free_defer() can result in wasting some amount of memory for pagetables, depending on how long the allocated fragment will stay in use. In practice, this effect is expected to be insignificant, and not to justify a far more complex approach, which might allow adding the fragments back later in __tlb_remove_table(), where we might not have a stable mm any more. [hughd@google.com: Claudio finds warning on mm_has_pgste() more useful than on mm_alloc_pgste()] Link: https://lkml.kernel.org/r/3bc095ba-a180-ce3b-82b1-2bfc64612f3@google.com Link: https://lkml.kernel.org/r/94eccf5f-264c-8abe-4567-e77f4b4e14a@google.com Signed-off-by: Hugh Dickins Reviewed-by: Gerald Schaefer Tested-by: Alexander Gordeev Acked-by: Alexander Gordeev Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: "David S. Miller" Cc: Heiko Carstens Cc: Huang, Ying Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Kirill A.
Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Ellerman Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Russell King Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/s390/include/asm/pgalloc.h | 4 ++ arch/s390/mm/pgalloc.c | 80 ++++++++++++++++++++++++++++----- 2 files changed, 72 insertions(+), 12 deletions(-) diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index 17eb618f1348ad..89a9d5ef94f866 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -143,6 +143,10 @@ static inline void pmd_populate(struct mm_struct *mm, #define pte_free_kernel(mm, pte) page_table_free(mm, (unsigned long *) pte) #define pte_free(mm, pte) page_table_free(mm, (unsigned long *) pte) +/* arch use pte_free_defer() implementation in arch/s390/mm/pgalloc.c */ +#define pte_free_defer pte_free_defer +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable); + void vmem_map_init(void); void *vmem_crst_alloc(unsigned long val); pte_t *vmem_pte_alloc(void); diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 66ab68db98428c..d7374add78209e 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -229,6 +229,15 @@ void page_table_free_pgste(struct page *page) * logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable * while the PP bits are never used, nor such a page is added to or removed * from mm_context_t::pgtable_list. + * + * pte_free_defer() overrides those rules: it takes the page off pgtable_list, + * and prevents both 2K fragments from being reused. 
pte_free_defer() has to + * guarantee that its pgtable cannot be reused before the RCU grace period + * has elapsed (which page_table_free_rcu() does not actually guarantee). + * But for simplicity, because page->rcu_head overlays page->lru, and because + * the RCU callback might not be called before the mm_context_t has been freed, + * pte_free_defer() in this implementation prevents both fragments from being + * reused, and delays making the call to RCU until both fragments are freed. */ unsigned long *page_table_alloc(struct mm_struct *mm) { @@ -261,7 +270,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) table += PTRS_PER_PTE; atomic_xor_bits(&page->_refcount, 0x01U << (bit + 24)); - list_del(&page->lru); + list_del_init(&page->lru); } } spin_unlock_bh(&mm->context.lock); @@ -281,6 +290,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) table = (unsigned long *) page_to_virt(page); if (mm_alloc_pgste(mm)) { /* Return 4K page table with PGSTEs */ + INIT_LIST_HEAD(&page->lru); atomic_xor_bits(&page->_refcount, 0x03U << 24); memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); @@ -300,7 +310,9 @@ static void page_table_release_check(struct page *page, void *table, { char msg[128]; - if (!IS_ENABLED(CONFIG_DEBUG_VM) || !mask) + if (!IS_ENABLED(CONFIG_DEBUG_VM)) + return; + if (!mask && list_empty(&page->lru)) return; snprintf(msg, sizeof(msg), "Invalid pgtable %p release half 0x%02x mask 0x%02x", @@ -308,6 +320,15 @@ static void page_table_release_check(struct page *page, void *table, dump_page(page, msg); } +static void pte_free_now(struct rcu_head *head) +{ + struct page *page; + + page = container_of(head, struct page, rcu_head); + pgtable_pte_page_dtor(page); + __free_page(page); +} + void page_table_free(struct mm_struct *mm, unsigned long *table) { unsigned int mask, bit, half; @@ -325,10 +346,17 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) */ mask = 
atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); mask >>= 24; - if (mask & 0x03U) + if ((mask & 0x03U) && !PageActive(page)) { + /* + * Other half is allocated, and neither half has had + * its free deferred: add page to head of list, to make + * this freed half available for immediate reuse. + */ list_add(&page->lru, &mm->context.pgtable_list); - else - list_del(&page->lru); + } else { + /* If page is on list, now remove it. */ + list_del_init(&page->lru); + } spin_unlock_bh(&mm->context.lock); mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24)); mask >>= 24; @@ -342,8 +370,10 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) } page_table_release_check(page, table, half, mask); - pgtable_pte_page_dtor(page); - __free_page(page); + if (TestClearPageActive(page)) + call_rcu(&page->rcu_head, pte_free_now); + else + pte_free_now(&page->rcu_head); } void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, @@ -370,10 +400,18 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, */ mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); mask >>= 24; - if (mask & 0x03U) + if ((mask & 0x03U) && !PageActive(page)) { + /* + * Other half is allocated, and neither half has had + * its free deferred: add page to end of list, to make + * this freed half available for reuse once its pending + * bit has been cleared by __tlb_remove_table(). + */ list_add_tail(&page->lru, &mm->context.pgtable_list); - else - list_del(&page->lru); + } else { + /* If page is on list, now remove it. 
*/ + list_del_init(&page->lru); + } spin_unlock_bh(&mm->context.lock); table = (unsigned long *) ((unsigned long) table | (0x01U << bit)); tlb_remove_table(tlb, table); @@ -403,9 +441,27 @@ void __tlb_remove_table(void *_table) } page_table_release_check(page, table, half, mask); - pgtable_pte_page_dtor(page); - __free_page(page); + if (TestClearPageActive(page)) + call_rcu(&page->rcu_head, pte_free_now); + else + pte_free_now(&page->rcu_head); +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) +{ + struct page *page; + + page = virt_to_page(pgtable); + SetPageActive(page); + page_table_free(mm, (unsigned long *)pgtable); + /* + * page_table_free() does not do the pgste gmap_unlink() which + * page_table_free_rcu() does: warn us if pgste ever reaches here. + */ + WARN_ON_ONCE(mm_has_pgste(mm)); } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* * Base infrastructure required to generate basic asces, region, segment, From 13cf577e6b66a148d6d63f5ef7801f4b61d5850f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jul 2023 21:39:48 -0700 Subject: [PATCH 106/489] mm/pgtable: add pte_free_defer() for pgtable as page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the generic pte_free_defer(), to call pte_free() via call_rcu(). pte_free_defer() will be called inside khugepaged's retract_page_tables() loop, where allocating extra memory cannot be relied upon. This version suits all those architectures which use an unfragmented page for one page table (none of whose pte_free()s use the mm arg which was passed to it). Link: https://lkml.kernel.org/r/78e921b0-b681-a1b0-dc20-44c9efa4ef3c@google.com Signed-off-by: Hugh Dickins Cc: Alexander Gordeev Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: "David S. 
Miller" Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huang, Ying Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Ellerman Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Russell King Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 4 ++++ include/linux/pgtable.h | 2 ++ mm/pgtable-generic.c | 20 ++++++++++++++++++++ 3 files changed, 26 insertions(+) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 51d04c1847c113..1fc4b9c2c8a68c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -144,6 +144,10 @@ struct page { struct { /* Page table pages */ unsigned long _pt_pad_1; /* compound_head */ pgtable_t pmd_huge_pte; /* protected by page->ptl */ + /* + * A PTE page table page might be freed by use of + * rcu_head: which overlays those two fields above. + */ unsigned long _pt_pad_2; /* mapping */ union { struct mm_struct *pt_mm; /* x86 pgds only */ diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 7f2db400f65342..9fa34be65159c7 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -112,6 +112,8 @@ static inline void pte_unmap(pte_t *pte) } #endif +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable); + /* Find an entry in the second-level page table.. 
*/ #ifndef pmd_offset static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index b9a0c2137cc135..fa9d4d08429124 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -13,6 +13,7 @@ #include #include #include +#include #include /* @@ -230,6 +231,25 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, return pmd; } #endif + +/* arch define pte_free_defer in asm/pgalloc.h for its own implementation */ +#ifndef pte_free_defer +static void pte_free_now(struct rcu_head *head) +{ + struct page *page; + + page = container_of(head, struct page, rcu_head); + pte_free(NULL /* mm not passed and not used */, (pgtable_t)page); +} + +void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable) +{ + struct page *page; + + page = pgtable; + call_rcu(&page->rcu_head, pte_free_now); +} +#endif /* pte_free_defer */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #if defined(CONFIG_GUP_GET_PXX_LOW_HIGH) && \ From 1d65b771bc08cd054cf6d3766a72e113dc46d62f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jul 2023 21:41:04 -0700 Subject: [PATCH 107/489] mm/khugepaged: retract_page_tables() without mmap or vma lock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simplify shmem and file THP collapse's retract_page_tables(), and relax its locking: to improve its success rate and to lessen impact on others. Instead of its MADV_COLLAPSE case doing set_huge_pmd() at target_addr of target_mm, leave that part of the work to madvise_collapse() calling collapse_pte_mapped_thp() afterwards: just adjust collapse_file()'s result code to arrange for that. That spares retract_page_tables() four arguments; and since it will be successful in retracting all of the page tables expected of it, no need to track and return a result code itself. 
It needs i_mmap_lock_read(mapping) for traversing the vma interval tree, but it does not need i_mmap_lock_write() for that: page_vma_mapped_walk() allows for pte_offset_map_lock() etc to fail, and uses pmd_lock() for THPs. retract_page_tables() just needs to use those same spinlocks to exclude it briefly, while transitioning pmd from page table to none: so restore its use of pmd_lock() inside of which pte lock is nested. Users of pte_offset_map_lock() etc all now allow for them to fail: so retract_page_tables() now has no use for mmap_write_trylock() or vma_try_start_write(). In common with rmap and page_vma_mapped_walk(), it does not even need the mmap_read_lock(). But those users do expect the page table to remain a good page table, until they unlock and rcu_read_unlock(): so the page table cannot be freed immediately, but rather by the recently added pte_free_defer(). Use the (usually a no-op) pmdp_get_lockless_sync() to send an interrupt when PAE, and pmdp_collapse_flush() did not already do so: to make sure that the start,pmdp_get_lockless(),end sequence in __pte_offset_map() cannot pick up a pmd entry with mismatched pmd_low and pmd_high. retract_page_tables() can be enhanced to replace_page_tables(), which inserts the final huge pmd without mmap lock: going through an invalid state instead of pmd_none() followed by fault. But that enhancement does raise some more questions: leave it until a later release. Link: https://lkml.kernel.org/r/f88970d9-d347-9762-ae6d-da978e8a4df@google.com Signed-off-by: Hugh Dickins Cc: Alexander Gordeev Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huang, Ying Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Kirill A. 
Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Ellerman Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Russell King Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/khugepaged.c | 172 +++++++++++++++++++----------------------------- 1 file changed, 69 insertions(+), 103 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 4e707da4a83c78..8f88fd6d781d80 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1617,9 +1617,8 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, break; case SCAN_PMD_NONE: /* - * In MADV_COLLAPSE path, possible race with khugepaged where - * all pte entries have been removed and pmd cleared. If so, - * skip all the pte checks and just update the pmd mapping. + * All pte entries have been removed and pmd cleared. + * Skip all the pte checks and just update the pmd mapping. 
*/ goto maybe_install_pmd; default: @@ -1750,123 +1749,88 @@ static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_sl mmap_write_unlock(mm); } -static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff, - struct mm_struct *target_mm, - unsigned long target_addr, struct page *hpage, - struct collapse_control *cc) +static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) { struct vm_area_struct *vma; - int target_result = SCAN_FAIL; - i_mmap_lock_write(mapping); + i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { - int result = SCAN_FAIL; - struct mm_struct *mm = NULL; - unsigned long addr = 0; - pmd_t *pmd; - bool is_target = false; + struct mmu_notifier_range range; + struct mm_struct *mm; + unsigned long addr; + pmd_t *pmd, pgt_pmd; + spinlock_t *pml; + spinlock_t *ptl; + bool skipped_uffd = false; /* * Check vma->anon_vma to exclude MAP_PRIVATE mappings that - * got written to. These VMAs are likely not worth investing - * mmap_write_lock(mm) as PMD-mapping is likely to be split - * later. - * - * Note that vma->anon_vma check is racy: it can be set up after - * the check but before we took mmap_lock by the fault path. - * But page lock would prevent establishing any new ptes of the - * page, so we are safe. - * - * An alternative would be drop the check, but check that page - * table is clear before calling pmdp_collapse_flush() under - * ptl. It has higher chance to recover THP for the VMA, but - * has higher cost too. It would also probably require locking - * the anon_vma. + * got written to. These VMAs are likely not worth removing + * page tables from, as PMD-mapping is likely to be split later. 
*/ - if (READ_ONCE(vma->anon_vma)) { - result = SCAN_PAGE_ANON; - goto next; - } + if (READ_ONCE(vma->anon_vma)) + continue; + addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); if (addr & ~HPAGE_PMD_MASK || - vma->vm_end < addr + HPAGE_PMD_SIZE) { - result = SCAN_VMA_CHECK; - goto next; - } + vma->vm_end < addr + HPAGE_PMD_SIZE) + continue; + mm = vma->vm_mm; - is_target = mm == target_mm && addr == target_addr; - result = find_pmd_or_thp_or_none(mm, addr, &pmd); - if (result != SCAN_SUCCEED) - goto next; + if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED) + continue; + + if (hpage_collapse_test_exit(mm)) + continue; /* - * We need exclusive mmap_lock to retract page table. - * - * We use trylock due to lock inversion: we need to acquire - * mmap_lock while holding page lock. Fault path does it in - * reverse order. Trylock is a way to avoid deadlock. - * - * Also, it's not MADV_COLLAPSE's job to collapse other - * mappings - let khugepaged take care of them later. + * When a vma is registered with uffd-wp, we cannot recycle + * the page table because there may be pte markers installed. + * Other vmas can still have the same file mapped hugely, but + * skip this one: it will always be mapped in small page size + * for uffd-wp registered ranges. */ - result = SCAN_PTE_MAPPED_HUGEPAGE; - if ((cc->is_khugepaged || is_target) && - mmap_write_trylock(mm)) { - /* trylock for the same lock inversion as above */ - if (!vma_try_start_write(vma)) - goto unlock_next; + if (userfaultfd_wp(vma)) + continue; - /* - * Re-check whether we have an ->anon_vma, because - * collapse_and_free_pmd() requires that either no - * ->anon_vma exists or the anon_vma is locked. - * We already checked ->anon_vma above, but that check - * is racy because ->anon_vma can be populated under the - * mmap lock in read mode. 
- */ - if (vma->anon_vma) { - result = SCAN_PAGE_ANON; - goto unlock_next; - } - /* - * When a vma is registered with uffd-wp, we can't - * recycle the pmd pgtable because there can be pte - * markers installed. Skip it only, so the rest mm/vma - * can still have the same file mapped hugely, however - * it'll always mapped in small page size for uffd-wp - * registered ranges. - */ - if (hpage_collapse_test_exit(mm)) { - result = SCAN_ANY_PROCESS; - goto unlock_next; - } - if (userfaultfd_wp(vma)) { - result = SCAN_PTE_UFFD_WP; - goto unlock_next; - } - collapse_and_free_pmd(mm, vma, addr, pmd); - if (!cc->is_khugepaged && is_target) - result = set_huge_pmd(vma, addr, pmd, hpage); - else - result = SCAN_SUCCEED; + /* PTEs were notified when unmapped; but now for the PMD? */ + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, + addr, addr + HPAGE_PMD_SIZE); + mmu_notifier_invalidate_range_start(&range); + + pml = pmd_lock(mm, pmd); + ptl = pte_lockptr(mm, pmd); + if (ptl != pml) + spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); -unlock_next: - mmap_write_unlock(mm); - goto next; - } /* - * Calling context will handle target mm/addr. Otherwise, let - * khugepaged try again later. + * Huge page lock is still held, so normally the page table + * must remain empty; and we have already skipped anon_vma + * and userfaultfd_wp() vmas. But since the mmap_lock is not + * held, it is still possible for a racing userfaultfd_ioctl() + * to have inserted ptes or markers. Now that we hold ptlock, + * repeating the anon_vma check protects from one category, + * and repeating the userfaultfd_wp() check from another. 
*/ - if (!is_target) { - khugepaged_add_pte_mapped_thp(mm, addr); - continue; + if (unlikely(vma->anon_vma || userfaultfd_wp(vma))) { + skipped_uffd = true; + } else { + pgt_pmd = pmdp_collapse_flush(vma, addr, pmd); + pmdp_get_lockless_sync(); + } + + if (ptl != pml) + spin_unlock(ptl); + spin_unlock(pml); + + mmu_notifier_invalidate_range_end(&range); + + if (!skipped_uffd) { + mm_dec_nr_ptes(mm); + page_table_check_pte_clear_range(mm, addr, pgt_pmd); + pte_free_defer(mm, pmd_pgtable(pgt_pmd)); } -next: - if (is_target) - target_result = result; } - i_mmap_unlock_write(mapping); - return target_result; + i_mmap_unlock_read(mapping); } /** @@ -2260,9 +2224,11 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, /* * Remove pte page tables, so we can re-fault the page as huge. + * If MADV_COLLAPSE, adjust result to call collapse_pte_mapped_thp(). */ - result = retract_page_tables(mapping, start, mm, addr, hpage, - cc); + retract_page_tables(mapping, start); + if (cc && !cc->is_khugepaged) + result = SCAN_PTE_MAPPED_HUGEPAGE; unlock_page(hpage); /* From 1043173eb5eb351a1dba11cca12705075fe74a9e Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jul 2023 21:42:19 -0700 Subject: [PATCH 108/489] mm/khugepaged: collapse_pte_mapped_thp() with mmap_read_lock() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bring collapse_and_free_pmd() back into collapse_pte_mapped_thp(). It does need mmap_read_lock(), but it does not need mmap_write_lock(), nor vma_start_write() nor i_mmap lock nor anon_vma lock. All racing paths are relying on pte_offset_map_lock() and pmd_lock(), so use those. Follow the pattern in retract_page_tables(); and using pte_free_defer() removes most of the need for tlb_remove_table_sync_one() here; but call pmdp_get_lockless_sync() to use it in the PAE case. First check the VMA, in case page tables are being torn down: from JannH. 
Confirm the preliminary find_pmd_or_thp_or_none() once page lock has been acquired and the page looks suitable: from then on its state is stable. However, collapse_pte_mapped_thp() was doing something others don't: freeing a page table still containing "valid" entries. i_mmap lock did stop a racing truncate from double-freeing those pages, but we prefer collapse_pte_mapped_thp() to clear the entries as usual. Their TLB flush can wait until the pmdp_collapse_flush() which follows, but the mmu_notifier_invalidate_range_start() has to be done earlier. Do the "step 1" checking loop without mmu_notifier: it wouldn't be good for khugepaged to keep on repeatedly invalidating a range which is then found unsuitable e.g. contains COWs. "step 2", which does the clearing, must then be more careful (after dropping ptl to do mmu_notifier), with abort prepared to correct the accounting like "step 3". But with those entries now cleared, "step 4" (after dropping ptl to do pmd_lock) is kept safe by the huge page lock, which stops new PTEs from being faulted in. [hughd@google.com: don't set mmap_locked = true in madvise_collapse()] Link: https://lkml.kernel.org/r/d3d9ff14-ef8-8f84-e160-bfa1f5794275@google.com [hughd@google.com: use ptep_clear() instead of pte_clear()] Link: https://lkml.kernel.org/r/e0197433-8a47-6a65-534d-eda26eeb78b0@google.com Link: https://lkml.kernel.org/r/b53be6a4-7715-51f9-aad-f1347dcb7c4@google.com Signed-off-by: Hugh Dickins Reviewed-by: Qi Zheng Cc: Alexander Gordeev Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huang, Ying Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Kirill A. 
Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Ellerman Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Ralph Campbell Cc: Russell King Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/khugepaged.c | 170 ++++++++++++++++++++++-------------------------- 1 file changed, 76 insertions(+), 94 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 8f88fd6d781d80..53d1788332cbea 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1485,7 +1485,7 @@ static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm, return ret; } -/* hpage must be locked, and mmap_lock must be held in write */ +/* hpage must be locked, and mmap_lock must be held */ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, struct page *hpage) { @@ -1497,7 +1497,7 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, }; VM_BUG_ON(!PageTransHuge(hpage)); - mmap_assert_write_locked(vma->vm_mm); + mmap_assert_locked(vma->vm_mm); if (do_set_pmd(&vmf, hpage)) return SCAN_FAIL; @@ -1506,48 +1506,6 @@ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, return SCAN_SUCCEED; } -/* - * A note about locking: - * Trying to take the page table spinlocks would be useless here because those - * are only used to synchronize: - * - * - modifying terminal entries (ones that point to a data page, not to another - * page table) - * - installing *new* non-terminal entries - * - * Instead, we need roughly the same kind of protection as free_pgtables() or - * mm_take_all_locks() (but only for a single VMA): - * The mmap lock together with this VMA's rmap locks covers all paths towards - * the page 
table entries we're messing with here, except for hardware page - * table walks and lockless_pages_from_mm(). - */ -static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmdp) -{ - pmd_t pmd; - struct mmu_notifier_range range; - - mmap_assert_write_locked(mm); - if (vma->vm_file) - lockdep_assert_held_write(&vma->vm_file->f_mapping->i_mmap_rwsem); - /* - * All anon_vmas attached to the VMA have the same root and are - * therefore locked by the same lock. - */ - if (vma->anon_vma) - lockdep_assert_held_write(&vma->anon_vma->root->rwsem); - - mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, addr, - addr + HPAGE_PMD_SIZE); - mmu_notifier_invalidate_range_start(&range); - pmd = pmdp_collapse_flush(vma, addr, pmdp); - tlb_remove_table_sync_one(); - mmu_notifier_invalidate_range_end(&range); - mm_dec_nr_ptes(mm); - page_table_check_pte_clear_range(mm, addr, pmd); - pte_free(mm, pmd_pgtable(pmd)); -} - /** * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at * address haddr. 
@@ -1563,26 +1521,29 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, bool install_pmd) { + struct mmu_notifier_range range; + bool notified = false; unsigned long haddr = addr & HPAGE_PMD_MASK; struct vm_area_struct *vma = vma_lookup(mm, haddr); struct page *hpage; pte_t *start_pte, *pte; - pmd_t *pmd; - spinlock_t *ptl; - int count = 0, result = SCAN_FAIL; + pmd_t *pmd, pgt_pmd; + spinlock_t *pml, *ptl; + int nr_ptes = 0, result = SCAN_FAIL; int i; - mmap_assert_write_locked(mm); + mmap_assert_locked(mm); + + /* First check VMA found, in case page tables are being torn down */ + if (!vma || !vma->vm_file || + !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE)) + return SCAN_VMA_CHECK; /* Fast check before locking page if already PMD-mapped */ result = find_pmd_or_thp_or_none(mm, haddr, &pmd); if (result == SCAN_PMD_MAPPED) return result; - if (!vma || !vma->vm_file || - !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE)) - return SCAN_VMA_CHECK; - /* * If we are here, we've succeeded in replacing all the native pages * in the page cache with a single hugepage. If a mm were to fault-in @@ -1612,6 +1573,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, goto drop_hpage; } + result = find_pmd_or_thp_or_none(mm, haddr, &pmd); switch (result) { case SCAN_SUCCEED: break; @@ -1625,27 +1587,10 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, goto drop_hpage; } - /* Lock the vma before taking i_mmap and page table locks */ - vma_start_write(vma); - - /* - * We need to lock the mapping so that from here on, only GUP-fast and - * hardware page walks can access the parts of the page tables that - * we're operating on. - * See collapse_and_free_pmd(). 
- */ - i_mmap_lock_write(vma->vm_file->f_mapping); - - /* - * This spinlock should be unnecessary: Nobody else should be accessing - * the page tables under spinlock protection here, only - * lockless_pages_from_mm() and the hardware page walker can access page - * tables while all the high-level locks are held in write mode. - */ result = SCAN_FAIL; start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); - if (!start_pte) - goto drop_immap; + if (!start_pte) /* mmap_lock + page lock should prevent this */ + goto drop_hpage; /* step 1: check all mapped PTEs are to the right huge page */ for (i = 0, addr = haddr, pte = start_pte; @@ -1672,10 +1617,18 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, */ if (hpage + i != page) goto abort; - count++; } - /* step 2: adjust rmap */ + pte_unmap_unlock(start_pte, ptl); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, + haddr, haddr + HPAGE_PMD_SIZE); + mmu_notifier_invalidate_range_start(&range); + notified = true; + start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); + if (!start_pte) /* mmap_lock + page lock should prevent this */ + goto abort; + + /* step 2: clear page table and adjust rmap */ for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { struct page *page; @@ -1683,47 +1636,76 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, if (pte_none(ptent)) continue; + /* + * We dropped ptl after the first scan, to do the mmu_notifier: + * page lock stops more PTEs of the hpage being faulted in, but + * does not stop write faults COWing anon copies from existing + * PTEs; and does not stop those being swapped out or migrated. + */ + if (!pte_present(ptent)) { + result = SCAN_PTE_NON_PRESENT; + goto abort; + } page = vm_normal_page(vma, addr, ptent); - if (WARN_ON_ONCE(page && is_zone_device_page(page))) + if (hpage + i != page) goto abort; + + /* + * Must clear entry, or a racing truncate may re-remove it. 
+ * TLB flush can be left until pmdp_collapse_flush() does it. + * PTE dirty? Shmem page is already dirty; file is read-only. + */ + ptep_clear(mm, addr, pte); page_remove_rmap(page, vma, false); + nr_ptes++; } pte_unmap_unlock(start_pte, ptl); /* step 3: set proper refcount and mm_counters. */ - if (count) { - page_ref_sub(hpage, count); - add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count); + if (nr_ptes) { + page_ref_sub(hpage, nr_ptes); + add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes); } - /* step 4: remove pte entries */ - /* we make no change to anon, but protect concurrent anon page lookup */ - if (vma->anon_vma) - anon_vma_lock_write(vma->anon_vma); + /* step 4: remove page table */ - collapse_and_free_pmd(mm, vma, haddr, pmd); + /* Huge page lock is still held, so page table must remain empty */ + pml = pmd_lock(mm, pmd); + if (ptl != pml) + spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); + pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd); + pmdp_get_lockless_sync(); + if (ptl != pml) + spin_unlock(ptl); + spin_unlock(pml); - if (vma->anon_vma) - anon_vma_unlock_write(vma->anon_vma); - i_mmap_unlock_write(vma->vm_file->f_mapping); + mmu_notifier_invalidate_range_end(&range); + + mm_dec_nr_ptes(mm); + page_table_check_pte_clear_range(mm, haddr, pgt_pmd); + pte_free_defer(mm, pmd_pgtable(pgt_pmd)); maybe_install_pmd: /* step 5: install pmd entry */ result = install_pmd ? 
set_huge_pmd(vma, haddr, pmd, hpage) : SCAN_SUCCEED; - + goto drop_hpage; +abort: + if (nr_ptes) { + flush_tlb_mm(mm); + page_ref_sub(hpage, nr_ptes); + add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes); + } + if (start_pte) + pte_unmap_unlock(start_pte, ptl); + if (notified) + mmu_notifier_invalidate_range_end(&range); drop_hpage: unlock_page(hpage); put_page(hpage); return result; - -abort: - pte_unmap_unlock(start_pte, ptl); -drop_immap: - i_mmap_unlock_write(vma->vm_file->f_mapping); - goto drop_hpage; } static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot) @@ -2856,9 +2838,9 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, case SCAN_PTE_MAPPED_HUGEPAGE: BUG_ON(mmap_locked); BUG_ON(*prev); - mmap_write_lock(mm); + mmap_read_lock(mm); result = collapse_pte_mapped_thp(mm, addr, true); - mmap_write_unlock(mm); + mmap_read_unlock(mm); goto handle_result; /* Whitelisted set of results where continuing OK */ case SCAN_PMD_NULL: From d50791c2bee9ed97b1dd81db9bbb11caddcdfb0d Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jul 2023 21:43:36 -0700 Subject: [PATCH 109/489] mm/khugepaged: delete khugepaged_collapse_pte_mapped_thps() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that retract_page_tables() can retract page tables reliably, without depending on trylocks, delete all the apparatus for khugepaged to try again later: khugepaged_collapse_pte_mapped_thps() etc; and free up the per-mm memory which was set aside for that in the khugepaged_mm_slot. But one part of that is worth keeping: when hpage_collapse_scan_file() found SCAN_PTE_MAPPED_HUGEPAGE, that address was noted in the mm_slot to be tried for retraction later - catching, for example, page tables where a reversible mprotect() of a portion had required splitting the pmd, but now it can be recollapsed. Call collapse_pte_mapped_thp() directly in this case (why was it deferred before? 
I assume an issue with needing mmap_lock for write, but now it's only needed for read). [hughd@google.com: fix mmap_locked handling] Link: https://lkml.kernel.org/r/bfc6cab2-497f-32bf-dd5-98dc1987e4a9@google.com Link: https://lkml.kernel.org/r/a5dce57-6dfa-5559-4698-e817eb2f993@google.com Signed-off-by: Hugh Dickins Cc: Alexander Gordeev Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huang, Ying Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Ellerman Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Russell King Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/khugepaged.c | 125 ++++++------------------------------------------ 1 file changed, 15 insertions(+), 110 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 53d1788332cbea..9a6e0d50775939 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -93,8 +93,6 @@ static DEFINE_READ_MOSTLY_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); static struct kmem_cache *mm_slot_cache __read_mostly; -#define MAX_PTE_MAPPED_THP 8 - struct collapse_control { bool is_khugepaged; @@ -108,15 +106,9 @@ struct collapse_control { /** * struct khugepaged_mm_slot - khugepaged information per mm that is being scanned * @slot: hash lookup from mm to mm_slot - * @nr_pte_mapped_thp: number of pte mapped THP - * @pte_mapped_thp: address array corresponding pte mapped THP */ struct
khugepaged_mm_slot { struct mm_slot slot; - - /* pte-mapped THP in this mm */ - int nr_pte_mapped_thp; - unsigned long pte_mapped_thp[MAX_PTE_MAPPED_THP]; }; /** @@ -1441,50 +1433,6 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) } #ifdef CONFIG_SHMEM -/* - * Notify khugepaged that given addr of the mm is pte-mapped THP. Then - * khugepaged should try to collapse the page table. - * - * Note that following race exists: - * (1) khugepaged calls khugepaged_collapse_pte_mapped_thps() for mm_struct A, - * emptying the A's ->pte_mapped_thp[] array. - * (2) MADV_COLLAPSE collapses some file extent with target mm_struct B, and - * retract_page_tables() finds a VMA in mm_struct A mapping the same extent - * (at virtual address X) and adds an entry (for X) into mm_struct A's - * ->pte-mapped_thp[] array. - * (3) khugepaged calls khugepaged_collapse_scan_file() for mm_struct A at X, - * sees a pte-mapped THP (SCAN_PTE_MAPPED_HUGEPAGE) and adds an entry - * (for X) into mm_struct A's ->pte-mapped_thp[] array. - * Thus, it's possible the same address is added multiple times for the same - * mm_struct. Should this happen, we'll simply attempt - * collapse_pte_mapped_thp() multiple times for the same address, under the same - * exclusive mmap_lock, and assuming the first call is successful, subsequent - * attempts will return quickly (without grabbing any additional locks) when - * a huge pmd is found in find_pmd_or_thp_or_none(). Since this is a cheap - * check, and since this is a rare occurrence, the cost of preventing this - * "multiple-add" is thought to be more expensive than just handling it, should - * it occur. 
- */ -static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm, - unsigned long addr) -{ - struct khugepaged_mm_slot *mm_slot; - struct mm_slot *slot; - bool ret = false; - - VM_BUG_ON(addr & ~HPAGE_PMD_MASK); - - spin_lock(&khugepaged_mm_lock); - slot = mm_slot_lookup(mm_slots_hash, mm); - mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot); - if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)) { - mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr; - ret = true; - } - spin_unlock(&khugepaged_mm_lock); - return ret; -} - /* hpage must be locked, and mmap_lock must be held */ static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, struct page *hpage) @@ -1708,29 +1656,6 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, return result; } -static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot) -{ - struct mm_slot *slot = &mm_slot->slot; - struct mm_struct *mm = slot->mm; - int i; - - if (likely(mm_slot->nr_pte_mapped_thp == 0)) - return; - - if (!mmap_write_trylock(mm)) - return; - - if (unlikely(hpage_collapse_test_exit(mm))) - goto out; - - for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++) - collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i], false); - -out: - mm_slot->nr_pte_mapped_thp = 0; - mmap_write_unlock(mm); -} - static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) { struct vm_area_struct *vma; @@ -2371,16 +2296,6 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, { BUILD_BUG(); } - -static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot) -{ -} - -static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm, - unsigned long addr) -{ - return false; -} #endif static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, @@ -2410,7 +2325,6 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, 
khugepaged_scan.mm_slot = mm_slot; } spin_unlock(&khugepaged_mm_lock); - khugepaged_collapse_pte_mapped_thps(mm_slot); mm = slot->mm; /* @@ -2463,36 +2377,27 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, khugepaged_scan.address); mmap_read_unlock(mm); - *result = hpage_collapse_scan_file(mm, - khugepaged_scan.address, - file, pgoff, cc); mmap_locked = false; + *result = hpage_collapse_scan_file(mm, + khugepaged_scan.address, file, pgoff, cc); fput(file); + if (*result == SCAN_PTE_MAPPED_HUGEPAGE) { + mmap_read_lock(mm); + if (hpage_collapse_test_exit(mm)) + goto breakouterloop; + *result = collapse_pte_mapped_thp(mm, + khugepaged_scan.address, false); + if (*result == SCAN_PMD_MAPPED) + *result = SCAN_SUCCEED; + mmap_read_unlock(mm); + } } else { *result = hpage_collapse_scan_pmd(mm, vma, - khugepaged_scan.address, - &mmap_locked, - cc); + khugepaged_scan.address, &mmap_locked, cc); } - switch (*result) { - case SCAN_PTE_MAPPED_HUGEPAGE: { - pmd_t *pmd; - - *result = find_pmd_or_thp_or_none(mm, - khugepaged_scan.address, - &pmd); - if (*result != SCAN_SUCCEED) - break; - if (!khugepaged_add_pte_mapped_thp(mm, - khugepaged_scan.address)) - break; - } fallthrough; - case SCAN_SUCCEED: + + if (*result == SCAN_SUCCEED) ++khugepaged_pages_collapsed; - break; - default: - break; - } /* move to next address */ khugepaged_scan.address += HPAGE_PMD_SIZE; From cf95e337cb63cfbf5c9ea1a1f64f9818b979e3b3 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jul 2023 21:48:48 -0700 Subject: [PATCH 110/489] mm: delete mmap_write_trylock() and vma_try_start_write() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mmap_write_trylock() and vma_try_start_write() were added just for khugepaged, but now it has no use for them: delete. 
Link: https://lkml.kernel.org/r/4e6db3d-e8e-73fb-1f2a-8de2dab2a87c@google.com Signed-off-by: Hugh Dickins Cc: Alexander Gordeev Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huang, Ying Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Ellerman Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Russell King Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/mm.h | 17 ----------------- include/linux/mmap_lock.h | 10 ---------- 2 files changed, 27 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index d1ad22980ebe7a..bfb46483108cdd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -709,21 +709,6 @@ static inline void vma_start_write(struct vm_area_struct *vma) up_write(&vma->vm_lock->lock); } -static inline bool vma_try_start_write(struct vm_area_struct *vma) -{ - int mm_lock_seq; - - if (__is_vma_write_locked(vma, &mm_lock_seq)) - return true; - - if (!down_write_trylock(&vma->vm_lock->lock)) - return false; - - WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); - up_write(&vma->vm_lock->lock); - return true; -} - static inline void vma_assert_write_locked(struct vm_area_struct *vma) { int mm_lock_seq; @@ -748,8 +733,6 @@ static inline bool vma_start_read(struct vm_area_struct *vma) { return false; } static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct 
vm_area_struct *vma) {} -static inline bool vma_try_start_write(struct vm_area_struct *vma) - { return true; } static inline void vma_assert_write_locked(struct vm_area_struct *vma) {} static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) {} diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index e05e167dbd166e..a5c63b6d7d46c1 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -118,16 +118,6 @@ static inline int mmap_write_lock_killable(struct mm_struct *mm) return ret; } -static inline bool mmap_write_trylock(struct mm_struct *mm) -{ - bool ret; - - __mmap_lock_trace_start_locking(mm, true); - ret = down_write_trylock(&mm->mmap_lock) != 0; - __mmap_lock_trace_acquire_returned(mm, true, ret); - return ret; -} - static inline void mmap_write_unlock(struct mm_struct *mm) { __mmap_lock_trace_released(mm, true); From 610d06576737f5401647a4aab46558c1114898fb Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 11 Jul 2023 21:46:23 -0700 Subject: [PATCH 111/489] mm/pgtable: notes on pte_offset_map[_lock]() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a block of comments on pte_offset_map_lock(), pte_offset_map() and pte_offset_map_nolock() to mm/pgtable-generic.c, to help explain them. Link: https://lkml.kernel.org/r/b791c3b0-25c6-a263-d785-d564344eb644@google.com Signed-off-by: Hugh Dickins Cc: Alexander Gordeev Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Axel Rasmussen Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Claudio Imbrenda Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Huang, Ying Cc: Ira Weiny Cc: Jann Horn Cc: Jason Gunthorpe Cc: Kirill A. 
Shutemov Cc: Lorenzo Stoakes Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Miaohe Lin Cc: Michael Ellerman Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Minchan Kim Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Peter Xu Cc: Peter Zijlstra Cc: Qi Zheng Cc: Ralph Campbell Cc: Russell King Cc: SeongJae Park Cc: Song Liu Cc: Steven Price Cc: Suren Baghdasaryan Cc: Thomas Hellström Cc: Vasily Gorbik Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Yu Zhao Cc: Zack Rusin Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/pgtable-generic.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index fa9d4d08429124..4fcd959dcc4d02 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -315,6 +315,50 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd, return pte; } +/* + * pte_offset_map_lock(mm, pmd, addr, ptlp), and its internal implementation + * __pte_offset_map_lock() below, is usually called with the pmd pointer for + * addr, reached by walking down the mm's pgd, p4d, pud for addr: either while + * holding mmap_lock or vma lock for read or for write; or in truncate or rmap + * context, while holding file's i_mmap_lock or anon_vma lock for read (or for + * write). In a few cases, it may be used with pmd pointing to a pmd_t already + * copied to or constructed on the stack. + * + * When successful, it returns the pte pointer for addr, with its page table + * kmapped if necessary (when CONFIG_HIGHPTE), and locked against concurrent + * modification by software, with a pointer to that spinlock in ptlp (in some + * configs mm->page_table_lock, in SPLIT_PTLOCK configs a spinlock in table's + * struct page). pte_unmap_unlock(pte, ptl) to unlock and unmap afterwards. 
+ * + * But it is unsuccessful, returning NULL with *ptlp unchanged, if there is no + * page table at *pmd: if, for example, the page table has just been removed, + * or replaced by the huge pmd of a THP. (When successful, *pmd is rechecked + * after acquiring the ptlock, and retried internally if it changed: so that a + * page table can be safely removed or replaced by THP while holding its lock.) + * + * pte_offset_map(pmd, addr), and its internal helper __pte_offset_map() above, + * just returns the pte pointer for addr, its page table kmapped if necessary; + * or NULL if there is no page table at *pmd. It does not attempt to lock the + * page table, so cannot normally be used when the page table is to be updated, + * or when entries read must be stable. But it does take rcu_read_lock(): so + * that even when page table is racily removed, it remains a valid though empty + * and disconnected table. Until pte_unmap(pte) unmaps and rcu_read_unlock()s + * afterwards. + * + * pte_offset_map_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map(); + * but when successful, it also outputs a pointer to the spinlock in ptlp - as + * pte_offset_map_lock() does, but in this case without locking it. This helps + * the caller to avoid a later pte_lockptr(mm, *pmd), which might by that time + * act on a changed *pmd: pte_offset_map_nolock() provides the correct spinlock + * pointer for the page table that it returns. In principle, the caller should + * recheck *pmd once the lock is taken; in practice, no callsite needs that - + * either the mmap_lock for write, or pte_same() check on contents, is enough. + * + * Note that free_pgtables(), used after unmapping detached vmas, or when + * exiting the whole mm, does not take page table lock before freeing a page + * table, and may not use RCU at all: "outsiders" like khugepaged should avoid + * pte_offset_map() and co once the vma is detached from mm or mm_users is zero. 
+ */ pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp) { From 73e791d73877e90444afb6036d86ea37fd78584f Mon Sep 17 00:00:00 2001 From: Xueshi Hu Date: Wed, 12 Jul 2023 21:49:59 +0800 Subject: [PATCH 112/489] mm: remove clear_page_idle() All callers have now been converted to call folio_clear_idle(). Link: https://lkml.kernel.org/r/20230712134959.145373-1-xueshi.hu@smartx.com Signed-off-by: Xueshi Hu Reviewed-by: David Hildenbrand Cc: Charan Teja Kalla Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/page_idle.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index 5cb7bd2078ecf8..d8f34484064302 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -144,9 +144,4 @@ static inline void set_page_idle(struct page *page) { folio_set_idle(page_folio(page)); } - -static inline void clear_page_idle(struct page *page) -{ - folio_clear_idle(page_folio(page)); -} #endif /* _LINUX_MM_PAGE_IDLE_H */ From 6852c46c783d20a4c0153d14d2990040e5e6e47e Mon Sep 17 00:00:00 2001 From: Yu Ma Date: Wed, 12 Jul 2023 10:57:39 -0400 Subject: [PATCH 113/489] mm/mmap: move vma operations to mm_struct out of the critical section of file mapping lock UnixBench/Execl represents a class of workload where bash scripts are spawned frequently to do some short jobs. When running multiple parallel tasks, hot osq_lock is observed from do_mmap and exit_mmap. Both of them come from load_elf_binary through the call chain "execl->do_execveat_common->bprm_execve->load_elf_binary". In do_mmap,it will call mmap_region to create vma node, initialize it and insert it to vma maintain structure in mm_struct and i_mmap tree of the mapping file, then increase map_count to record the number of vma nodes used. The hot osq_lock is to protect operations on file's i_mmap tree. For the mm_struct member change like vma insertion and map_count update, they do not affect i_mmap tree. 
Move those operations out of the lock's critical section, to reduce hold time on the lock. With this change, on Intel Sapphire Rapids 112C/224T platform, based on v6.0-rc6, the 160 parallel score improves by 12%. The patch has no obvious performance gain on v6.5-rc1 due to regression of this benchmark from this commit f1a7941243c102a44e8847e3b94ff4ff3ec56f25 (mm: convert mm's rss stats into percpu_counter). Related discussion and conclusion can be referred at the mail thread initiated by 0day as below: Link: https://lore.kernel.org/linux-mm/a4aa2e13-7187-600b-c628-7e8fb108def0@intel.com/ Link: https://lkml.kernel.org/r/20230712145739.604215-1-yu.ma@intel.com Signed-off-by: Yu Ma Reviewed-by: Tim Chen Cc: Dan Williams Cc: Dave Hansen Cc: Kirill A . Shutemov Cc: Liam R. Howlett Cc: Shakeel Butt Cc: Zhu, Lipeng Signed-off-by: Andrew Morton --- mm/mmap.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 3937479d0e071c..e1586b2f938e5e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -412,14 +412,11 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) if (vma_iter_prealloc(&vmi)) return -ENOMEM; + vma_iter_store(&vmi, vma); + if (vma->vm_file) { mapping = vma->vm_file->f_mapping; i_mmap_lock_write(mapping); - } - - vma_iter_store(&vmi, vma); - - if (mapping) { __vma_link_file(vma, mapping); i_mmap_unlock_write(mapping); } @@ -2812,12 +2809,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr, /* Lock the VMA since it is modified after insertion into VMA tree */ vma_start_write(vma); - if (vma->vm_file) - i_mmap_lock_write(vma->vm_file->f_mapping); - vma_iter_store(&vmi, vma); mm->map_count++; if (vma->vm_file) { + i_mmap_lock_write(vma->vm_file->f_mapping); if (vma->vm_flags & VM_SHARED) mapping_allow_writable(vma->vm_file->f_mapping); From 9e130c4b000b0a3f0bf4b4c8e714bfe3d06ff4cc Mon Sep 17 00:00:00 2001 From: Jiaqi Yan Date: Thu, 13 Jul 2023 00:18:30 +0000 Subject: [PATCH 114/489] 
mm/hwpoison: delete all entries before traversal in __folio_free_raw_hwp Patch series "Improve hugetlbfs read on HWPOISON hugepages", v4. Today when hardware memory is corrupted in a hugetlb hugepage, kernel leaves the hugepage in pagecache [1]; otherwise future mmap or read will be subject to silent data corruption. This is implemented by returning -EIO from hugetlbfs_read_iter immediately if the hugepage has HWPOISON flag set. Since memory_failure already tracks the raw HWPOISON subpages in a hugepage, a natural improvement is possible: if userspace only asks for healthy subpages in the pagecache, kernel can return these data. This patchset implements this improvement. It consists of three parts. The 1st commit exports the functionality to tell if a subpage inside a hugetlb hugepage is a raw HWPOISON page. The 2nd commit teaches hugetlbfs_read_iter to return as many healthy bytes as possible. The 3rd commit properly tests this new feature. [1] commit 8625147cafaa ("hugetlbfs: don't delete error page from pagecache") This patch (of 4): Traversal on llist (e.g. llist_for_each_safe) is only safe AFTER entries are deleted from the llist. Correct the way __folio_free_raw_hwp deletes and frees raw_hwp_page entries in raw_hwp_list: first llist_del_all, then kfree within llist_for_each_safe. As of today, concurrent adding, deleting, and traversal on raw_hwp_list from hugetlb.c and/or memory-failure.c are fine with each other. Note this is guaranteed partly by the lock-free nature of llist, and partly by holding hugetlb_lock and/or mf_mutex. For example, as llist_del_all is lock-free with itself, folio_clear_hugetlb_hwpoison()s from __update_and_free_hugetlb_folio and memory_failure won't need explicit locking when freeing the raw_hwp_list. New code that manipulates raw_hwp_list must be careful to ensure the concurrency correctness.
Link: https://lkml.kernel.org/r/20230713001833.3778937-1-jiaqiyan@google.com Link: https://lkml.kernel.org/r/20230713001833.3778937-2-jiaqiyan@google.com Signed-off-by: Jiaqi Yan Acked-by: Mike Kravetz Acked-by: Naoya Horiguchi Cc: James Houghton Cc: Miaohe Lin Cc: Muchun Song Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 70f44180ef8040..9422a5770db60a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1830,12 +1830,11 @@ static inline struct llist_head *raw_hwp_list_head(struct folio *folio) static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag) { - struct llist_head *head; - struct llist_node *t, *tnode; + struct llist_node *t, *tnode, *head; unsigned long count = 0; - head = raw_hwp_list_head(folio); - llist_for_each_safe(tnode, t, head->first) { + head = llist_del_all(raw_hwp_list_head(folio)); + llist_for_each_safe(tnode, t, head) { struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node); if (move_flag) @@ -1845,7 +1844,6 @@ static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag) kfree(p); count++; } - llist_del_all(head); return count; } From b79f8eb408d0468df0d6082ed958b67d94adce65 Mon Sep 17 00:00:00 2001 From: Jiaqi Yan Date: Thu, 13 Jul 2023 00:18:31 +0000 Subject: [PATCH 115/489] mm/hwpoison: check if a raw page in a hugetlb folio is raw HWPOISON Add the functionality, is_raw_hwpoison_page_in_hugepage, to tell if a raw page in a hugetlb folio is HWPOISON. This functionality relies on RawHwpUnreliable to be not set; otherwise hugepage's raw HWPOISON list becomes meaningless. is_raw_hwpoison_page_in_hugepage holds mf_mutex in order to synchronize with folio_set_hugetlb_hwpoison and folio_free_raw_hwp who iterate, insert, or delete entry in raw_hwp_list. 
llist itself doesn't ensure insertion and removal are synchronized with the llist_for_each_entry used by is_raw_hwpoison_page_in_hugepage (unless iterated entries are already deleted from the list). Caller can minimize the overhead of lock cycles by first checking HWPOISON flag of the folio. Exports this functionality to be immediately used in the read operation for hugetlbfs. Link: https://lkml.kernel.org/r/20230713001833.3778937-3-jiaqiyan@google.com Signed-off-by: Jiaqi Yan Reviewed-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Reviewed-by: Miaohe Lin Cc: James Houghton Cc: Muchun Song Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 5 +++++ mm/memory-failure.c | 40 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 9bc3c2d71b71b7..9f4bac3df59e4b 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -997,6 +997,11 @@ void hugetlb_register_node(struct node *node); void hugetlb_unregister_node(struct node *node); #endif +/* + * Check if a given raw @page in a hugepage is HWPOISON. + */ +bool is_raw_hwpoison_page_in_hugepage(struct page *page); + #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9422a5770db60a..adb0dacbc74e5c 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -72,6 +72,8 @@ atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0); static bool hw_memory_failure __read_mostly = false; +static DEFINE_MUTEX(mf_mutex); + void num_poisoned_pages_inc(unsigned long pfn) { atomic_long_inc(&num_poisoned_pages); @@ -1814,6 +1816,7 @@ EXPORT_SYMBOL_GPL(mf_dax_kill_procs); #endif /* CONFIG_FS_DAX */ #ifdef CONFIG_HUGETLB_PAGE + /* * Struct raw_hwp_page represents information about "raw error page", * constructing singly linked list from ->_hugetlb_hwpoison field of folio.
@@ -1828,6 +1831,41 @@ static inline struct llist_head *raw_hwp_list_head(struct folio *folio) return (struct llist_head *)&folio->_hugetlb_hwpoison; } +bool is_raw_hwpoison_page_in_hugepage(struct page *page) +{ + struct llist_head *raw_hwp_head; + struct raw_hwp_page *p; + struct folio *folio = page_folio(page); + bool ret = false; + + if (!folio_test_hwpoison(folio)) + return false; + + if (!folio_test_hugetlb(folio)) + return PageHWPoison(page); + + /* + * When RawHwpUnreliable is set, kernel lost track of which subpages + * are HWPOISON. So return as if ALL subpages are HWPOISONed. + */ + if (folio_test_hugetlb_raw_hwp_unreliable(folio)) + return true; + + mutex_lock(&mf_mutex); + + raw_hwp_head = raw_hwp_list_head(folio); + llist_for_each_entry(p, raw_hwp_head->first, node) { + if (page == p->page) { + ret = true; + break; + } + } + + mutex_unlock(&mf_mutex); + + return ret; +} + static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag) { struct llist_node *t, *tnode, *head; @@ -2110,8 +2148,6 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, return rc; } -static DEFINE_MUTEX(mf_mutex); - /** * memory_failure - Handle memory failure of a page. * @pfn: Page Number of the corrupted page From 38c1ddbde6c6593e7c4bc17bde87232b7c577e7b Mon Sep 17 00:00:00 2001 From: Jiaqi Yan Date: Thu, 13 Jul 2023 00:18:32 +0000 Subject: [PATCH 116/489] hugetlbfs: improve read HWPOISON hugepage When a hugepage contains HWPOISON pages, read() fails to read any byte of the hugepage and returns -EIO, although many bytes in the HWPOISON hugepage are readable. Improve this by allowing hugetlbfs_read_iter returns as many bytes as possible. For a requested range [offset, offset + len) that contains HWPOISON page, return [offset, first HWPOISON page addr); the next read attempt will fail and return -EIO. 
Link: https://lkml.kernel.org/r/20230713001833.3778937-4-jiaqiyan@google.com Signed-off-by: Jiaqi Yan Reviewed-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Cc: James Houghton Cc: Miaohe Lin Cc: Muchun Song Cc: Yang Shi Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 57 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 51 insertions(+), 6 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 7b17ccfa039d81..e7611ae1e61205 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -282,6 +282,41 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, } #endif +/* + * Someone wants to read @bytes from a HWPOISON hugetlb @page from @offset. + * Returns the maximum number of bytes one can read without touching the 1st raw + * HWPOISON subpage. + * + * The implementation borrows the iteration logic from copy_page_to_iter*. + */ +static size_t adjust_range_hwpoison(struct page *page, size_t offset, size_t bytes) +{ + size_t n = 0; + size_t res = 0; + + /* First subpage to start the loop. */ + page += offset / PAGE_SIZE; + offset %= PAGE_SIZE; + while (1) { + if (is_raw_hwpoison_page_in_hugepage(page)) + break; + + /* Safe to read n bytes without touching HWPOISON subpage. */ + n = min(bytes, (size_t)PAGE_SIZE - offset); + res += n; + bytes -= n; + if (!bytes || !n) + break; + offset += n; + if (offset == PAGE_SIZE) { + page++; + offset = 0; + } + } + + return res; +} + /* * Support for read() - Find the page attached to f_mapping and copy out the * data. This provides functionality similar to filemap_read(). 
@@ -300,7 +335,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) while (iov_iter_count(to)) { struct page *page; - size_t nr, copied; + size_t nr, copied, want; /* nr is the maximum number of bytes to copy from this page */ nr = huge_page_size(h); @@ -328,16 +363,26 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) } else { unlock_page(page); - if (PageHWPoison(page)) { - put_page(page); - retval = -EIO; - break; + if (!PageHWPoison(page)) + want = nr; + else { + /* + * Adjust how many bytes safe to read without + * touching the 1st raw HWPOISON subpage after + * offset. + */ + want = adjust_range_hwpoison(page, offset, nr); + if (want == 0) { + put_page(page); + retval = -EIO; + break; + } } /* * We have the page, copy it to user space buffer. */ - copied = copy_page_to_iter(page, offset, nr, to); + copied = copy_page_to_iter(page, offset, want, to); put_page(page); } offset += copied; From ba91e7e5d15a22946e6531c898e197e128bb6634 Mon Sep 17 00:00:00 2001 From: Jiaqi Yan Date: Thu, 13 Jul 2023 00:18:33 +0000 Subject: [PATCH 117/489] selftests/mm: add tests for HWPOISON hugetlbfs read Add tests for the improvement made to read operation on HWPOISON hugetlb page with different read granularities. For each chunk size, three read scenarios are tested: 1. Simple regression test on read without HWPOISON. 2. Sequential read page by page should succeed until encounters the 1st raw HWPOISON subpage. 3. After skip a raw HWPOISON subpage by lseek, read()s always succeed. 
Link: https://lkml.kernel.org/r/20230713001833.3778937-5-jiaqiyan@google.com Signed-off-by: Jiaqi Yan Acked-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Cc: James Houghton Cc: Miaohe Lin Cc: Muchun Song Cc: Yang Shi Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/.gitignore | 1 + tools/testing/selftests/mm/Makefile | 1 + .../selftests/mm/hugetlb-read-hwpoison.c | 322 ++++++++++++++++++ 3 files changed, 324 insertions(+) create mode 100644 tools/testing/selftests/mm/hugetlb-read-hwpoison.c diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index 7e2a982383c00e..cdc9ce4426b95e 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -5,6 +5,7 @@ hugepage-mremap hugepage-shm hugepage-vmemmap hugetlb-madvise +hugetlb-read-hwpoison khugepaged map_hugetlb map_populate diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 66d7c07dc1773b..b7fce9073279ef 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -41,6 +41,7 @@ TEST_GEN_PROGS += gup_longterm TEST_GEN_PROGS += gup_test TEST_GEN_PROGS += hmm-tests TEST_GEN_PROGS += hugetlb-madvise +TEST_GEN_PROGS += hugetlb-read-hwpoison TEST_GEN_PROGS += hugepage-mmap TEST_GEN_PROGS += hugepage-mremap TEST_GEN_PROGS += hugepage-shm diff --git a/tools/testing/selftests/mm/hugetlb-read-hwpoison.c b/tools/testing/selftests/mm/hugetlb-read-hwpoison.c new file mode 100644 index 00000000000000..ba6cc6f9cabcdf --- /dev/null +++ b/tools/testing/selftests/mm/hugetlb-read-hwpoison.c @@ -0,0 +1,322 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "../kselftest.h" + +#define PREFIX " ... " +#define ERROR_PREFIX " !!! " + +#define MAX_WRITE_READ_CHUNK_SIZE (getpagesize() * 16) +#define MAX(a, b) (((a) > (b)) ? 
(a) : (b)) + +enum test_status { + TEST_PASSED = 0, + TEST_FAILED = 1, + TEST_SKIPPED = 2, +}; + +static char *status_to_str(enum test_status status) +{ + switch (status) { + case TEST_PASSED: + return "TEST_PASSED"; + case TEST_FAILED: + return "TEST_FAILED"; + case TEST_SKIPPED: + return "TEST_SKIPPED"; + default: + return "TEST_???"; + } +} + +static int setup_filemap(char *filemap, size_t len, size_t wr_chunk_size) +{ + char iter = 0; + + for (size_t offset = 0; offset < len; + offset += wr_chunk_size) { + iter++; + memset(filemap + offset, iter, wr_chunk_size); + } + + return 0; +} + +static bool verify_chunk(char *buf, size_t len, char val) +{ + size_t i; + + for (i = 0; i < len; ++i) { + if (buf[i] != val) { + printf(PREFIX ERROR_PREFIX "check fail: buf[%lu] = %u != %u\n", + i, buf[i], val); + return false; + } + } + + return true; +} + +static bool seek_read_hugepage_filemap(int fd, size_t len, size_t wr_chunk_size, + off_t offset, size_t expected) +{ + char buf[MAX_WRITE_READ_CHUNK_SIZE]; + ssize_t ret_count = 0; + ssize_t total_ret_count = 0; + char val = offset / wr_chunk_size + offset % wr_chunk_size; + + printf(PREFIX PREFIX "init val=%u with offset=0x%lx\n", val, offset); + printf(PREFIX PREFIX "expect to read 0x%lx bytes of data in total\n", + expected); + if (lseek(fd, offset, SEEK_SET) < 0) { + perror(PREFIX ERROR_PREFIX "seek failed"); + return false; + } + + while (offset + total_ret_count < len) { + ret_count = read(fd, buf, wr_chunk_size); + if (ret_count == 0) { + printf(PREFIX PREFIX "read reach end of the file\n"); + break; + } else if (ret_count < 0) { + perror(PREFIX ERROR_PREFIX "read failed"); + break; + } + ++val; + if (!verify_chunk(buf, ret_count, val)) + return false; + + total_ret_count += ret_count; + } + printf(PREFIX PREFIX "actually read 0x%lx bytes of data in total\n", + total_ret_count); + + return total_ret_count == expected; +} + +static bool read_hugepage_filemap(int fd, size_t len, + size_t wr_chunk_size, size_t expected) 
+{ + char buf[MAX_WRITE_READ_CHUNK_SIZE]; + ssize_t ret_count = 0; + ssize_t total_ret_count = 0; + char val = 0; + + printf(PREFIX PREFIX "expect to read 0x%lx bytes of data in total\n", + expected); + while (total_ret_count < len) { + ret_count = read(fd, buf, wr_chunk_size); + if (ret_count == 0) { + printf(PREFIX PREFIX "read reach end of the file\n"); + break; + } else if (ret_count < 0) { + perror(PREFIX ERROR_PREFIX "read failed"); + break; + } + ++val; + if (!verify_chunk(buf, ret_count, val)) + return false; + + total_ret_count += ret_count; + } + printf(PREFIX PREFIX "actually read 0x%lx bytes of data in total\n", + total_ret_count); + + return total_ret_count == expected; +} + +static enum test_status +test_hugetlb_read(int fd, size_t len, size_t wr_chunk_size) +{ + enum test_status status = TEST_SKIPPED; + char *filemap = NULL; + + if (ftruncate(fd, len) < 0) { + perror(PREFIX ERROR_PREFIX "ftruncate failed"); + return status; + } + + filemap = mmap(NULL, len, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, 0); + if (filemap == MAP_FAILED) { + perror(PREFIX ERROR_PREFIX "mmap for primary mapping failed"); + goto done; + } + + setup_filemap(filemap, len, wr_chunk_size); + status = TEST_FAILED; + + if (read_hugepage_filemap(fd, len, wr_chunk_size, len)) + status = TEST_PASSED; + + munmap(filemap, len); +done: + if (ftruncate(fd, 0) < 0) { + perror(PREFIX ERROR_PREFIX "ftruncate back to 0 failed"); + status = TEST_FAILED; + } + + return status; +} + +static enum test_status +test_hugetlb_read_hwpoison(int fd, size_t len, size_t wr_chunk_size, + bool skip_hwpoison_page) +{ + enum test_status status = TEST_SKIPPED; + char *filemap = NULL; + char *hwp_addr = NULL; + const unsigned long pagesize = getpagesize(); + + if (ftruncate(fd, len) < 0) { + perror(PREFIX ERROR_PREFIX "ftruncate failed"); + return status; + } + + filemap = mmap(NULL, len, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_POPULATE, fd, 0); + if (filemap == MAP_FAILED) { + perror(PREFIX 
ERROR_PREFIX "mmap for primary mapping failed"); + goto done; + } + + setup_filemap(filemap, len, wr_chunk_size); + status = TEST_FAILED; + + /* + * Poisoned hugetlb page layout (assume hugepagesize=2MB): + * |<---------------------- 1MB ---------------------->| + * |<---- healthy page ---->|<---- HWPOISON page ----->| + * |<------------------- (1MB - 8KB) ----------------->| + */ + hwp_addr = filemap + len / 2 + pagesize; + if (madvise(hwp_addr, pagesize, MADV_HWPOISON) < 0) { + perror(PREFIX ERROR_PREFIX "MADV_HWPOISON failed"); + goto unmap; + } + + if (!skip_hwpoison_page) { + /* + * Userspace should be able to read (1MB + 1 page) from + * the beginning of the HWPOISONed hugepage. + */ + if (read_hugepage_filemap(fd, len, wr_chunk_size, + len / 2 + pagesize)) + status = TEST_PASSED; + } else { + /* + * Userspace should be able to read (1MB - 2 pages) from + * HWPOISONed hugepage. + */ + if (seek_read_hugepage_filemap(fd, len, wr_chunk_size, + len / 2 + MAX(2 * pagesize, wr_chunk_size), + len / 2 - MAX(2 * pagesize, wr_chunk_size))) + status = TEST_PASSED; + } + +unmap: + munmap(filemap, len); +done: + if (ftruncate(fd, 0) < 0) { + perror(PREFIX ERROR_PREFIX "ftruncate back to 0 failed"); + status = TEST_FAILED; + } + + return status; +} + +static int create_hugetlbfs_file(struct statfs *file_stat) +{ + int fd; + + fd = memfd_create("hugetlb_tmp", MFD_HUGETLB); + if (fd < 0) { + perror(PREFIX ERROR_PREFIX "could not open hugetlbfs file"); + return -1; + } + + memset(file_stat, 0, sizeof(*file_stat)); + if (fstatfs(fd, file_stat)) { + perror(PREFIX ERROR_PREFIX "fstatfs failed"); + goto close; + } + if (file_stat->f_type != HUGETLBFS_MAGIC) { + printf(PREFIX ERROR_PREFIX "not hugetlbfs file\n"); + goto close; + } + + return fd; +close: + close(fd); + return -1; +} + +int main(void) +{ + int fd; + struct statfs file_stat; + enum test_status status; + /* Test read() in different granularity. 
*/ + size_t wr_chunk_sizes[] = { + getpagesize() / 2, getpagesize(), + getpagesize() * 2, getpagesize() * 4 + }; + size_t i; + + for (i = 0; i < ARRAY_SIZE(wr_chunk_sizes); ++i) { + printf("Write/read chunk size=0x%lx\n", + wr_chunk_sizes[i]); + + fd = create_hugetlbfs_file(&file_stat); + if (fd < 0) + goto create_failure; + printf(PREFIX "HugeTLB read regression test...\n"); + status = test_hugetlb_read(fd, file_stat.f_bsize, + wr_chunk_sizes[i]); + printf(PREFIX "HugeTLB read regression test...%s\n", + status_to_str(status)); + close(fd); + if (status == TEST_FAILED) + return -1; + + fd = create_hugetlbfs_file(&file_stat); + if (fd < 0) + goto create_failure; + printf(PREFIX "HugeTLB read HWPOISON test...\n"); + status = test_hugetlb_read_hwpoison(fd, file_stat.f_bsize, + wr_chunk_sizes[i], false); + printf(PREFIX "HugeTLB read HWPOISON test...%s\n", + status_to_str(status)); + close(fd); + if (status == TEST_FAILED) + return -1; + + fd = create_hugetlbfs_file(&file_stat); + if (fd < 0) + goto create_failure; + printf(PREFIX "HugeTLB seek then read HWPOISON test...\n"); + status = test_hugetlb_read_hwpoison(fd, file_stat.f_bsize, + wr_chunk_sizes[i], true); + printf(PREFIX "HugeTLB seek then read HWPOISON test...%s\n", + status_to_str(status)); + close(fd); + if (status == TEST_FAILED) + return -1; + } + + return 0; + +create_failure: + printf(ERROR_PREFIX "Abort test: failed to create hugetlbfs file\n"); + return -1; +} From 5ba72b4d063520b1bbe00f78dcdb726d486f96d6 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 13 Jul 2023 20:05:57 +0800 Subject: [PATCH 118/489] mm/huge_memory: use RMAP_NONE when calling page_add_anon_rmap() It's more convenient and readable to use RMAP_NONE instead of false when calling page_add_anon_rmap(). No functional change intended. 
Link: https://lkml.kernel.org/r/20230713120557.218592-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9f3109ed7351fb..762be2f4244cd9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2255,7 +2255,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = pte_mksoft_dirty(entry); if (uffd_wp) entry = pte_mkuffd_wp(entry); - page_add_anon_rmap(page + i, vma, addr, false); + page_add_anon_rmap(page + i, vma, addr, RMAP_NONE); } VM_BUG_ON(!pte_none(ptep_get(pte))); set_pte_at(mm, addr, pte, entry); From f4d005af5b5499ddc71bbe704f2a6afa06dd3788 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 13 Jul 2023 20:14:32 +0800 Subject: [PATCH 119/489] mm/memcg: fix obsolete comment above MEM_CGROUP_MAX_RECLAIM_LOOPS Since commit 5660048ccac8 ("mm: move memcg hierarchy reclaim to generic reclaim code"), mem_cgroup_hierarchical_reclaim() is already renamed to mem_cgroup_soft_reclaim(). Update the corresponding comment. Link: https://lkml.kernel.org/r/20230713121432.273381-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Muchun Song Cc: Johannes Weiner Cc: Miaohe Lin Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3eaeb69ef9f5f9..93e3cc581b51d2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -197,7 +197,7 @@ static struct move_charge_struct { }; /* - * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft + * Maximum loops in mem_cgroup_soft_reclaim(), used for soft * limit reclaim to prevent infinite loops, if they ever occur. 
*/ #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 From 34c876ce5eeda6c7aa89b2068e724cf84f409ebb Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 01:26:29 +0800 Subject: [PATCH 120/489] mm/page_table_check: remove unused parameters in page_table_check_clear() Patch series "Remove unused parameters in page_table_check". This series remove unused parameters in functions from page_table_check. The first 2 patches remove unused mm and addr parameters in static common functions page_table_check_clear and page_table_check_set. The last 6 patches remove unused addr parameter in some externed functions which only need addr for cleaned page_table_check_clear or page_table_check_set. There is no intended functional change. This patch (of 8): Remove unused mm and addr in function page_table_check_clear(). Link: https://lkml.kernel.org/r/20230713172636.1705415-1-shikemeng@huaweicloud.com Link: https://lkml.kernel.org/r/20230713172636.1705415-2-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Pavel Tatashin Signed-off-by: Andrew Morton --- mm/page_table_check.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 93ec7690a0d89e..9477b93d8463a7 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -58,8 +58,7 @@ static struct page_table_check *get_page_table_check(struct page_ext *page_ext) * An entry is removed from the page table, decrement the counters for that page * verify that it is of correct type and counters do not become negative. 
*/ -static void page_table_check_clear(struct mm_struct *mm, unsigned long addr, - unsigned long pfn, unsigned long pgcnt) +static void page_table_check_clear(unsigned long pfn, unsigned long pgcnt) { struct page_ext *page_ext; struct page *page; @@ -158,8 +157,7 @@ void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr, return; if (pte_user_accessible_page(pte)) { - page_table_check_clear(mm, addr, pte_pfn(pte), - PAGE_SIZE >> PAGE_SHIFT); + page_table_check_clear(pte_pfn(pte), PAGE_SIZE >> PAGE_SHIFT); } } EXPORT_SYMBOL(__page_table_check_pte_clear); @@ -171,8 +169,7 @@ void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr, return; if (pmd_user_accessible_page(pmd)) { - page_table_check_clear(mm, addr, pmd_pfn(pmd), - PMD_SIZE >> PAGE_SHIFT); + page_table_check_clear(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT); } } EXPORT_SYMBOL(__page_table_check_pmd_clear); @@ -184,8 +181,7 @@ void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, return; if (pud_user_accessible_page(pud)) { - page_table_check_clear(mm, addr, pud_pfn(pud), - PUD_SIZE >> PAGE_SHIFT); + page_table_check_clear(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT); } } EXPORT_SYMBOL(__page_table_check_pud_clear); From 2f933eaf5bbf49b71319549464df44b87074a8ac Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 01:26:30 +0800 Subject: [PATCH 121/489] mm/page_table_check: remove unused parameters in page_table_check_set() Remove unused mm and addr in page_table_check_set(). 
Link: https://lkml.kernel.org/r/20230713172636.1705415-3-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Pavel Tatashin Signed-off-by: Andrew Morton --- mm/page_table_check.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 9477b93d8463a7..53a9a1e4f34232 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -94,8 +94,7 @@ static void page_table_check_clear(unsigned long pfn, unsigned long pgcnt) * verify that it is of correct type and is not being mapped with a different * type to a different process. */ -static void page_table_check_set(struct mm_struct *mm, unsigned long addr, - unsigned long pfn, unsigned long pgcnt, +static void page_table_check_set(unsigned long pfn, unsigned long pgcnt, bool rw) { struct page_ext *page_ext; @@ -194,8 +193,7 @@ void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, __page_table_check_pte_clear(mm, addr, ptep_get(ptep)); if (pte_user_accessible_page(pte)) { - page_table_check_set(mm, addr, pte_pfn(pte), - PAGE_SIZE >> PAGE_SHIFT, + page_table_check_set(pte_pfn(pte), PAGE_SIZE >> PAGE_SHIFT, pte_write(pte)); } } @@ -209,8 +207,7 @@ void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, __page_table_check_pmd_clear(mm, addr, *pmdp); if (pmd_user_accessible_page(pmd)) { - page_table_check_set(mm, addr, pmd_pfn(pmd), - PMD_SIZE >> PAGE_SHIFT, + page_table_check_set(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT, pmd_write(pmd)); } } @@ -224,8 +221,7 @@ void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, __page_table_check_pud_clear(mm, addr, *pudp); if (pud_user_accessible_page(pud)) { - page_table_check_set(mm, addr, pud_pfn(pud), - PUD_SIZE >> PAGE_SHIFT, + page_table_check_set(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT, pud_write(pud)); } } From aa232204c4689427cefa55fe975692b57291523a Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 01:26:31 +0800 Subject: [PATCH 
122/489] mm/page_table_check: remove unused parameter in [__]page_table_check_pte_clear Remove unused addr in page_table_check_pte_clear and __page_table_check_pte_clear. Link: https://lkml.kernel.org/r/20230713172636.1705415-4-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Pavel Tatashin Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 2 +- arch/riscv/include/asm/pgtable.h | 2 +- arch/x86/include/asm/pgtable.h | 4 ++-- include/linux/page_table_check.h | 11 ++++------- include/linux/pgtable.h | 2 +- mm/page_table_check.c | 7 +++---- 6 files changed, 12 insertions(+), 16 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index e8a252e62b1265..f7ea51f9c1c1ef 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -928,7 +928,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, { pte_t pte = __pte(xchg_relaxed(&pte_val(*ptep), 0)); - page_table_check_pte_clear(mm, address, pte); + page_table_check_pte_clear(mm, pte); return pte; } diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 75970ee2bda223..5e07312cd3e189 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -529,7 +529,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, { pte_t pte = __pte(atomic_long_xchg((atomic_long_t *)ptep, 0)); - page_table_check_pte_clear(mm, address, pte); + page_table_check_pte_clear(mm, pte); return pte; } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 5700bb3379877e..5085e838b860e2 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1068,7 +1068,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t pte = native_ptep_get_and_clear(ptep); - page_table_check_pte_clear(mm, addr, pte); + page_table_check_pte_clear(mm, pte); return pte; } @@ -1084,7 +1084,7 @@ static inline 
pte_t ptep_get_and_clear_full(struct mm_struct *mm, * care about updates and native needs no locking */ pte = native_local_ptep_get_and_clear(ptep); - page_table_check_pte_clear(mm, addr, pte); + page_table_check_pte_clear(mm, pte); } else { pte = ptep_get_and_clear(mm, addr, ptep); } diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 01e16c7696ec98..35c53c4b94d347 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -14,8 +14,7 @@ extern struct static_key_true page_table_check_disabled; extern struct page_ext_operations page_table_check_ops; void __page_table_check_zero(struct page *page, unsigned int order); -void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr, - pte_t pte); +void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr, pmd_t pmd); void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, @@ -46,13 +45,12 @@ static inline void page_table_check_free(struct page *page, unsigned int order) __page_table_check_zero(page, order); } -static inline void page_table_check_pte_clear(struct mm_struct *mm, - unsigned long addr, pte_t pte) +static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pte_clear(mm, addr, pte); + __page_table_check_pte_clear(mm, pte); } static inline void page_table_check_pmd_clear(struct mm_struct *mm, @@ -123,8 +121,7 @@ static inline void page_table_check_free(struct page *page, unsigned int order) { } -static inline void page_table_check_pte_clear(struct mm_struct *mm, - unsigned long addr, pte_t pte) +static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) { } diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 9fa34be65159c7..a1ccb13c4853b8 100644 --- a/include/linux/pgtable.h +++ 
b/include/linux/pgtable.h @@ -322,7 +322,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, { pte_t pte = ptep_get(ptep); pte_clear(mm, address, ptep); - page_table_check_pte_clear(mm, address, pte); + page_table_check_pte_clear(mm, pte); return pte; } #endif diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 53a9a1e4f34232..a1015fc4d0454d 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -149,8 +149,7 @@ void __page_table_check_zero(struct page *page, unsigned int order) page_ext_put(page_ext); } -void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr, - pte_t pte) +void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) { if (&init_mm == mm) return; @@ -191,7 +190,7 @@ void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, if (&init_mm == mm) return; - __page_table_check_pte_clear(mm, addr, ptep_get(ptep)); + __page_table_check_pte_clear(mm, ptep_get(ptep)); if (pte_user_accessible_page(pte)) { page_table_check_set(pte_pfn(pte), PAGE_SIZE >> PAGE_SHIFT, pte_write(pte)); @@ -241,7 +240,7 @@ void __page_table_check_pte_clear_range(struct mm_struct *mm, if (WARN_ON(!ptep)) return; for (i = 0; i < PTRS_PER_PTE; i++) { - __page_table_check_pte_clear(mm, addr, ptep_get(ptep)); + __page_table_check_pte_clear(mm, ptep_get(ptep)); addr += PAGE_SIZE; ptep++; } From 1831414cd729a34af937d56ad684a66599de6344 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 01:26:32 +0800 Subject: [PATCH 123/489] mm/page_table_check: remove unused parameter in [__]page_table_check_pmd_clear Remove unused addr in page_table_check_pmd_clear and __page_table_check_pmd_clear. 
Link: https://lkml.kernel.org/r/20230713172636.1705415-5-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Pavel Tatashin Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 2 +- arch/riscv/include/asm/pgtable.h | 2 +- arch/x86/include/asm/pgtable.h | 2 +- include/linux/page_table_check.h | 11 ++++------- include/linux/pgtable.h | 2 +- mm/page_table_check.c | 5 ++--- 6 files changed, 10 insertions(+), 14 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index f7ea51f9c1c1ef..6e3387ec6013a2 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -940,7 +940,7 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, { pmd_t pmd = __pmd(xchg_relaxed(&pmd_val(*pmdp), 0)); - page_table_check_pmd_clear(mm, address, pmd); + page_table_check_pmd_clear(mm, pmd); return pmd; } diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 5e07312cd3e189..388c3af8a9f913 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -742,7 +742,7 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, { pmd_t pmd = __pmd(atomic_long_xchg((atomic_long_t *)pmdp, 0)); - page_table_check_pmd_clear(mm, address, pmd); + page_table_check_pmd_clear(mm, pmd); return pmd; } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 5085e838b860e2..5d71f933d93340 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1133,7 +1133,7 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long { pmd_t pmd = native_pmdp_get_and_clear(pmdp); - page_table_check_pmd_clear(mm, addr, pmd); + page_table_check_pmd_clear(mm, pmd); return pmd; } diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 35c53c4b94d347..0f777bca5283d5 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -15,8 
+15,7 @@ extern struct page_ext_operations page_table_check_ops; void __page_table_check_zero(struct page *page, unsigned int order); void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); -void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr, - pmd_t pmd); +void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, pud_t pud); void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, @@ -53,13 +52,12 @@ static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) __page_table_check_pte_clear(mm, pte); } -static inline void page_table_check_pmd_clear(struct mm_struct *mm, - unsigned long addr, pmd_t pmd) +static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pmd_clear(mm, addr, pmd); + __page_table_check_pmd_clear(mm, pmd); } static inline void page_table_check_pud_clear(struct mm_struct *mm, @@ -125,8 +123,7 @@ static inline void page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) { } -static inline void page_table_check_pmd_clear(struct mm_struct *mm, - unsigned long addr, pmd_t pmd) +static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) { } diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index a1ccb13c4853b8..3edef5ed008fd4 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -425,7 +425,7 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, pmd_t pmd = *pmdp; pmd_clear(pmdp); - page_table_check_pmd_clear(mm, address, pmd); + page_table_check_pmd_clear(mm, pmd); return pmd; } diff --git a/mm/page_table_check.c b/mm/page_table_check.c index a1015fc4d0454d..51f2274c0a202d 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -160,8 +160,7 @@ void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte) } 
EXPORT_SYMBOL(__page_table_check_pte_clear); -void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr, - pmd_t pmd) +void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) { if (&init_mm == mm) return; @@ -204,7 +203,7 @@ void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, if (&init_mm == mm) return; - __page_table_check_pmd_clear(mm, addr, *pmdp); + __page_table_check_pmd_clear(mm, *pmdp); if (pmd_user_accessible_page(pmd)) { page_table_check_set(pmd_pfn(pmd), PMD_SIZE >> PAGE_SHIFT, pmd_write(pmd)); From 931c38e16499a057e30a3033f4d6a9c242f0f156 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 01:26:33 +0800 Subject: [PATCH 124/489] mm/page_table_check: remove unused parameter in [__]page_table_check_pud_clear Remove unused addr in __page_table_check_pud_clear and page_table_check_pud_clear. Link: https://lkml.kernel.org/r/20230713172636.1705415-6-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Pavel Tatashin Signed-off-by: Andrew Morton --- arch/x86/include/asm/pgtable.h | 2 +- include/linux/page_table_check.h | 11 ++++------- include/linux/pgtable.h | 2 +- mm/page_table_check.c | 5 ++--- 4 files changed, 8 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 5d71f933d93340..f07c610c345878 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1144,7 +1144,7 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, { pud_t pud = native_pudp_get_and_clear(pudp); - page_table_check_pud_clear(mm, addr, pud); + page_table_check_pud_clear(mm, pud); return pud; } diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 0f777bca5283d5..5c9dc848a1bc07 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -16,8 +16,7 @@ extern struct page_ext_operations page_table_check_ops; void __page_table_check_zero(struct page *page, unsigned 
int order); void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); -void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, - pud_t pud); +void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud); void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, @@ -60,13 +59,12 @@ static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) __page_table_check_pmd_clear(mm, pmd); } -static inline void page_table_check_pud_clear(struct mm_struct *mm, - unsigned long addr, pud_t pud) +static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pud_clear(mm, addr, pud); + __page_table_check_pud_clear(mm, pud); } static inline void page_table_check_pte_set(struct mm_struct *mm, @@ -127,8 +125,7 @@ static inline void page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) { } -static inline void page_table_check_pud_clear(struct mm_struct *mm, - unsigned long addr, pud_t pud) +static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) { } diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 3edef5ed008fd4..5f36c055794bed 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -438,7 +438,7 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, pud_t pud = *pudp; pud_clear(pudp); - page_table_check_pud_clear(mm, address, pud); + page_table_check_pud_clear(mm, pud); return pud; } diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 51f2274c0a202d..2643135bf45c2b 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -171,8 +171,7 @@ void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd) } EXPORT_SYMBOL(__page_table_check_pmd_clear); 
-void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, - pud_t pud) +void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) { if (&init_mm == mm) return; @@ -217,7 +216,7 @@ void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, if (&init_mm == mm) return; - __page_table_check_pud_clear(mm, addr, *pudp); + __page_table_check_pud_clear(mm, *pudp); if (pud_user_accessible_page(pud)) { page_table_check_set(pud_pfn(pud), PUD_SIZE >> PAGE_SHIFT, pud_write(pud)); From 1066293d426d3000793c3c3b4276ef38b63ada4a Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 01:26:34 +0800 Subject: [PATCH 125/489] mm/page_table_check: remove unused parameter in [__]page_table_check_pte_set Remove unused addr in __page_table_check_pte_set and page_table_check_pte_set. Link: https://lkml.kernel.org/r/20230713172636.1705415-7-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Pavel Tatashin Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 2 +- arch/riscv/include/asm/pgtable.h | 2 +- arch/x86/include/asm/pgtable.h | 2 +- include/linux/page_table_check.h | 11 ++++------- mm/page_table_check.c | 3 +-- 5 files changed, 8 insertions(+), 12 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 6e3387ec6013a2..f0a8dcbca04aa8 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -348,7 +348,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - page_table_check_pte_set(mm, addr, ptep, pte); + page_table_check_pte_set(mm, ptep, pte); return __set_pte_at(mm, addr, ptep, pte); } diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 388c3af8a9f913..90063afe8d3650 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -499,7 +499,7 @@ static inline 
void __set_pte_at(struct mm_struct *mm, static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { - page_table_check_pte_set(mm, addr, ptep, pteval); + page_table_check_pte_set(mm, ptep, pteval); __set_pte_at(mm, addr, ptep, pteval); } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index f07c610c345878..d14f0d92f04b55 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1023,7 +1023,7 @@ static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp) static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - page_table_check_pte_set(mm, addr, ptep, pte); + page_table_check_pte_set(mm, ptep, pte); set_pte(ptep, pte); } diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 5c9dc848a1bc07..63ebd9fcf28b79 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -17,8 +17,7 @@ void __page_table_check_zero(struct page *page, unsigned int order); void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud); -void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte); +void __page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, pte_t pte); void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd); void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, @@ -67,14 +66,13 @@ static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) __page_table_check_pud_clear(mm, pud); } -static inline void page_table_check_pte_set(struct mm_struct *mm, - unsigned long addr, pte_t *ptep, +static inline void page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, pte_t pte) { if 
(static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pte_set(mm, addr, ptep, pte); + __page_table_check_pte_set(mm, ptep, pte); } static inline void page_table_check_pmd_set(struct mm_struct *mm, @@ -129,8 +127,7 @@ static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) { } -static inline void page_table_check_pte_set(struct mm_struct *mm, - unsigned long addr, pte_t *ptep, +static inline void page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, pte_t pte) { } diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 2643135bf45c2b..fc20ddc3a63e35 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -182,8 +182,7 @@ void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) } EXPORT_SYMBOL(__page_table_check_pud_clear); -void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) +void __page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, pte_t pte) { if (&init_mm == mm) return; From a3b837130b5865521fa8662aceaa6ebc8d29389a Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 01:26:35 +0800 Subject: [PATCH 126/489] mm/page_table_check: remove unused parameter in [__]page_table_check_pmd_set Remove unused addr in __page_table_check_pmd_set and page_table_check_pmd_set. 
Link: https://lkml.kernel.org/r/20230713172636.1705415-8-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Pavel Tatashin Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 4 ++-- arch/riscv/include/asm/pgtable.h | 4 ++-- arch/x86/include/asm/pgtable.h | 4 ++-- include/linux/page_table_check.h | 11 ++++------- mm/page_table_check.c | 3 +-- 5 files changed, 11 insertions(+), 15 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index f0a8dcbca04aa8..1fbf8d3f42b1b6 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -524,7 +524,7 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(mm, addr, pmdp, pmd); + page_table_check_pmd_set(mm, pmdp, pmd); return __set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd)); } @@ -976,7 +976,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, static inline pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd); + page_table_check_pmd_set(vma->vm_mm, pmdp, pmd); return __pmd(xchg_relaxed(&pmd_val(*pmdp), pmd_val(pmd))); } #endif diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 90063afe8d3650..a30658b2611bdf 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -687,7 +687,7 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd) static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(mm, addr, pmdp, pmd); + page_table_check_pmd_set(mm, pmdp, pmd); return __set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd)); } @@ -758,7 +758,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, static inline pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t 
*pmdp, pmd_t pmd) { - page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd); + page_table_check_pmd_set(vma->vm_mm, pmdp, pmd); return __pmd(atomic_long_xchg((atomic_long_t *)pmdp, pmd_val(pmd))); } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index d14f0d92f04b55..9cc26cb0bc9f90 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1030,7 +1030,7 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(mm, addr, pmdp, pmd); + page_table_check_pmd_set(mm, pmdp, pmd); set_pmd(pmdp, pmd); } @@ -1167,7 +1167,7 @@ static inline int pud_write(pud_t pud) static inline pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { - page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd); + page_table_check_pmd_set(vma->vm_mm, pmdp, pmd); if (IS_ENABLED(CONFIG_SMP)) { return xchg(pmdp, pmd); } else { diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 63ebd9fcf28b79..dd58dfb0e64360 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -18,8 +18,7 @@ void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud); void __page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, pte_t pte); -void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, pmd_t pmd); +void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd); void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud); void __page_table_check_pte_clear_range(struct mm_struct *mm, @@ -75,14 +74,13 @@ static inline void page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, 
__page_table_check_pte_set(mm, ptep, pte); } -static inline void page_table_check_pmd_set(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp, +static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pmd_set(mm, addr, pmdp, pmd); + __page_table_check_pmd_set(mm, pmdp, pmd); } static inline void page_table_check_pud_set(struct mm_struct *mm, @@ -132,8 +130,7 @@ static inline void page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, { } -static inline void page_table_check_pmd_set(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp, +static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd) { } diff --git a/mm/page_table_check.c b/mm/page_table_check.c index fc20ddc3a63e35..033956704a6404 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -195,8 +195,7 @@ void __page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, pte_t pte) } EXPORT_SYMBOL(__page_table_check_pte_set); -void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, pmd_t pmd) +void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd) { if (&init_mm == mm) return; From 6d144436d954311f2dbacb5bf7b084042448d83e Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 01:26:36 +0800 Subject: [PATCH 127/489] mm/page_table_check: remove unused parameter in [__]page_table_check_pud_set Remove unused addr in __page_table_check_pud_set and page_table_check_pud_set. 
Link: https://lkml.kernel.org/r/20230713172636.1705415-9-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Pavel Tatashin Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 2 +- arch/riscv/include/asm/pgtable.h | 2 +- arch/x86/include/asm/pgtable.h | 2 +- include/linux/page_table_check.h | 11 ++++------- mm/page_table_check.c | 3 +-- 5 files changed, 8 insertions(+), 12 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 1fbf8d3f42b1b6..fe4b913589eed4 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -531,7 +531,7 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud) { - page_table_check_pud_set(mm, addr, pudp, pud); + page_table_check_pud_set(mm, pudp, pud); return __set_pte_at(mm, addr, (pte_t *)pudp, pud_pte(pud)); } diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index a30658b2611bdf..44377f0d7c35e7 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -694,7 +694,7 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud) { - page_table_check_pud_set(mm, addr, pudp, pud); + page_table_check_pud_set(mm, pudp, pud); return __set_pte_at(mm, addr, (pte_t *)pudp, pud_pte(pud)); } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 9cc26cb0bc9f90..ada1bbf1296129 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1037,7 +1037,7 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud) { - page_table_check_pud_set(mm, addr, pudp, pud); + page_table_check_pud_set(mm, pudp, pud); 
native_set_pud(pudp, pud); } diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index dd58dfb0e64360..7f6b9bf926c5d8 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -19,8 +19,7 @@ void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud); void __page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, pte_t pte); void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd); -void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, - pud_t *pudp, pud_t pud); +void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud); void __page_table_check_pte_clear_range(struct mm_struct *mm, unsigned long addr, pmd_t pmd); @@ -83,14 +82,13 @@ static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, __page_table_check_pmd_set(mm, pmdp, pmd); } -static inline void page_table_check_pud_set(struct mm_struct *mm, - unsigned long addr, pud_t *pudp, +static inline void page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pud_set(mm, addr, pudp, pud); + __page_table_check_pud_set(mm, pudp, pud); } static inline void page_table_check_pte_clear_range(struct mm_struct *mm, @@ -135,8 +133,7 @@ static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, { } -static inline void page_table_check_pud_set(struct mm_struct *mm, - unsigned long addr, pud_t *pudp, +static inline void page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud) { } diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 033956704a6404..84c8163984e5ce 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -208,8 +208,7 @@ void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd) } EXPORT_SYMBOL(__page_table_check_pmd_set); 
-void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, - pud_t *pudp, pud_t pud) +void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud) { if (&init_mm == mm) return; From b23d03ef7af567929bcd3fca7eea6f4f347387d3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 13 Jul 2023 04:55:06 +0100 Subject: [PATCH 128/489] highmem: add memcpy_to_folio() and memcpy_from_folio() Patch series "More filesystem folio conversions for 6.6". Remove the only spots in affs which actually use a struct page; there are a few places where one is mentioned, but it's part of the interface. The rest of this is removing the remaining calls to set_bh_page(), and then removing the function before any new users show up. This patch (of 7): These are the folio equivalent of memcpy_to_page() and memcpy_from_page(). [agruenba@redhat.com: use correct chunk size in memcpy()] Link: https://lkml.kernel.org/r/20230802144354.1023099-1-agruenba@redhat.com Link: https://lkml.kernel.org/r/20230713035512.4139457-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230713035512.4139457-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Sterba Cc: Jan Kara Cc: Konstantin Komarov Cc: Pankaj Raghav Cc: "Theodore Ts'o" Cc: Alexander Viro Cc: Christian Brauner Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Tom Rix Cc: Andreas Gruenbacher Signed-off-by: Andrew Morton --- include/linux/highmem.h | 44 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 68da30625a6c1c..99c474de800ddc 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -439,6 +439,50 @@ static inline void memzero_page(struct page *page, size_t offset, size_t len) kunmap_local(addr); } +static inline void memcpy_from_folio(char *to, struct folio *folio, + size_t offset, size_t len) +{ + VM_BUG_ON(offset + len > folio_size(folio)); + + do { + const char *from = 
kmap_local_folio(folio, offset); + size_t chunk = len; + + if (folio_test_highmem(folio) && + chunk > PAGE_SIZE - offset_in_page(offset)) + chunk = PAGE_SIZE - offset_in_page(offset); + memcpy(to, from, chunk); + kunmap_local(from); + + to += chunk; + offset += chunk; + len -= chunk; + } while (len > 0); +} + +static inline void memcpy_to_folio(struct folio *folio, size_t offset, + const char *from, size_t len) +{ + VM_BUG_ON(offset + len > folio_size(folio)); + + do { + char *to = kmap_local_folio(folio, offset); + size_t chunk = len; + + if (folio_test_highmem(folio) && + chunk > PAGE_SIZE - offset_in_page(offset)) + chunk = PAGE_SIZE - offset_in_page(offset); + memcpy(to, from, chunk); + kunmap_local(to); + + from += chunk; + offset += chunk; + len -= chunk; + } while (len > 0); + + flush_dcache_folio(folio); +} + /** * memcpy_from_file_folio - Copy some bytes from a file folio. * @to: The destination buffer. From 41a638a1b3fca4ceb46c7a7b8372f43f38603297 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 13 Jul 2023 04:55:07 +0100 Subject: [PATCH 129/489] affs: convert affs_symlink_read_folio() to use the folio Remove use of the old page APIs. That includes use of setting PageError on error; simply not setting the uptodate flag is sufficient.
Link: https://lkml.kernel.org/r/20230713035512.4139457-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: David Sterba Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Cc: Konstantin Komarov Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Pankaj Raghav Cc: "Theodore Ts'o" Cc: Tom Rix Signed-off-by: Andrew Morton --- fs/affs/symlink.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c index 31d6446dc16689..094aec8d17b88b 100644 --- a/fs/affs/symlink.c +++ b/fs/affs/symlink.c @@ -13,10 +13,9 @@ static int affs_symlink_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; struct buffer_head *bh; - struct inode *inode = page->mapping->host; - char *link = page_address(page); + struct inode *inode = folio->mapping->host; + char *link = folio_address(folio); struct slink_front *lf; int i, j; char c; @@ -58,12 +57,11 @@ static int affs_symlink_read_folio(struct file *file, struct folio *folio) } link[i] = '\0'; affs_brelse(bh); - SetPageUptodate(page); - unlock_page(page); + folio_mark_uptodate(folio); + folio_unlock(folio); return 0; fail: - SetPageError(page); - unlock_page(page); + folio_unlock(folio); return -EIO; } From 341130265c81e05cdbd0fecdb8ffb6db1fff693e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 13 Jul 2023 04:55:08 +0100 Subject: [PATCH 130/489] affs: convert data read and write to use folios We still need to convert to/from folios in write_begin & write_end to fit the API, but this removes a lot of calls to old page-based functions, removing many hidden calls to compound_head(). 
Link: https://lkml.kernel.org/r/20230713035512.4139457-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Pankaj Raghav Acked-by: David Sterba Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Cc: Konstantin Komarov Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: "Theodore Ts'o" Cc: Tom Rix Signed-off-by: Andrew Morton --- fs/affs/file.c | 77 +++++++++++++++++++++++++------------------------- 1 file changed, 38 insertions(+), 39 deletions(-) diff --git a/fs/affs/file.c b/fs/affs/file.c index e43f2f007ac1a5..705e227ff63d06 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -520,21 +520,20 @@ affs_getemptyblk_ino(struct inode *inode, int block) return ERR_PTR(err); } -static int -affs_do_readpage_ofs(struct page *page, unsigned to, int create) +static int affs_do_read_folio_ofs(struct folio *folio, size_t to, int create) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct super_block *sb = inode->i_sb; struct buffer_head *bh; - unsigned pos = 0; - u32 bidx, boff, bsize; + size_t pos = 0; + size_t bidx, boff, bsize; u32 tmp; - pr_debug("%s(%lu, %ld, 0, %d)\n", __func__, inode->i_ino, - page->index, to); - BUG_ON(to > PAGE_SIZE); + pr_debug("%s(%lu, %ld, 0, %zu)\n", __func__, inode->i_ino, + folio->index, to); + BUG_ON(to > folio_size(folio)); bsize = AFFS_SB(sb)->s_data_blksize; - tmp = page->index << PAGE_SHIFT; + tmp = folio_pos(folio); bidx = tmp / bsize; boff = tmp % bsize; @@ -544,7 +543,7 @@ affs_do_readpage_ofs(struct page *page, unsigned to, int create) return PTR_ERR(bh); tmp = min(bsize - boff, to - pos); BUG_ON(pos + tmp > to || tmp > bsize); - memcpy_to_page(page, pos, AFFS_DATA(bh) + boff, tmp); + memcpy_to_folio(folio, pos, AFFS_DATA(bh) + boff, tmp); affs_brelse(bh); bidx++; pos += tmp; @@ -624,25 +623,23 @@ affs_extent_file_ofs(struct inode *inode, u32 newsize) return PTR_ERR(bh); } -static int -affs_read_folio_ofs(struct file *file, struct folio *folio) +static int 
affs_read_folio_ofs(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - struct inode *inode = page->mapping->host; - u32 to; + struct inode *inode = folio->mapping->host; + size_t to; int err; - pr_debug("%s(%lu, %ld)\n", __func__, inode->i_ino, page->index); - to = PAGE_SIZE; - if (((page->index + 1) << PAGE_SHIFT) > inode->i_size) { - to = inode->i_size & ~PAGE_MASK; - memset(page_address(page) + to, 0, PAGE_SIZE - to); + pr_debug("%s(%lu, %ld)\n", __func__, inode->i_ino, folio->index); + to = folio_size(folio); + if (folio_pos(folio) + to > inode->i_size) { + to = inode->i_size - folio_pos(folio); + folio_zero_segment(folio, to, folio_size(folio)); } - err = affs_do_readpage_ofs(page, to, 0); + err = affs_do_read_folio_ofs(folio, to, 0); if (!err) - SetPageUptodate(page); - unlock_page(page); + folio_mark_uptodate(folio); + folio_unlock(folio); return err; } @@ -651,7 +648,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping struct page **pagep, void **fsdata) { struct inode *inode = mapping->host; - struct page *page; + struct folio *folio; pgoff_t index; int err = 0; @@ -667,19 +664,20 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping } index = pos >> PAGE_SHIFT; - page = grab_cache_page_write_begin(mapping, index); - if (!page) - return -ENOMEM; - *pagep = page; + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) + return PTR_ERR(folio); + *pagep = &folio->page; - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) return 0; /* XXX: inefficient but safe in the face of short writes */ - err = affs_do_readpage_ofs(page, PAGE_SIZE, 1); + err = affs_do_read_folio_ofs(folio, folio_size(folio), 1); if (err) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } return err; } @@ -688,6 +686,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, 
loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { + struct folio *folio = page_folio(page); struct inode *inode = mapping->host; struct super_block *sb = inode->i_sb; struct buffer_head *bh, *prev_bh; @@ -701,18 +700,18 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, to = from + len; /* * XXX: not sure if this can handle short copies (len < copied), but - * we don't have to, because the page should always be uptodate here, + * we don't have to, because the folio should always be uptodate here, * due to write_begin. */ pr_debug("%s(%lu, %llu, %llu)\n", __func__, inode->i_ino, pos, pos + len); bsize = AFFS_SB(sb)->s_data_blksize; - data = page_address(page); + data = folio_address(folio); bh = NULL; written = 0; - tmp = (page->index << PAGE_SHIFT) + from; + tmp = (folio->index << PAGE_SHIFT) + from; bidx = tmp / bsize; boff = tmp % bsize; if (boff) { @@ -804,11 +803,11 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, from += tmp; bidx++; } - SetPageUptodate(page); + folio_mark_uptodate(folio); done: affs_brelse(bh); - tmp = (page->index << PAGE_SHIFT) + from; + tmp = (folio->index << PAGE_SHIFT) + from; if (tmp > inode->i_size) inode->i_size = AFFS_I(inode)->mmu_private = tmp; @@ -819,8 +818,8 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, } err_first_bh: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return written; From d5db4f9df9397d398256a2e33ad63c39c213b990 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 13 Jul 2023 04:55:09 +0100 Subject: [PATCH 131/489] migrate: use folio_set_bh() instead of set_bh_page() This function was converted before folio_set_bh() existed. Catch up to the new API. 
Link: https://lkml.kernel.org/r/20230713035512.4139457-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Christian Brauner Cc: David Sterba Cc: Konstantin Komarov Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Pankaj Raghav Cc: "Theodore Ts'o" Cc: Tom Rix Signed-off-by: Andrew Morton --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/migrate.c b/mm/migrate.c index e9821e245e7036..e21d5a7e7447ec 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -773,7 +773,7 @@ static int __buffer_migrate_folio(struct address_space *mapping, bh = head; do { - set_bh_page(bh, &dst->page, bh_offset(bh)); + folio_set_bh(bh, dst, bh_offset(bh)); bh = bh->b_this_page; } while (bh != head); From 07811230c3cd820a88c3429c646d3fa60aac65e5 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 13 Jul 2023 04:55:10 +0100 Subject: [PATCH 132/489] ntfs3: convert ntfs_get_block_vbo() to use a folio Remove a user of set_bh_page(). 
Link: https://lkml.kernel.org/r/20230713035512.4139457-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Konstantin Komarov Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Tom Rix Cc: Alexander Viro Cc: Christian Brauner Cc: David Sterba Cc: Jan Kara Cc: Pankaj Raghav Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton --- fs/ntfs3/inode.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index dc7e7ab701c6a7..8ae572aacc692b 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -554,7 +554,7 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo, struct super_block *sb = inode->i_sb; struct ntfs_sb_info *sbi = sb->s_fs_info; struct ntfs_inode *ni = ntfs_i(inode); - struct page *page = bh->b_page; + struct folio *folio = bh->b_folio; u8 cluster_bits = sbi->cluster_bits; u32 block_size = sb->s_blocksize; u64 bytes, lbo, valid; @@ -569,7 +569,7 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo, if (is_resident(ni)) { ni_lock(ni); - err = attr_data_read_resident(ni, page); + err = attr_data_read_resident(ni, &folio->page); ni_unlock(ni); if (!err) @@ -642,17 +642,17 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo, */ bytes = block_size; - if (page) { + if (folio) { u32 voff = valid - vbo; bh->b_size = block_size; off = vbo & (PAGE_SIZE - 1); - set_bh_page(bh, page, off); + folio_set_bh(bh, folio, off); err = bh_read(bh, 0); if (err < 0) goto out; - zero_user_segment(page, off + voff, off + block_size); + folio_zero_segment(folio, off + voff, off + block_size); } } From 8147c4c4546f9f05ef03bb839b741473b28bb560 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 13 Jul 2023 04:55:11 +0100 Subject: [PATCH 133/489] jbd2: use a folio in jbd2_journal_write_metadata_buffer() The primary goal here is removing the use of set_bh_page(). Take the opportunity to switch from kmap_atomic() to kmap_local(). 
This simplifies the function as the offset is already added to the pointer. Link: https://lkml.kernel.org/r/20230713035512.4139457-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Cc: "Theodore Ts'o" Cc: Alexander Viro Cc: Christian Brauner Cc: David Sterba Cc: Konstantin Komarov Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Pankaj Raghav Cc: Tom Rix Signed-off-by: Andrew Morton --- fs/jbd2/journal.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index fbce16fedaa4a8..1b5a45ab62b0d1 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -341,7 +341,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, int do_escape = 0; char *mapped_data; struct buffer_head *new_bh; - struct page *new_page; + struct folio *new_folio; unsigned int new_offset; struct buffer_head *bh_in = jh2bh(jh_in); journal_t *journal = transaction->t_journal; @@ -370,14 +370,14 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, */ if (jh_in->b_frozen_data) { done_copy_out = 1; - new_page = virt_to_page(jh_in->b_frozen_data); - new_offset = offset_in_page(jh_in->b_frozen_data); + new_folio = virt_to_folio(jh_in->b_frozen_data); + new_offset = offset_in_folio(new_folio, jh_in->b_frozen_data); } else { - new_page = jh2bh(jh_in)->b_page; - new_offset = offset_in_page(jh2bh(jh_in)->b_data); + new_folio = jh2bh(jh_in)->b_folio; + new_offset = offset_in_folio(new_folio, jh2bh(jh_in)->b_data); } - mapped_data = kmap_atomic(new_page); + mapped_data = kmap_local_folio(new_folio, new_offset); /* * Fire data frozen trigger if data already wasn't frozen. Do this * before checking for escaping, as the trigger may modify the magic @@ -385,18 +385,17 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, * data in the buffer. 
*/ if (!done_copy_out) - jbd2_buffer_frozen_trigger(jh_in, mapped_data + new_offset, + jbd2_buffer_frozen_trigger(jh_in, mapped_data, jh_in->b_triggers); /* * Check for escaping */ - if (*((__be32 *)(mapped_data + new_offset)) == - cpu_to_be32(JBD2_MAGIC_NUMBER)) { + if (*((__be32 *)mapped_data) == cpu_to_be32(JBD2_MAGIC_NUMBER)) { need_copy_out = 1; do_escape = 1; } - kunmap_atomic(mapped_data); + kunmap_local(mapped_data); /* * Do we need to do a data copy? @@ -417,12 +416,10 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, } jh_in->b_frozen_data = tmp; - mapped_data = kmap_atomic(new_page); - memcpy(tmp, mapped_data + new_offset, bh_in->b_size); - kunmap_atomic(mapped_data); + memcpy_from_folio(tmp, new_folio, new_offset, bh_in->b_size); - new_page = virt_to_page(tmp); - new_offset = offset_in_page(tmp); + new_folio = virt_to_folio(tmp); + new_offset = offset_in_folio(new_folio, tmp); done_copy_out = 1; /* @@ -438,12 +435,12 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, * copying, we can finally do so. */ if (do_escape) { - mapped_data = kmap_atomic(new_page); - *((unsigned int *)(mapped_data + new_offset)) = 0; - kunmap_atomic(mapped_data); + mapped_data = kmap_local_folio(new_folio, new_offset); + *((unsigned int *)mapped_data) = 0; + kunmap_local(mapped_data); } - set_bh_page(new_bh, new_page, new_offset); + folio_set_bh(new_bh, new_folio, new_offset); new_bh->b_size = bh_in->b_size; new_bh->b_bdev = journal->j_dev; new_bh->b_blocknr = blocknr; From 5f6d28622ffc7fa356b2745b088c831ebb8546b0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 13 Jul 2023 04:55:12 +0100 Subject: [PATCH 134/489] buffer: remove set_bh_page() With all users converted to folio_set_bh(), remove this function. 
Link: https://lkml.kernel.org/r/20230713035512.4139457-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Christian Brauner Cc: David Sterba Cc: Konstantin Komarov Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Pankaj Raghav Cc: "Theodore Ts'o" Cc: Tom Rix Signed-off-by: Andrew Morton --- fs/buffer.c | 15 --------------- include/linux/buffer_head.h | 2 -- 2 files changed, 17 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 587e4d4af9deef..f0563ebae75f64 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1539,21 +1539,6 @@ void invalidate_bh_lrus_cpu(void) bh_lru_unlock(); } -void set_bh_page(struct buffer_head *bh, - struct page *page, unsigned long offset) -{ - bh->b_page = page; - BUG_ON(offset >= PAGE_SIZE); - if (PageHighMem(page)) - /* - * This catches illegal uses and preserves the offset: - */ - bh->b_data = (char *)(0 + offset); - else - bh->b_data = page_address(page) + offset; -} -EXPORT_SYMBOL(set_bh_page); - void folio_set_bh(struct buffer_head *bh, struct folio *folio, unsigned long offset) { diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index a7377877ff4ed7..06566aee94ca4a 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -194,8 +194,6 @@ void buffer_check_dirty_writeback(struct folio *folio, void mark_buffer_dirty(struct buffer_head *bh); void mark_buffer_write_io_error(struct buffer_head *bh); void touch_buffer(struct buffer_head *bh); -void set_bh_page(struct buffer_head *bh, - struct page *page, unsigned long offset); void folio_set_bh(struct buffer_head *bh, struct folio *folio, unsigned long offset); bool try_to_free_buffers(struct folio *); From 063ff7cd8bf24aa14c897b6168591d3d0dae2a5e Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 19:47:47 +0800 Subject: [PATCH 135/489] mm/page_ext: remove unused return value of offline_page_ext Patch series "minor cleanups for page_ext". 
This series contains some random minor cleanups for page_ext. More details can be found in respective patches. This patch (of 3): offline_page_ext always returns 0 and no caller checks the return value. Just remove unused return value of offline_page_ext. Link: https://lkml.kernel.org/r/20230714114749.1743032-1-shikemeng@huaweicloud.com Link: https://lkml.kernel.org/r/20230714114749.1743032-2-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Joonsoo Kim Signed-off-by: Andrew Morton --- mm/page_ext.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/page_ext.c b/mm/page_ext.c index dc1626be458bff..096451df1c87cb 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -430,7 +430,7 @@ static int __meminit online_page_ext(unsigned long start_pfn, return -ENOMEM; } -static int __meminit offline_page_ext(unsigned long start_pfn, +static void __meminit offline_page_ext(unsigned long start_pfn, unsigned long nr_pages) { unsigned long start, end, pfn; @@ -454,8 +454,6 @@ static int __meminit offline_page_ext(unsigned long start_pfn, for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) __free_page_ext(pfn); - return 0; - } static int __meminit page_ext_callback(struct notifier_block *self, From 3c09be5a2be861d7f74b0251a8e77859b4c654cc Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 19:47:48 +0800 Subject: [PATCH 136/489] mm/page_ext: remove rollback for untouched mem_section in online_page_ext If init_section_page_ext failed, we only need rollback for mem_section before failed mem_section. Make rollback end point to failed mem_section to remove unnecessary rollback. As pfn += PAGES_PER_SECTION will be executed even if init_section_page_ext failed. So pfn points to mem_section after failed mem_section. Subtract one mem_section from pfn to get failed mem_section. 
Link: https://lkml.kernel.org/r/20230714114749.1743032-3-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Joonsoo Kim Signed-off-by: Andrew Morton --- mm/page_ext.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/page_ext.c b/mm/page_ext.c index 096451df1c87cb..f052397dc70f14 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -424,6 +424,7 @@ static int __meminit online_page_ext(unsigned long start_pfn, return 0; /* rollback */ + end = pfn - PAGES_PER_SECTION; for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) __free_page_ext(pfn); From eb0da7f6e0832a689d845ca2d62152dc6b43e780 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 14 Jul 2023 19:47:49 +0800 Subject: [PATCH 137/489] mm/page_ext: move functions around for minor cleanups to page_ext 1. move page_ext_get and page_ext_put down to remove forward declaration of lookup_page_ext. 2. move page_ext_init_flatmem_late down to existing non SPARS block to remove a new non SPARS block and to keep code for non SPARS tight. Link: https://lkml.kernel.org/r/20230714114749.1743032-4-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Joonsoo Kim Signed-off-by: Andrew Morton --- mm/page_ext.c | 96 ++++++++++++++++++++++++--------------------------- 1 file changed, 46 insertions(+), 50 deletions(-) diff --git a/mm/page_ext.c b/mm/page_ext.c index f052397dc70f14..4548fcc66d74d0 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -90,7 +90,6 @@ static struct page_ext_operations *page_ext_ops[] __initdata = { unsigned long page_ext_size; static unsigned long total_usage; -static struct page_ext *lookup_page_ext(const struct page *page); bool early_page_ext __meminitdata; static int __init setup_early_page_ext(char *str) @@ -137,62 +136,16 @@ static void __init invoke_init_callbacks(void) } } -#ifndef CONFIG_SPARSEMEM -void __init page_ext_init_flatmem_late(void) -{ - invoke_init_callbacks(); -} -#endif - static inline struct page_ext *get_entry(void *base, unsigned long index) { return base + page_ext_size * 
index; } -/** - * page_ext_get() - Get the extended information for a page. - * @page: The page we're interested in. - * - * Ensures that the page_ext will remain valid until page_ext_put() - * is called. - * - * Return: NULL if no page_ext exists for this page. - * Context: Any context. Caller may not sleep until they have called - * page_ext_put(). - */ -struct page_ext *page_ext_get(struct page *page) -{ - struct page_ext *page_ext; - - rcu_read_lock(); - page_ext = lookup_page_ext(page); - if (!page_ext) { - rcu_read_unlock(); - return NULL; - } - - return page_ext; -} - -/** - * page_ext_put() - Working with page extended information is done. - * @page_ext: Page extended information received from page_ext_get(). - * - * The page extended information of the page may not be valid after this - * function is called. - * - * Return: None. - * Context: Any context with corresponding page_ext_get() is called. - */ -void page_ext_put(struct page_ext *page_ext) +#ifndef CONFIG_SPARSEMEM +void __init page_ext_init_flatmem_late(void) { - if (unlikely(!page_ext)) - return; - - rcu_read_unlock(); + invoke_init_callbacks(); } -#ifndef CONFIG_SPARSEMEM - void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) { @@ -536,3 +489,46 @@ void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) } #endif + +/** + * page_ext_get() - Get the extended information for a page. + * @page: The page we're interested in. + * + * Ensures that the page_ext will remain valid until page_ext_put() + * is called. + * + * Return: NULL if no page_ext exists for this page. + * Context: Any context. Caller may not sleep until they have called + * page_ext_put(). + */ +struct page_ext *page_ext_get(struct page *page) +{ + struct page_ext *page_ext; + + rcu_read_lock(); + page_ext = lookup_page_ext(page); + if (!page_ext) { + rcu_read_unlock(); + return NULL; + } + + return page_ext; +} + +/** + * page_ext_put() - Working with page extended information is done. 
+ * @page_ext: Page extended information received from page_ext_get(). + * + * The page extended information of the page may not be valid after this + * function is called. + * + * Return: None. + * Context: Any context with corresponding page_ext_get() is called. + */ +void page_ext_put(struct page_ext *page_ext) +{ + if (unlikely(!page_ext)) + return; + + rcu_read_unlock(); +} From efb78fa86e95832b78ca0ba60f3706788a818938 Mon Sep 17 00:00:00 2001 From: Andrew Donnellan Date: Fri, 14 Jul 2023 11:52:38 +1000 Subject: [PATCH 138/489] lib/test_meminit: allocate pages up to order MAX_ORDER test_pages() tests the page allocator by calling alloc_pages() with different orders up to order 10. However, different architectures and platforms support different maximum contiguous allocation sizes. The default maximum allocation order (MAX_ORDER) is 10, but architectures can use CONFIG_ARCH_FORCE_MAX_ORDER to override this. On platforms where this is less than 10, test_meminit() will blow up with a WARN(). This is expected, so let's not do that. Replace the hardcoded "10" with the MAX_ORDER macro so that we test allocations up to the expected platform limit. 
Link: https://lkml.kernel.org/r/20230714015238.47931-1-ajd@linux.ibm.com Fixes: 5015a300a522 ("lib: introduce test_meminit module") Signed-off-by: Andrew Donnellan Reviewed-by: Alexander Potapenko Cc: Xiaoke Wang Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton --- lib/test_meminit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/test_meminit.c b/lib/test_meminit.c index 60e1984c060fac..0ae35223d77335 100644 --- a/lib/test_meminit.c +++ b/lib/test_meminit.c @@ -93,7 +93,7 @@ static int __init test_pages(int *total_failures) int failures = 0, num_tests = 0; int i; - for (i = 0; i < 10; i++) + for (i = 0; i <= MAX_ORDER; i++) num_tests += do_alloc_pages_order(i, &failures); REPORT_FAILURES_IN_FN(); From 0b1f77e74b5a9234e83dc89f1593b769547a37fa Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:02 +0800 Subject: [PATCH 139/489] asm-generic/iomap.h: remove ARCH_HAS_IOREMAP_xx macros Patch series "mm: ioremap: Convert architectures to take GENERIC_IOREMAP way", v8. Motivation and implementation: ============================== Currently, many architectures haven't taken the standard GENERIC_IOREMAP way to implement ioremap_prot(), iounmap(), and ioremap_xx(), but make these functions specifically under each arch's folder. Those cause many duplicated code of ioremap() and iounmap(). In this patchset, firstly introduce generic_ioremap_prot() and generic_iounmap() to extract the generic code for GENERIC_IOREMAP. By taking GENERIC_IOREMAP method, the generic generic_ioremap_prot(), generic_iounmap(), and their generic wrapper ioremap_prot(), ioremap() and iounmap() are all visible and available to arch. Arch needs to provide wrapper functions to override the generic version if there's arch specific handling in its corresponding ioremap_prot(), ioremap() or iounmap(). With these changes, duplicated ioremap/iounmap() code under ARCH-es are removed, and the equivalent functionality is kept as before.
Background info: ================ 1) Converting more architectures to take GENERIC_IOREMAP way is suggested by Christoph in below discussion: https://lore.kernel.org/all/Yp7h0Jv6vpgt6xdZ@infradead.org/T/#u 2) In the previous v1 to v3, it's basically further action after arm64 has converted to GENERIC_IOREMAP way in below patchset. It's done by adding hook ioremap_allowed() and iounmap_allowed() in ARCH to add ARCH specific handling in the middle of ioremap_prot() and iounmap(). [PATCH v5 0/6] arm64: Cleanup ioremap() and support ioremap_prot() https://lore.kernel.org/all/20220607125027.44946-1-wangkefeng.wang@huawei.com/T/#u Later, during v3 reviewing, Christophe Leroy suggested to introduce generic_ioremap_prot() and generic_iounmap() to generic codes, and ARCH can provide wrapper function ioremap_prot(), ioremap() or iounmap() if needed. Christophe made a RFC patchset as below to specially demonstrate his idea. This is what v4 and now v5 is doing. [RFC PATCH 0/8] mm: ioremap: Convert architectures to take GENERIC_IOREMAP way https://lore.kernel.org/all/cover.1665568707.git.christophe.leroy@csgroup.eu/T/#u Testing: ======== In v8, I only applied this patchset onto the latest linus's tree to build and run on arm64 and s390. This patch (of 19): Let's use '#define ioremap_xx' and "#ifdef ioremap_xx" instead. To remove defined ARCH_HAS_IOREMAP_xx macros in <asm/io.h> of each ARCH, the ARCH's own ioremap_wc|wt|np definitions need to be above "#include <asm-generic/iomap.h>". Otherwise the redefinition error would be seen during compiling. So the relevant adjustments are made to avoid compiling error: loongarch: - doesn't include <asm-generic/iomap.h>, defining ARCH_HAS_IOREMAP_WC is redundant, so simply remove it. m68k: - selected GENERIC_IOMAP, <asm-generic/iomap.h> has been added in <asm-generic/io.h>, and <asm/kmap.h> is included above <asm-generic/iomap.h>, so simply remove ARCH_HAS_IOREMAP_WT defining. mips: - move "#include <asm-generic/iomap.h>" below ioremap_wc definition in <asm/io.h> powerpc: - remove "#include <asm-generic/iomap.h>" in <asm/io.h> because it's duplicated with the one in <asm-generic/io.h>, let's rely on the latter.
x86: - selected GENERIC_IOMAP, remove #include in the middle of . Let's rely on . Link: https://lkml.kernel.org/r/20230706154520.11257-2-bhe@redhat.com Signed-off-by: Baoquan He Acked-by: Geert Uytterhoeven Reviewed-by: Mike Rapoport (IBM) Reviewed-by: Christoph Hellwig Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Christophe Leroy Cc: David Laight Cc: Helge Deller Cc: John Paul Adrian Glaubitz Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Nathan Chancellor Cc: Niklas Schnelle Cc: Stafford Horne Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Chris Zankel Cc: Gerald Schaefer Cc: Heiko Carstens Cc: "James E.J. Bottomley" Cc: Jonas Bonn Cc: Max Filippov Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Rich Felker Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/loongarch/include/asm/io.h | 2 -- arch/m68k/include/asm/io_mm.h | 2 -- arch/m68k/include/asm/kmap.h | 2 -- arch/mips/include/asm/io.h | 5 ++--- arch/powerpc/include/asm/io.h | 9 +-------- arch/x86/include/asm/io.h | 5 ----- drivers/net/ethernet/sfc/io.h | 2 +- drivers/net/ethernet/sfc/siena/io.h | 2 +- include/asm-generic/iomap.h | 6 +++--- 9 files changed, 8 insertions(+), 27 deletions(-) diff --git a/arch/loongarch/include/asm/io.h b/arch/loongarch/include/asm/io.h index 1c94102200407f..0dcb36b32cb252 100644 --- a/arch/loongarch/include/asm/io.h +++ b/arch/loongarch/include/asm/io.h @@ -5,8 +5,6 @@ #ifndef _ASM_IO_H #define _ASM_IO_H -#define ARCH_HAS_IOREMAP_WC - #include #include diff --git a/arch/m68k/include/asm/io_mm.h b/arch/m68k/include/asm/io_mm.h index d41fa488453b88..6a0abd4846c682 100644 --- a/arch/m68k/include/asm/io_mm.h +++ b/arch/m68k/include/asm/io_mm.h @@ -26,8 +26,6 @@ #include #include -#include - #ifdef CONFIG_ATARI #define atari_readb raw_inb #define atari_writeb raw_outb diff --git a/arch/m68k/include/asm/kmap.h b/arch/m68k/include/asm/kmap.h index dec05743d42651..4efb3efa593a4f 
100644 --- a/arch/m68k/include/asm/kmap.h +++ b/arch/m68k/include/asm/kmap.h @@ -4,8 +4,6 @@ #ifdef CONFIG_MMU -#define ARCH_HAS_IOREMAP_WT - /* Values for nocacheflag and cmode */ #define IOMAP_FULL_CACHING 0 #define IOMAP_NOCACHE_SER 1 diff --git a/arch/mips/include/asm/io.h b/arch/mips/include/asm/io.h index affd21e9c20b42..062dd4e6b954e4 100644 --- a/arch/mips/include/asm/io.h +++ b/arch/mips/include/asm/io.h @@ -12,8 +12,6 @@ #ifndef _ASM_IO_H #define _ASM_IO_H -#define ARCH_HAS_IOREMAP_WC - #include #include #include @@ -25,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -210,6 +207,8 @@ void iounmap(const volatile void __iomem *addr); #define ioremap_wc(offset, size) \ ioremap_prot((offset), (size), boot_cpu_data.writecombine) +#include + #if defined(CONFIG_CPU_CAVIUM_OCTEON) #define war_io_reorder_wmb() wmb() #else diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index f1e657c9bbe8e8..67a3fb6de498ef 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ -3,11 +3,6 @@ #define _ASM_POWERPC_IO_H #ifdef __KERNEL__ -#define ARCH_HAS_IOREMAP_WC -#ifdef CONFIG_PPC32 -#define ARCH_HAS_IOREMAP_WT -#endif - /* */ @@ -732,9 +727,7 @@ static inline void name at \ #define writel_relaxed(v, addr) writel(v, addr) #define writeq_relaxed(v, addr) writeq(v, addr) -#ifdef CONFIG_GENERIC_IOMAP -#include -#else +#ifndef CONFIG_GENERIC_IOMAP /* * Here comes the implementation of the IOMAP interfaces. 
*/ diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index e9025640f634a3..76238842406a2f 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -35,9 +35,6 @@ * - Arnaldo Carvalho de Melo */ -#define ARCH_HAS_IOREMAP_WC -#define ARCH_HAS_IOREMAP_WT - #include #include #include @@ -212,8 +209,6 @@ void memset_io(volatile void __iomem *, int, size_t); #define memcpy_toio memcpy_toio #define memset_io memset_io -#include - /* * ISA space is 'always mapped' on a typical x86 system, no need to * explicitly ioremap() it. The fact that the ISA IO space is mapped diff --git a/drivers/net/ethernet/sfc/io.h b/drivers/net/ethernet/sfc/io.h index 30439cc83a8913..07f99ad14bf3d7 100644 --- a/drivers/net/ethernet/sfc/io.h +++ b/drivers/net/ethernet/sfc/io.h @@ -70,7 +70,7 @@ */ #ifdef CONFIG_X86_64 /* PIO is a win only if write-combining is possible */ -#ifdef ARCH_HAS_IOREMAP_WC +#ifdef ioremap_wc #define EFX_USE_PIO 1 #endif #endif diff --git a/drivers/net/ethernet/sfc/siena/io.h b/drivers/net/ethernet/sfc/siena/io.h index 30439cc83a8913..07f99ad14bf3d7 100644 --- a/drivers/net/ethernet/sfc/siena/io.h +++ b/drivers/net/ethernet/sfc/siena/io.h @@ -70,7 +70,7 @@ */ #ifdef CONFIG_X86_64 /* PIO is a win only if write-combining is possible */ -#ifdef ARCH_HAS_IOREMAP_WC +#ifdef ioremap_wc #define EFX_USE_PIO 1 #endif #endif diff --git a/include/asm-generic/iomap.h b/include/asm-generic/iomap.h index 08237ae8b840d8..196087a8126e5e 100644 --- a/include/asm-generic/iomap.h +++ b/include/asm-generic/iomap.h @@ -93,15 +93,15 @@ extern void __iomem *ioport_map(unsigned long port, unsigned int nr); extern void ioport_unmap(void __iomem *); #endif -#ifndef ARCH_HAS_IOREMAP_WC +#ifndef ioremap_wc #define ioremap_wc ioremap #endif -#ifndef ARCH_HAS_IOREMAP_WT +#ifndef ioremap_wt #define ioremap_wt ioremap #endif -#ifndef ARCH_HAS_IOREMAP_NP +#ifndef ioremap_np /* See the comment in asm-generic/io.h about ioremap_np(). 
*/ #define ioremap_np ioremap_np static inline void __iomem *ioremap_np(phys_addr_t offset, size_t size) From 5bd2cc56667d9357c040e1980811fcdade79837e Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:03 +0800 Subject: [PATCH 140/489] hexagon: mm: convert to GENERIC_IOREMAP By taking GENERIC_IOREMAP method, the generic ioremap_prot() and iounmap() are visible and available to arch. This change will simplify implementation by removing duplicated code with generic ioremap_prot() and iounmap(), and has the equivalent functionality. For hexagon, the current ioremap() and iounmap() are the same as generic version. After taking GENERIC_IOREMAP way, the old ioremap() and iounmap() can be completely removed. Link: https://lkml.kernel.org/r/20230706154520.11257-3-bhe@redhat.com Signed-off-by: Baoquan He Cc: Brian Cain Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J.
Bottomley" Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Mike Rapoport (IBM) Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Niklas Schnelle Cc: Rich Felker Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/hexagon/Kconfig | 1 + arch/hexagon/include/asm/io.h | 11 +++++--- arch/hexagon/kernel/hexagon_ksyms.c | 2 -- arch/hexagon/mm/Makefile | 2 +- arch/hexagon/mm/ioremap.c | 44 ----------------------------- 5 files changed, 9 insertions(+), 51 deletions(-) delete mode 100644 arch/hexagon/mm/ioremap.c diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index 6726f4941015f3..a880ee067d2ec2 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -25,6 +25,7 @@ config HEXAGON select NEED_SG_DMA_LENGTH select NO_IOPORT_MAP select GENERIC_IOMAP + select GENERIC_IOREMAP select GENERIC_SMP_IDLE_THREAD select STACKTRACE_SUPPORT select GENERIC_CLOCKEVENTS_BROADCAST diff --git a/arch/hexagon/include/asm/io.h b/arch/hexagon/include/asm/io.h index 46a099de85b7f3..e2b308e32a379d 100644 --- a/arch/hexagon/include/asm/io.h +++ b/arch/hexagon/include/asm/io.h @@ -27,8 +27,6 @@ extern int remap_area_pages(unsigned long start, unsigned long phys_addr, unsigned long end, unsigned long flags); -extern void iounmap(const volatile void __iomem *addr); - /* Defined in lib/io.c, needed for smc91x driver. */ extern void __raw_readsw(const void __iomem *addr, void *data, int wordlen); extern void __raw_writesw(void __iomem *addr, const void *data, int wordlen); @@ -170,8 +168,13 @@ static inline void writel(u32 data, volatile void __iomem *addr) #define writew_relaxed __raw_writew #define writel_relaxed __raw_writel -void __iomem *ioremap(unsigned long phys_addr, unsigned long size); -#define ioremap_uc(X, Y) ioremap((X), (Y)) +/* + * I/O memory mapping functions. 
+ */ +#define _PAGE_IOREMAP (_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | \ + (__HEXAGON_C_DEV << 6)) + +#define ioremap_uc(addr, size) ioremap((addr), (size)) #define __raw_writel writel diff --git a/arch/hexagon/kernel/hexagon_ksyms.c b/arch/hexagon/kernel/hexagon_ksyms.c index ec56ce2d92a291..36a80e31d18795 100644 --- a/arch/hexagon/kernel/hexagon_ksyms.c +++ b/arch/hexagon/kernel/hexagon_ksyms.c @@ -14,12 +14,10 @@ EXPORT_SYMBOL(__clear_user_hexagon); EXPORT_SYMBOL(raw_copy_from_user); EXPORT_SYMBOL(raw_copy_to_user); -EXPORT_SYMBOL(iounmap); EXPORT_SYMBOL(__vmgetie); EXPORT_SYMBOL(__vmsetie); EXPORT_SYMBOL(__vmyield); EXPORT_SYMBOL(empty_zero_page); -EXPORT_SYMBOL(ioremap); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(memset); diff --git a/arch/hexagon/mm/Makefile b/arch/hexagon/mm/Makefile index 49911a906fd005..ba4b04d962d6b8 100644 --- a/arch/hexagon/mm/Makefile +++ b/arch/hexagon/mm/Makefile @@ -3,5 +3,5 @@ # Makefile for Hexagon memory management subsystem # -obj-y := init.o ioremap.o uaccess.o vm_fault.o cache.o +obj-y := init.o uaccess.o vm_fault.o cache.o obj-y += copy_to_user.o copy_from_user.o vm_tlb.o diff --git a/arch/hexagon/mm/ioremap.c b/arch/hexagon/mm/ioremap.c deleted file mode 100644 index 255c5b1ee1a711..00000000000000 --- a/arch/hexagon/mm/ioremap.c +++ /dev/null @@ -1,44 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * I/O remap functions for Hexagon - * - * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved. 
- */ - -#include -#include -#include - -void __iomem *ioremap(unsigned long phys_addr, unsigned long size) -{ - unsigned long last_addr, addr; - unsigned long offset = phys_addr & ~PAGE_MASK; - struct vm_struct *area; - - pgprot_t prot = __pgprot(_PAGE_PRESENT|_PAGE_READ|_PAGE_WRITE - |(__HEXAGON_C_DEV << 6)); - - last_addr = phys_addr + size - 1; - - /* Wrapping not allowed */ - if (!size || (last_addr < phys_addr)) - return NULL; - - /* Rounds up to next page size, including whole-page offset */ - size = PAGE_ALIGN(offset + size); - - area = get_vm_area(size, VM_IOREMAP); - addr = (unsigned long)area->addr; - - if (ioremap_page_range(addr, addr+size, phys_addr, prot)) { - vunmap((void *)addr); - return NULL; - } - - return (void __iomem *) (offset + addr); -} - -void iounmap(const volatile void __iomem *addr) -{ - vunmap((void *) ((unsigned long) addr & PAGE_MASK)); -} From 53c98e35dcbcacdb8e5c4d9c8fd6dfa8962af5c7 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:04 +0800 Subject: [PATCH 141/489] openrisc: mm: remove unneeded early ioremap code Under arch/openrisc, there isn't any place where ioremap() is called. It means that there isn't early ioremap handling needed in openrisc, So the early ioremap handling code in ioremap() of arch/openrisc/mm/ioremap.c is unnecessary and can be removed. And also remove the special handling in iounmap() since no page is got from fixmap pool along with early ioremap code removing in ioremap(). 
Link: https://lore.kernel.org/linux-mm/YwxfxKrTUtAuejKQ@oscomms1/ Link: https://lkml.kernel.org/r/20230706154520.11257-4-bhe@redhat.com Signed-off-by: Baoquan He Acked-by: Stafford Horne Reviewed-by: Christoph Hellwig Reviewed-by: Mike Rapoport (IBM) Cc: Jonas Bonn Cc: Stefan Kristiansson Cc: Stafford Horne Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: John Paul Adrian Glaubitz Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Niklas Schnelle Cc: Rich Felker Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/openrisc/mm/ioremap.c | 43 +++++--------------------------------- 1 file changed, 5 insertions(+), 38 deletions(-) diff --git a/arch/openrisc/mm/ioremap.c b/arch/openrisc/mm/ioremap.c index 8ec0dafecf257e..cdbcc7e73684fb 100644 --- a/arch/openrisc/mm/ioremap.c +++ b/arch/openrisc/mm/ioremap.c @@ -22,8 +22,6 @@ extern int mem_init_done; -static unsigned int fixmaps_used __initdata; - /* * Remap an arbitrary physical address space into the kernel virtual * address space. 
Needed when the kernel wants to access high addresses @@ -52,24 +50,14 @@ void __iomem *__ref ioremap(phys_addr_t addr, unsigned long size) p = addr & PAGE_MASK; size = PAGE_ALIGN(last_addr + 1) - p; - if (likely(mem_init_done)) { - area = get_vm_area(size, VM_IOREMAP); - if (!area) - return NULL; - v = (unsigned long)area->addr; - } else { - if ((fixmaps_used + (size >> PAGE_SHIFT)) > FIX_N_IOREMAPS) - return NULL; - v = fix_to_virt(FIX_IOREMAP_BEGIN + fixmaps_used); - fixmaps_used += (size >> PAGE_SHIFT); - } + area = get_vm_area(size, VM_IOREMAP); + if (!area) + return NULL; + v = (unsigned long)area->addr; if (ioremap_page_range(v, v + size, p, __pgprot(pgprot_val(PAGE_KERNEL) | _PAGE_CI))) { - if (likely(mem_init_done)) - vfree(area->addr); - else - fixmaps_used -= (size >> PAGE_SHIFT); + vfree(area->addr); return NULL; } @@ -79,27 +67,6 @@ EXPORT_SYMBOL(ioremap); void iounmap(volatile void __iomem *addr) { - /* If the page is from the fixmap pool then we just clear out - * the fixmap mapping. - */ - if (unlikely((unsigned long)addr > FIXADDR_START)) { - /* This is a bit broken... we don't really know - * how big the area is so it's difficult to know - * how many fixed pages to invalidate... - * just flush tlb and hope for the best... - * consider this a FIXME - * - * Really we should be clearing out one or more page - * table entries for these virtual addresses so that - * future references cause a page fault... 
for now, we - * rely on two things: - * i) this code never gets called on known boards - * ii) invalid accesses to the freed areas aren't made - */ - flush_tlb_all(); - return; - } - return vfree((void *)(PAGE_MASK & (unsigned long)addr)); } EXPORT_SYMBOL(iounmap); From 7613366a190202a8ebe8090ca4758b551f1b7feb Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 6 Jul 2023 23:45:05 +0800 Subject: [PATCH 142/489] mm/ioremap: define generic_ioremap_prot() and generic_iounmap() Define a generic version of ioremap_prot() and iounmap() that architectures can call after they have performed the necessary alteration to parameters and/or necessary verifications. Link: https://lkml.kernel.org/r/20230706154520.11257-5-bhe@redhat.com Signed-off-by: Christophe Leroy Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Reviewed-by: Kefeng Wang Reviewed-by: Mike Rapoport (IBM) Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. 
Bottomley" Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Niklas Schnelle Cc: Rich Felker Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/asm-generic/io.h | 4 ++++ mm/ioremap.c | 22 ++++++++++++++++------ 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h index 587e7e9b9a375c..a7ca2099ba195a 100644 --- a/include/asm-generic/io.h +++ b/include/asm-generic/io.h @@ -1073,9 +1073,13 @@ static inline bool iounmap_allowed(void *addr) } #endif +void __iomem *generic_ioremap_prot(phys_addr_t phys_addr, size_t size, + pgprot_t prot); + void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, unsigned long prot); void iounmap(volatile void __iomem *addr); +void generic_iounmap(volatile void __iomem *addr); static inline void __iomem *ioremap(phys_addr_t addr, size_t size) { diff --git a/mm/ioremap.c b/mm/ioremap.c index 8652426282cc5a..db6234b9db59a5 100644 --- a/mm/ioremap.c +++ b/mm/ioremap.c @@ -11,8 +11,8 @@ #include #include -void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, - unsigned long prot) +void __iomem *generic_ioremap_prot(phys_addr_t phys_addr, size_t size, + pgprot_t prot) { unsigned long offset, vaddr; phys_addr_t last_addr; @@ -28,7 +28,7 @@ void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, phys_addr -= offset; size = PAGE_ALIGN(size + offset); - if (!ioremap_allowed(phys_addr, size, prot)) + if (!ioremap_allowed(phys_addr, size, pgprot_val(prot))) return NULL; area = get_vm_area_caller(size, VM_IOREMAP, @@ -38,17 +38,22 @@ void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, vaddr = (unsigned long)area->addr; area->phys_addr = phys_addr; - if (ioremap_page_range(vaddr, vaddr + size, phys_addr, - __pgprot(prot))) { + if 
(ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) { free_vm_area(area); return NULL; } return (void __iomem *)(vaddr + offset); } + +void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, + unsigned long prot) +{ + return generic_ioremap_prot(phys_addr, size, __pgprot(prot)); +} EXPORT_SYMBOL(ioremap_prot); -void iounmap(volatile void __iomem *addr) +void generic_iounmap(volatile void __iomem *addr) { void *vaddr = (void *)((unsigned long)addr & PAGE_MASK); @@ -58,4 +63,9 @@ void iounmap(volatile void __iomem *addr) if (is_vmalloc_addr(vaddr)) vunmap(vaddr); } + +void iounmap(volatile void __iomem *addr) +{ + generic_iounmap(addr); +} EXPORT_SYMBOL(iounmap); From dfdc6ba95768b4935058dcf2a8a09874289ba88f Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:06 +0800 Subject: [PATCH 143/489] mm: ioremap: allow ARCH to have its own ioremap method definition Architectures can be converted to GENERIC_IOREMAP, to take standard ioremap_xxx() and iounmap() way. But some ARCH-es could have specific handling for ioremap_prot(), ioremap() and iounmap(), than standard methods. In order to convert these ARCH-es to take GENERIC_IOREMAP method, allow these architectures to have their own ioremap_prot(), ioremap() and iounmap() definitions. Link: https://lkml.kernel.org/r/20230706154520.11257-6-bhe@redhat.com Signed-off-by: Baoquan He Acked-by: Arnd Bergmann Reviewed-by: Christoph Hellwig Reviewed-by: Kefeng Wang Reviewed-by: Mike Rapoport (IBM) Cc: Alexander Gordeev Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. 
Bottomley" Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Niklas Schnelle Cc: Rich Felker Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/asm-generic/io.h | 3 +++ mm/ioremap.c | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h index a7ca2099ba195a..39244c3ee79768 100644 --- a/include/asm-generic/io.h +++ b/include/asm-generic/io.h @@ -1081,11 +1081,14 @@ void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, void iounmap(volatile void __iomem *addr); void generic_iounmap(volatile void __iomem *addr); +#ifndef ioremap +#define ioremap ioremap static inline void __iomem *ioremap(phys_addr_t addr, size_t size) { /* _PAGE_IOREMAP needs to be supplied by the architecture */ return ioremap_prot(addr, size, _PAGE_IOREMAP); } +#endif #endif /* !CONFIG_MMU || CONFIG_GENERIC_IOREMAP */ #ifndef ioremap_wc diff --git a/mm/ioremap.c b/mm/ioremap.c index db6234b9db59a5..9f34a8f90b5890 100644 --- a/mm/ioremap.c +++ b/mm/ioremap.c @@ -46,12 +46,14 @@ void __iomem *generic_ioremap_prot(phys_addr_t phys_addr, size_t size, return (void __iomem *)(vaddr + offset); } +#ifndef ioremap_prot void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, unsigned long prot) { return generic_ioremap_prot(phys_addr, size, __pgprot(prot)); } EXPORT_SYMBOL(ioremap_prot); +#endif void generic_iounmap(volatile void __iomem *addr) { @@ -64,8 +66,10 @@ void generic_iounmap(volatile void __iomem *addr) vunmap(vaddr); } +#ifndef iounmap void iounmap(volatile void __iomem *addr) { generic_iounmap(addr); } EXPORT_SYMBOL(iounmap); +#endif From a5f6164831104a7441a0c4101c5b74cd59fbdfa6 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:07 +0800 Subject: [PATCH 144/489] mm/ioremap: add slab availability 
checking in ioremap_prot Several architectures have done checking if slab is available in ioremap_prot(). In fact it should be done in generic ioremap_prot() since on any architecture, slab allocator must be available before get_vm_area_caller() and vunmap() are used. Add the checking into generic_ioremap_prot(). Link: https://lkml.kernel.org/r/20230706154520.11257-7-bhe@redhat.com Suggested-by: Christophe Leroy Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Reviewed-by: Kefeng Wang Reviewed-by: Mike Rapoport (IBM) Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Niklas Schnelle Cc: Rich Felker Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- mm/ioremap.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/ioremap.c b/mm/ioremap.c index 9f34a8f90b5890..86b82ec27d2bb4 100644 --- a/mm/ioremap.c +++ b/mm/ioremap.c @@ -18,6 +18,10 @@ void __iomem *generic_ioremap_prot(phys_addr_t phys_addr, size_t size, phys_addr_t last_addr; struct vm_struct *area; + /* An early platform driver might end up here */ + if (WARN_ON_ONCE(!slab_is_available())) + return NULL; + /* Disallow wrap-around or zero size */ last_addr = phys_addr + size - 1; if (!size || last_addr < phys_addr) From 06dfae39d20091f3c8aa995f00505e1e148b0b3f Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:08 +0800 Subject: [PATCH 145/489] arc: mm: convert to GENERIC_IOREMAP By taking GENERIC_IOREMAP method, the generic generic_ioremap_prot(), generic_iounmap(), and their generic wrapper ioremap_prot(), ioremap() and iounmap() are all 
visible and available to arch. Arch needs to provide wrapper functions to override the generic versions if there's arch specific handling in its ioremap_prot(), ioremap() or iounmap(). This change will simplify implementation by removing duplicated code with generic_ioremap_prot() and generic_iounmap(), and has the equivalent functioality as before. Here, add wrapper functions ioremap_prot() and iounmap() for arc's special operation when ioremap_prot() and iounmap(). Link: https://lkml.kernel.org/r/20230706154520.11257-8-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Reviewed-by: Mike Rapoport (IBM) Cc: Vineet Gupta Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Niklas Schnelle Cc: Rich Felker Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/arc/Kconfig | 1 + arch/arc/include/asm/io.h | 7 +++--- arch/arc/mm/ioremap.c | 49 ++++----------------------------------- 3 files changed, 8 insertions(+), 49 deletions(-) diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 96cf8720bb9391..6f4995ad98737a 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -26,6 +26,7 @@ config ARC select GENERIC_PENDING_IRQ if SMP select GENERIC_SCHED_CLOCK select GENERIC_SMP_IDLE_THREAD + select GENERIC_IOREMAP select HAVE_ARCH_KGDB select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE if ARC_MMU_V4 diff --git a/arch/arc/include/asm/io.h b/arch/arc/include/asm/io.h index 80347382a38009..4fdb7350636c32 100644 --- a/arch/arc/include/asm/io.h +++ b/arch/arc/include/asm/io.h @@ 
-21,8 +21,9 @@ #endif extern void __iomem *ioremap(phys_addr_t paddr, unsigned long size); -extern void __iomem *ioremap_prot(phys_addr_t paddr, unsigned long size, - unsigned long flags); +#define ioremap ioremap +#define ioremap_prot ioremap_prot +#define iounmap iounmap static inline void __iomem *ioport_map(unsigned long port, unsigned int nr) { return (void __iomem *)port; @@ -32,8 +33,6 @@ static inline void ioport_unmap(void __iomem *addr) { } -extern void iounmap(const volatile void __iomem *addr); - /* * io{read,write}{16,32}be() macros */ diff --git a/arch/arc/mm/ioremap.c b/arch/arc/mm/ioremap.c index 712c2311daefb5..b07004d5326782 100644 --- a/arch/arc/mm/ioremap.c +++ b/arch/arc/mm/ioremap.c @@ -8,7 +8,6 @@ #include #include #include -#include #include static inline bool arc_uncached_addr_space(phys_addr_t paddr) @@ -25,13 +24,6 @@ static inline bool arc_uncached_addr_space(phys_addr_t paddr) void __iomem *ioremap(phys_addr_t paddr, unsigned long size) { - phys_addr_t end; - - /* Don't allow wraparound or zero size */ - end = paddr + size - 1; - if (!size || (end < paddr)) - return NULL; - /* * If the region is h/w uncached, MMU mapping can be elided as optim * The cast to u32 is fine as this region can only be inside 4GB @@ -51,55 +43,22 @@ EXPORT_SYMBOL(ioremap); * ARC hardware uncached region, this one still goes thru the MMU as caller * might need finer access control (R/W/X) */ -void __iomem *ioremap_prot(phys_addr_t paddr, unsigned long size, +void __iomem *ioremap_prot(phys_addr_t paddr, size_t size, unsigned long flags) { - unsigned int off; - unsigned long vaddr; - struct vm_struct *area; - phys_addr_t end; pgprot_t prot = __pgprot(flags); - /* Don't allow wraparound, zero size */ - end = paddr + size - 1; - if ((!size) || (end < paddr)) - return NULL; - - /* An early platform driver might end up here */ - if (!slab_is_available()) - return NULL; - /* force uncached */ - prot = pgprot_noncached(prot); - - /* Mappings have to be page-aligned */ 
- off = paddr & ~PAGE_MASK; - paddr &= PAGE_MASK_PHYS; - size = PAGE_ALIGN(end + 1) - paddr; - - /* - * Ok, go for it.. - */ - area = get_vm_area(size, VM_IOREMAP); - if (!area) - return NULL; - area->phys_addr = paddr; - vaddr = (unsigned long)area->addr; - if (ioremap_page_range(vaddr, vaddr + size, paddr, prot)) { - vunmap((void __force *)vaddr); - return NULL; - } - return (void __iomem *)(off + (char __iomem *)vaddr); + return generic_ioremap_prot(paddr, size, pgprot_noncached(prot)); } EXPORT_SYMBOL(ioremap_prot); - -void iounmap(const volatile void __iomem *addr) +void iounmap(volatile void __iomem *addr) { /* weird double cast to handle phys_addr_t > 32 bits */ if (arc_uncached_addr_space((phys_addr_t)(u32)addr)) return; - vfree((void *)(PAGE_MASK & (unsigned long __force)addr)); + generic_iounmap(addr); } EXPORT_SYMBOL(iounmap); From 38d110aba3c4f3ab9a2bdf7862ae11afe5c96c43 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:09 +0800 Subject: [PATCH 146/489] ia64: mm: convert to GENERIC_IOREMAP By taking GENERIC_IOREMAP method, the generic generic_ioremap_prot(), generic_iounmap(), and their generic wrapper ioremap_prot(), ioremap() and iounmap() are all visible and available to arch. Arch needs to provide wrapper functions to override the generic versions if there's arch specific handling in its ioremap_prot(), ioremap() or iounmap(). This change will simplify implementation by removing duplicated code with generic_ioremap_prot() and generic_iounmap(), and has the equivalent functioality as before. Here, add wrapper functions ioremap_prot() and iounmap() for ia64's special operation when ioremap() and iounmap(). 
Link: https://lkml.kernel.org/r/20230706154520.11257-9-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Reviewed-by: Mike Rapoport (IBM) Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Niklas Schnelle Cc: Rich Felker Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/ia64/Kconfig | 1 + arch/ia64/include/asm/io.h | 13 +++++------- arch/ia64/mm/ioremap.c | 41 ++++++-------------------------------- 3 files changed, 12 insertions(+), 43 deletions(-) diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 2cd93e6bf0fec6..3ab75f36c037fc 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -47,6 +47,7 @@ config IA64 select GENERIC_IRQ_LEGACY select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_IOMAP + select GENERIC_IOREMAP select GENERIC_SMP_IDLE_THREAD select ARCH_TASK_STRUCT_ON_STACK select ARCH_TASK_STRUCT_ALLOCATOR diff --git a/arch/ia64/include/asm/io.h b/arch/ia64/include/asm/io.h index 83a492c8d2985e..eedc0afa8cad30 100644 --- a/arch/ia64/include/asm/io.h +++ b/arch/ia64/include/asm/io.h @@ -243,15 +243,12 @@ static inline void outsl(unsigned long port, const void *src, # ifdef __KERNEL__ -extern void __iomem * ioremap(unsigned long offset, unsigned long size); +#define _PAGE_IOREMAP pgprot_val(PAGE_KERNEL) + extern void __iomem * ioremap_uc(unsigned long offset, unsigned long size); -extern void iounmap (volatile void __iomem *addr); -static inline void __iomem * ioremap_cache (unsigned long phys_addr, unsigned long size) -{ - return ioremap(phys_addr, 
size); -} -#define ioremap ioremap -#define ioremap_cache ioremap_cache + +#define ioremap_prot ioremap_prot +#define ioremap_cache ioremap #define ioremap_uc ioremap_uc #define iounmap iounmap diff --git a/arch/ia64/mm/ioremap.c b/arch/ia64/mm/ioremap.c index 92b81bc91397f7..711b6abc822eb6 100644 --- a/arch/ia64/mm/ioremap.c +++ b/arch/ia64/mm/ioremap.c @@ -29,13 +29,9 @@ early_ioremap (unsigned long phys_addr, unsigned long size) return __ioremap_uc(phys_addr); } -void __iomem * -ioremap (unsigned long phys_addr, unsigned long size) +void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, + unsigned long flags) { - void __iomem *addr; - struct vm_struct *area; - unsigned long offset; - pgprot_t prot; u64 attr; unsigned long gran_base, gran_size; unsigned long page_base; @@ -68,36 +64,12 @@ ioremap (unsigned long phys_addr, unsigned long size) */ page_base = phys_addr & PAGE_MASK; size = PAGE_ALIGN(phys_addr + size) - page_base; - if (efi_mem_attribute(page_base, size) & EFI_MEMORY_WB) { - prot = PAGE_KERNEL; - - /* - * Mappings have to be page-aligned - */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - - /* - * Ok, go for it.. 
- */ - area = get_vm_area(size, VM_IOREMAP); - if (!area) - return NULL; - - area->phys_addr = phys_addr; - addr = (void __iomem *) area->addr; - if (ioremap_page_range((unsigned long) addr, - (unsigned long) addr + size, phys_addr, prot)) { - vunmap((void __force *) addr); - return NULL; - } - - return (void __iomem *) (offset + (char __iomem *)addr); - } + if (efi_mem_attribute(page_base, size) & EFI_MEMORY_WB) + return generic_ioremap_prot(phys_addr, size, __pgprot(flags)); return __ioremap_uc(phys_addr); } -EXPORT_SYMBOL(ioremap); +EXPORT_SYMBOL(ioremap_prot); void __iomem * ioremap_uc(unsigned long phys_addr, unsigned long size) @@ -114,8 +86,7 @@ early_iounmap (volatile void __iomem *addr, unsigned long size) { } -void -iounmap (volatile void __iomem *addr) +void iounmap(volatile void __iomem *addr) { if (REGION_NUMBER(addr) == RGN_GATE) vunmap((void *) ((unsigned long) addr & PAGE_MASK)); From 9b994429fe1808d0b2caa85f3afcf88e007e2e79 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:10 +0800 Subject: [PATCH 147/489] openrisc: mm: convert to GENERIC_IOREMAP By taking GENERIC_IOREMAP method, the generic generic_ioremap_prot(), generic_iounmap(), and their generic wrapper ioremap_prot(), ioremap() and iounmap() are all visible and available to arch. Arch needs to provide wrapper functions to override the generic versions if there's arch specific handling in its ioremap_prot(), ioremap() or iounmap(). This change will simplify implementation by removing duplicated code with generic_ioremap_prot() and generic_iounmap(), and has the equivalent functioality as before. For openrisc, the current ioremap() and iounmap() are the same as generic version. After taking GENERIC_IOREMAP way, the old ioremap() and iounmap() can be completely removed. 
Link: https://lkml.kernel.org/r/20230706154520.11257-10-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Reviewed-by: Mike Rapoport (IBM) Cc: Stafford Horne Cc: Jonas Bonn Cc: Stefan Kristiansson Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: John Paul Adrian Glaubitz Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Niklas Schnelle Cc: Rich Felker Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/openrisc/Kconfig | 1 + arch/openrisc/include/asm/io.h | 11 ++++---- arch/openrisc/mm/ioremap.c | 49 ---------------------------------- 3 files changed, 7 insertions(+), 54 deletions(-) diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index c7f282f60f64c5..fd9bb76a610bf0 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -21,6 +21,7 @@ config OPENRISC select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW select GENERIC_PCI_IOMAP + select GENERIC_IOREMAP select GENERIC_CPU_DEVICES select HAVE_PCI select HAVE_UID16 diff --git a/arch/openrisc/include/asm/io.h b/arch/openrisc/include/asm/io.h index ee6043a03173de..5a6f0f16a5ce5b 100644 --- a/arch/openrisc/include/asm/io.h +++ b/arch/openrisc/include/asm/io.h @@ -15,6 +15,8 @@ #define __ASM_OPENRISC_IO_H #include +#include +#include /* * PCI: We do not use IO ports in OpenRISC @@ -27,11 +29,10 @@ #define PIO_OFFSET 0 #define PIO_MASK 0 -#define ioremap ioremap -void __iomem *ioremap(phys_addr_t offset, unsigned long size); - -#define iounmap iounmap -extern void iounmap(volatile void __iomem *addr); +/* + * I/O memory mapping functions. 
+ */ +#define _PAGE_IOREMAP (pgprot_val(PAGE_KERNEL) | _PAGE_CI) #include diff --git a/arch/openrisc/mm/ioremap.c b/arch/openrisc/mm/ioremap.c index cdbcc7e73684fb..91c8259d4b7ed6 100644 --- a/arch/openrisc/mm/ioremap.c +++ b/arch/openrisc/mm/ioremap.c @@ -22,55 +22,6 @@ extern int mem_init_done; -/* - * Remap an arbitrary physical address space into the kernel virtual - * address space. Needed when the kernel wants to access high addresses - * directly. - * - * NOTE! We need to allow non-page-aligned mappings too: we will obviously - * have to convert them into an offset in a page-aligned mapping, but the - * caller shouldn't need to know that small detail. - */ -void __iomem *__ref ioremap(phys_addr_t addr, unsigned long size) -{ - phys_addr_t p; - unsigned long v; - unsigned long offset, last_addr; - struct vm_struct *area = NULL; - - /* Don't allow wraparound or zero size */ - last_addr = addr + size - 1; - if (!size || last_addr < addr) - return NULL; - - /* - * Mappings have to be page-aligned - */ - offset = addr & ~PAGE_MASK; - p = addr & PAGE_MASK; - size = PAGE_ALIGN(last_addr + 1) - p; - - area = get_vm_area(size, VM_IOREMAP); - if (!area) - return NULL; - v = (unsigned long)area->addr; - - if (ioremap_page_range(v, v + size, p, - __pgprot(pgprot_val(PAGE_KERNEL) | _PAGE_CI))) { - vfree(area->addr); - return NULL; - } - - return (void __iomem *)(offset + (char *)v); -} -EXPORT_SYMBOL(ioremap); - -void iounmap(volatile void __iomem *addr) -{ - return vfree((void *)(PAGE_MASK & (unsigned long)addr)); -} -EXPORT_SYMBOL(iounmap); - /** * OK, this one's a bit tricky... 
ioremap can get called before memory is * initialized (early serial console does this) and will want to alloc a page From b43b3fff042d08a0bcc0a6d87c3216b44860298e Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:11 +0800 Subject: [PATCH 148/489] s390: mm: convert to GENERIC_IOREMAP By taking GENERIC_IOREMAP method, the generic generic_ioremap_prot(), generic_iounmap(), and their generic wrapper ioremap_prot(), ioremap() and iounmap() are all visible and available to arch. Arch needs to provide wrapper functions to override the generic versions if there's arch specific handling in its ioremap_prot(), ioremap() or iounmap(). This change will simplify implementation by removing duplicated code with generic_ioremap_prot() and generic_iounmap(), and has the equivalent functioality as before. Here, add wrapper functions ioremap_prot() and iounmap() for s390's special operation when ioremap() and iounmap(). And also replace including with in arch/s390/kernel/perf_cpum_sf.c, otherwise building error will be seen because macro defined in can't be seen in perf_cpum_sf.c. Link: https://lkml.kernel.org/r/20230706154520.11257-11-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Niklas Schnelle Tested-by: Niklas Schnelle Reviewed-by: Christoph Hellwig Reviewed-by: Mike Rapoport (IBM) Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Sven Schnelle Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Helge Deller Cc: "James E.J. 
Bottomley" Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Rich Felker Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/s390/Kconfig | 1 + arch/s390/include/asm/io.h | 21 ++++++++------ arch/s390/pci/pci.c | 57 +++++++------------------------------- 3 files changed, 23 insertions(+), 56 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 5b39918b7042b0..290b6f93b81628 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -143,6 +143,7 @@ config S390 select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL select GENERIC_VDSO_TIME_NS + select GENERIC_IOREMAP if PCI select HAVE_ALIGNED_STRUCT_PAGE if SLUB select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_JUMP_LABEL diff --git a/arch/s390/include/asm/io.h b/arch/s390/include/asm/io.h index e3882b012bfa48..4453ad7c11aced 100644 --- a/arch/s390/include/asm/io.h +++ b/arch/s390/include/asm/io.h @@ -22,11 +22,18 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); #define IO_SPACE_LIMIT 0 -void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long prot); -void __iomem *ioremap(phys_addr_t addr, size_t size); -void __iomem *ioremap_wc(phys_addr_t addr, size_t size); -void __iomem *ioremap_wt(phys_addr_t addr, size_t size); -void iounmap(volatile void __iomem *addr); +/* + * I/O memory mapping functions. 
+ */ +#define ioremap_prot ioremap_prot +#define iounmap iounmap + +#define _PAGE_IOREMAP pgprot_val(PAGE_KERNEL) + +#define ioremap_wc(addr, size) \ + ioremap_prot((addr), (size), pgprot_val(pgprot_writecombine(PAGE_KERNEL))) +#define ioremap_wt(addr, size) \ + ioremap_prot((addr), (size), pgprot_val(pgprot_writethrough(PAGE_KERNEL))) static inline void __iomem *ioport_map(unsigned long port, unsigned int nr) { @@ -51,10 +58,6 @@ static inline void ioport_unmap(void __iomem *p) #define pci_iomap_wc pci_iomap_wc #define pci_iomap_wc_range pci_iomap_wc_range -#define ioremap ioremap -#define ioremap_wt ioremap_wt -#define ioremap_wc ioremap_wc - #define memcpy_fromio(dst, src, count) zpci_memcpy_fromio(dst, src, count) #define memcpy_toio(dst, src, count) zpci_memcpy_toio(dst, src, count) #define memset_io(dst, val, count) zpci_memset_io(dst, val, count) diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index afc3f33788da95..d34d5813d00660 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -244,62 +244,25 @@ void __iowrite64_copy(void __iomem *to, const void *from, size_t count) zpci_memcpy_toio(to, from, count); } -static void __iomem *__ioremap(phys_addr_t addr, size_t size, pgprot_t prot) +void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, + unsigned long prot) { - unsigned long offset, vaddr; - struct vm_struct *area; - phys_addr_t last_addr; - - last_addr = addr + size - 1; - if (!size || last_addr < addr) - return NULL; - + /* + * When PCI MIO instructions are unavailable the "physical" address + * encodes a hint for accessing the PCI memory space it represents. + * Just pass it unchanged such that ioread/iowrite can decode it. 
+ */ if (!static_branch_unlikely(&have_mio)) - return (void __iomem *) addr; + return (void __iomem *)phys_addr; - offset = addr & ~PAGE_MASK; - addr &= PAGE_MASK; - size = PAGE_ALIGN(size + offset); - area = get_vm_area(size, VM_IOREMAP); - if (!area) - return NULL; - - vaddr = (unsigned long) area->addr; - if (ioremap_page_range(vaddr, vaddr + size, addr, prot)) { - free_vm_area(area); - return NULL; - } - return (void __iomem *) ((unsigned long) area->addr + offset); -} - -void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long prot) -{ - return __ioremap(addr, size, __pgprot(prot)); + return generic_ioremap_prot(phys_addr, size, __pgprot(prot)); } EXPORT_SYMBOL(ioremap_prot); -void __iomem *ioremap(phys_addr_t addr, size_t size) -{ - return __ioremap(addr, size, PAGE_KERNEL); -} -EXPORT_SYMBOL(ioremap); - -void __iomem *ioremap_wc(phys_addr_t addr, size_t size) -{ - return __ioremap(addr, size, pgprot_writecombine(PAGE_KERNEL)); -} -EXPORT_SYMBOL(ioremap_wc); - -void __iomem *ioremap_wt(phys_addr_t addr, size_t size) -{ - return __ioremap(addr, size, pgprot_writethrough(PAGE_KERNEL)); -} -EXPORT_SYMBOL(ioremap_wt); - void iounmap(volatile void __iomem *addr) { if (static_branch_likely(&have_mio)) - vunmap((__force void *) ((unsigned long) addr & PAGE_MASK)); + generic_iounmap(addr); } EXPORT_SYMBOL(iounmap); From b94692e84dccf12dd30839c4e97b9ba028036a8f Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:12 +0800 Subject: [PATCH 149/489] sh: add <asm-generic/io.h> including In <asm-generic/io.h>, it provides a generic implementation of all I/O accessors. For some port|mm io functions, SuperH has its own implementation in arch/sh/kernel/iomap.c and arch/sh/include/asm/io_noioport.h. These will conflict with those in <asm-generic/io.h> and cause compiling error. Hence add macro definitions to ensure that the SuperH version of them will override the generic version. 
[arnd@arndb.de: fix asm-generic/io.h inclusion] Link: https://lkml.kernel.org/r/20230802141658.2064864-1-arnd@kernel.org Link: https://lkml.kernel.org/r/20230706154520.11257-12-bhe@redhat.com Signed-off-by: Baoquan He Signed-off-by: Arnd Bergmann Cc: John Paul Adrian Glaubitz Cc: Yoshinori Sato Cc: Rich Felker Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: Jonas Bonn Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Mike Rapoport (IBM) Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Niklas Schnelle Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/sh/include/asm/io.h | 49 +++++++++++++++++++++++++++++++ arch/sh/include/asm/io_noioport.h | 7 ----- 2 files changed, 49 insertions(+), 7 deletions(-) diff --git a/arch/sh/include/asm/io.h b/arch/sh/include/asm/io.h index d8f3537ef57f9e..d8a0daf33ab594 100644 --- a/arch/sh/include/asm/io.h +++ b/arch/sh/include/asm/io.h @@ -119,6 +119,30 @@ void __raw_readsl(const void __iomem *addr, void *data, int longlen); __BUILD_MEMORY_STRING(__raw_, q, u64) +#define ioport_map ioport_map +#define ioport_unmap ioport_unmap +#define pci_iounmap pci_iounmap + +#define ioread8 ioread8 +#define ioread16 ioread16 +#define ioread16be ioread16be +#define ioread32 ioread32 +#define ioread32be ioread32be + +#define iowrite8 iowrite8 +#define iowrite16 iowrite16 +#define iowrite16be iowrite16be +#define iowrite32 iowrite32 +#define iowrite32be iowrite32be + +#define ioread8_rep ioread8_rep +#define ioread16_rep ioread16_rep +#define ioread32_rep ioread32_rep + +#define iowrite8_rep iowrite8_rep +#define iowrite16_rep iowrite16_rep +#define iowrite32_rep iowrite32_rep + #ifdef 
CONFIG_HAS_IOPORT_MAP /* @@ -221,10 +245,33 @@ __BUILD_IOPORT_STRING(q, u64) #endif +#define inb(addr) inb(addr) +#define inw(addr) inw(addr) +#define inl(addr) inl(addr) +#define outb(x, addr) outb((x), (addr)) +#define outw(x, addr) outw((x), (addr)) +#define outl(x, addr) outl((x), (addr)) + +#define inb_p(addr) inb(addr) +#define inw_p(addr) inw(addr) +#define inl_p(addr) inl(addr) +#define outb_p(x, addr) outb((x), (addr)) +#define outw_p(x, addr) outw((x), (addr)) +#define outl_p(x, addr) outl((x), (addr)) + +#define insb insb +#define insw insw +#define insl insl +#define outsb outsb +#define outsw outsw +#define outsl outsl #define IO_SPACE_LIMIT 0xffffffff /* We really want to try and get these to memcpy etc */ +#define memset_io memset_io +#define memcpy_fromio memcpy_fromio +#define memcpy_toio memcpy_toio void memcpy_fromio(void *, const volatile void __iomem *, unsigned long); void memcpy_toio(volatile void __iomem *, const void *, unsigned long); void memset_io(volatile void __iomem *, int, unsigned long); @@ -288,6 +335,8 @@ static inline void iounmap(volatile void __iomem *addr) { } #define xlate_dev_mem_ptr(p) __va(p) #define unxlate_dev_mem_ptr(p, v) do { } while (0) +#include + #define ARCH_HAS_VALID_PHYS_ADDR_RANGE int valid_phys_addr_range(phys_addr_t addr, size_t size); int valid_mmap_phys_addr_range(unsigned long pfn, size_t size); diff --git a/arch/sh/include/asm/io_noioport.h b/arch/sh/include/asm/io_noioport.h index f7938fe0f91121..12dad91f41c1ef 100644 --- a/arch/sh/include/asm/io_noioport.h +++ b/arch/sh/include/asm/io_noioport.h @@ -46,13 +46,6 @@ static inline void ioport_unmap(void __iomem *addr) BUG(); } -#define inb_p(addr) inb(addr) -#define inw_p(addr) inw(addr) -#define inl_p(addr) inl(addr) -#define outb_p(x, addr) outb((x), (addr)) -#define outw_p(x, addr) outw((x), (addr)) -#define outl_p(x, addr) outl((x), (addr)) - static inline void insb(unsigned long port, void *dst, unsigned long count) { BUG(); From 
0453c9a78015cb2219cda7239d881f4e3137bff8 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:13 +0800 Subject: [PATCH 150/489] sh: mm: convert to GENERIC_IOREMAP By taking GENERIC_IOREMAP method, the generic generic_ioremap_prot(), generic_iounmap(), and their generic wrapper ioremap_prot(), ioremap() and iounmap() are all visible and available to arch. Arch needs to provide wrapper functions to override the generic versions if there's arch specific handling in its ioremap_prot(), ioremap() or iounmap(). This change will simplify implementation by removing duplicated code with generic_ioremap_prot() and generic_iounmap(), and has the equivalent functionality as before. Here, add wrapper functions ioremap_prot() and iounmap() for SuperH's special operation when ioremap() and iounmap(). Link: https://lkml.kernel.org/r/20230706154520.11257-13-bhe@redhat.com Signed-off-by: Baoquan He Cc: John Paul Adrian Glaubitz Cc: Yoshinori Sato Cc: Rich Felker Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. 
Bottomley" Cc: Jonas Bonn Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Mike Rapoport (IBM) Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Niklas Schnelle Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/sh/Kconfig | 1 + arch/sh/include/asm/io.h | 40 +++++-------------------- arch/sh/mm/ioremap.c | 65 +++++++--------------------------------- 3 files changed, 20 insertions(+), 86 deletions(-) diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 2b3ce4fd39563d..6be32254211cc9 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -29,6 +29,7 @@ config SUPERH select GENERIC_SMP_IDLE_THREAD select GUP_GET_PXX_LOW_HIGH if X2TLB select HAS_IOPORT if HAS_IOPORT_MAP + select GENERIC_IOREMAP if MMU select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_KGDB select HAVE_ARCH_SECCOMP_FILTER diff --git a/arch/sh/include/asm/io.h b/arch/sh/include/asm/io.h index d8a0daf33ab594..f2f38e9d489ac4 100644 --- a/arch/sh/include/asm/io.h +++ b/arch/sh/include/asm/io.h @@ -290,40 +290,16 @@ unsigned long long poke_real_address_q(unsigned long long addr, #endif #ifdef CONFIG_MMU -void iounmap(void __iomem *addr); -void __iomem *__ioremap_caller(phys_addr_t offset, unsigned long size, - pgprot_t prot, void *caller); - -static inline void __iomem *ioremap(phys_addr_t offset, unsigned long size) -{ - return __ioremap_caller(offset, size, PAGE_KERNEL_NOCACHE, - __builtin_return_address(0)); -} - -static inline void __iomem * -ioremap_cache(phys_addr_t offset, unsigned long size) -{ - return __ioremap_caller(offset, size, PAGE_KERNEL, - __builtin_return_address(0)); -} -#define ioremap_cache ioremap_cache - -#ifdef CONFIG_HAVE_IOREMAP_PROT -static inline void __iomem *ioremap_prot(phys_addr_t offset, unsigned long size, - unsigned long flags) -{ - return __ioremap_caller(offset, size, __pgprot(flags), - __builtin_return_address(0)); -} -#endif /* CONFIG_HAVE_IOREMAP_PROT 
*/ +/* + * I/O memory mapping functions. + */ +#define ioremap_prot ioremap_prot +#define iounmap iounmap -#else /* CONFIG_MMU */ -static inline void __iomem *ioremap(phys_addr_t offset, size_t size) -{ - return (void __iomem *)(unsigned long)offset; -} +#define _PAGE_IOREMAP pgprot_val(PAGE_KERNEL_NOCACHE) -static inline void iounmap(volatile void __iomem *addr) { } +#define ioremap_cache(addr, size) \ + ioremap_prot((addr), (size), pgprot_val(PAGE_KERNEL)) #endif /* CONFIG_MMU */ #define ioremap_uc ioremap diff --git a/arch/sh/mm/ioremap.c b/arch/sh/mm/ioremap.c index 21342581144dee..c33b3daa4ad1a3 100644 --- a/arch/sh/mm/ioremap.c +++ b/arch/sh/mm/ioremap.c @@ -72,22 +72,11 @@ __ioremap_29bit(phys_addr_t offset, unsigned long size, pgprot_t prot) #define __ioremap_29bit(offset, size, prot) NULL #endif /* CONFIG_29BIT */ -/* - * Remap an arbitrary physical address space into the kernel virtual - * address space. Needed when the kernel wants to access high addresses - * directly. - * - * NOTE! We need to allow non-page-aligned mappings too: we will obviously - * have to convert them into an offset in a page-aligned mapping, but the - * caller shouldn't need to know that small detail. - */ -void __iomem * __ref -__ioremap_caller(phys_addr_t phys_addr, unsigned long size, - pgprot_t pgprot, void *caller) +void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, + unsigned long prot) { - struct vm_struct *area; - unsigned long offset, last_addr, addr, orig_addr; void __iomem *mapped; + pgprot_t pgprot = __pgprot(prot); mapped = __ioremap_trapped(phys_addr, size); if (mapped) @@ -97,11 +86,6 @@ __ioremap_caller(phys_addr_t phys_addr, unsigned long size, if (mapped) return mapped; - /* Don't allow wraparound or zero size */ - last_addr = phys_addr + size - 1; - if (!size || last_addr < phys_addr) - return NULL; - /* * If we can't yet use the regular approach, go the fixmap route. 
*/ @@ -112,34 +96,14 @@ __ioremap_caller(phys_addr_t phys_addr, unsigned long size, * First try to remap through the PMB. * PMB entries are all pre-faulted. */ - mapped = pmb_remap_caller(phys_addr, size, pgprot, caller); + mapped = pmb_remap_caller(phys_addr, size, pgprot, + __builtin_return_address(0)); if (mapped && !IS_ERR(mapped)) return mapped; - /* - * Mappings have to be page-aligned - */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr+1) - phys_addr; - - /* - * Ok, go for it.. - */ - area = get_vm_area_caller(size, VM_IOREMAP, caller); - if (!area) - return NULL; - area->phys_addr = phys_addr; - orig_addr = addr = (unsigned long)area->addr; - - if (ioremap_page_range(addr, addr + size, phys_addr, pgprot)) { - vunmap((void *)orig_addr); - return NULL; - } - - return (void __iomem *)(offset + (char *)orig_addr); + return generic_ioremap_prot(phys_addr, size, pgprot); } -EXPORT_SYMBOL(__ioremap_caller); +EXPORT_SYMBOL(ioremap_prot); /* * Simple checks for non-translatable mappings. @@ -158,10 +122,9 @@ static inline int iomapping_nontranslatable(unsigned long offset) return 0; } -void iounmap(void __iomem *addr) +void iounmap(volatile void __iomem *addr) { unsigned long vaddr = (unsigned long __force)addr; - struct vm_struct *p; /* * Nothing to do if there is no translatable mapping. @@ -172,21 +135,15 @@ void iounmap(void __iomem *addr) /* * There's no VMA if it's from an early fixed mapping. */ - if (iounmap_fixed(addr) == 0) + if (iounmap_fixed((void __iomem *)addr) == 0) return; /* * If the PMB handled it, there's nothing else to do. 
*/ - if (pmb_unmap(addr) == 0) + if (pmb_unmap((void __iomem *)addr) == 0) return; - p = remove_vm_area((void *)(vaddr & PAGE_MASK)); - if (!p) { - printk(KERN_ERR "%s: bad address %p\n", __func__, addr); - return; - } - - kfree(p); + generic_iounmap(addr); } EXPORT_SYMBOL(iounmap); From ca6c1af38128f19c8a388eecfde165210d2c6059 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:14 +0800 Subject: [PATCH 151/489] xtensa: mm: convert to GENERIC_IOREMAP By taking GENERIC_IOREMAP method, the generic generic_ioremap_prot(), generic_iounmap(), and their generic wrapper ioremap_prot(), ioremap() and iounmap() are all visible and available to arch. Arch needs to provide wrapper functions to override the generic versions if there's arch specific handling in its ioremap_prot(), ioremap() or iounmap(). This change will simplify implementation by removing duplicated code with generic_ioremap_prot() and generic_iounmap(), and has the equivalent functionality as before. Here, add wrapper functions ioremap_prot(), ioremap() and iounmap() for xtensa's special operation when ioremap() and iounmap(). Link: https://lkml.kernel.org/r/20230706154520.11257-14-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Reviewed-by: Mike Rapoport (IBM) Cc: Chris Zankel Cc: Max Filippov Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. 
Bottomley" Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Niklas Schnelle Cc: Rich Felker Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/xtensa/Kconfig | 1 + arch/xtensa/include/asm/io.h | 32 ++++++++------------ arch/xtensa/mm/ioremap.c | 58 +++++++++--------------------------- 3 files changed, 27 insertions(+), 64 deletions(-) diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 2a51a466779fe4..a5488cc40f587b 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -28,6 +28,7 @@ config XTENSA select GENERIC_LIB_UCMPDI2 select GENERIC_PCI_IOMAP select GENERIC_SCHED_CLOCK + select GENERIC_IOREMAP if MMU select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL select HAVE_ARCH_KASAN if MMU && !XIP_KERNEL diff --git a/arch/xtensa/include/asm/io.h b/arch/xtensa/include/asm/io.h index a5b707e1c0f476..934e58399c8c0d 100644 --- a/arch/xtensa/include/asm/io.h +++ b/arch/xtensa/include/asm/io.h @@ -16,6 +16,7 @@ #include #include #include +#include #include @@ -24,22 +25,24 @@ #define PCI_IOBASE ((void __iomem *)XCHAL_KIO_BYPASS_VADDR) #ifdef CONFIG_MMU - -void __iomem *xtensa_ioremap_nocache(unsigned long addr, unsigned long size); -void __iomem *xtensa_ioremap_cache(unsigned long addr, unsigned long size); -void xtensa_iounmap(volatile void __iomem *addr); - /* - * Return the virtual address for the specified bus memory. + * I/O memory mapping functions. 
*/ +void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, + unsigned long prot); +#define ioremap_prot ioremap_prot +#define iounmap iounmap + static inline void __iomem *ioremap(unsigned long offset, unsigned long size) { if (offset >= XCHAL_KIO_PADDR && offset - XCHAL_KIO_PADDR < XCHAL_KIO_SIZE) return (void*)(offset-XCHAL_KIO_PADDR+XCHAL_KIO_BYPASS_VADDR); else - return xtensa_ioremap_nocache(offset, size); + return ioremap_prot(offset, size, + pgprot_val(pgprot_noncached(PAGE_KERNEL))); } +#define ioremap ioremap static inline void __iomem *ioremap_cache(unsigned long offset, unsigned long size) @@ -48,21 +51,10 @@ static inline void __iomem *ioremap_cache(unsigned long offset, && offset - XCHAL_KIO_PADDR < XCHAL_KIO_SIZE) return (void*)(offset-XCHAL_KIO_PADDR+XCHAL_KIO_CACHED_VADDR); else - return xtensa_ioremap_cache(offset, size); -} -#define ioremap_cache ioremap_cache + return ioremap_prot(offset, size, pgprot_val(PAGE_KERNEL)); -static inline void iounmap(volatile void __iomem *addr) -{ - unsigned long va = (unsigned long) addr; - - if (!(va >= XCHAL_KIO_CACHED_VADDR && - va - XCHAL_KIO_CACHED_VADDR < XCHAL_KIO_SIZE) && - !(va >= XCHAL_KIO_BYPASS_VADDR && - va - XCHAL_KIO_BYPASS_VADDR < XCHAL_KIO_SIZE)) - xtensa_iounmap(addr); } - +#define ioremap_cache ioremap_cache #endif /* CONFIG_MMU */ #include diff --git a/arch/xtensa/mm/ioremap.c b/arch/xtensa/mm/ioremap.c index a400188c16b906..8ca660b7ab49a2 100644 --- a/arch/xtensa/mm/ioremap.c +++ b/arch/xtensa/mm/ioremap.c @@ -6,60 +6,30 @@ */ #include -#include #include #include #include -static void __iomem *xtensa_ioremap(unsigned long paddr, unsigned long size, - pgprot_t prot) +void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, + unsigned long prot) { - unsigned long offset = paddr & ~PAGE_MASK; - unsigned long pfn = __phys_to_pfn(paddr); - struct vm_struct *area; - unsigned long vaddr; - int err; - - paddr &= PAGE_MASK; - + unsigned long pfn = __phys_to_pfn((phys_addr)); 
WARN_ON(pfn_valid(pfn)); - size = PAGE_ALIGN(offset + size); - - area = get_vm_area(size, VM_IOREMAP); - if (!area) - return NULL; - - vaddr = (unsigned long)area->addr; - area->phys_addr = paddr; - - err = ioremap_page_range(vaddr, vaddr + size, paddr, prot); - - if (err) { - vunmap((void *)vaddr); - return NULL; - } - - flush_cache_vmap(vaddr, vaddr + size); - return (void __iomem *)(offset + vaddr); -} - -void __iomem *xtensa_ioremap_nocache(unsigned long addr, unsigned long size) -{ - return xtensa_ioremap(addr, size, pgprot_noncached(PAGE_KERNEL)); + return generic_ioremap_prot(phys_addr, size, __pgprot(prot)); } -EXPORT_SYMBOL(xtensa_ioremap_nocache); +EXPORT_SYMBOL(ioremap_prot); -void __iomem *xtensa_ioremap_cache(unsigned long addr, unsigned long size) +void iounmap(volatile void __iomem *addr) { - return xtensa_ioremap(addr, size, PAGE_KERNEL); -} -EXPORT_SYMBOL(xtensa_ioremap_cache); + unsigned long va = (unsigned long) addr; -void xtensa_iounmap(volatile void __iomem *io_addr) -{ - void *addr = (void *)(PAGE_MASK & (unsigned long)io_addr); + if ((va >= XCHAL_KIO_CACHED_VADDR && + va - XCHAL_KIO_CACHED_VADDR < XCHAL_KIO_SIZE) || + (va >= XCHAL_KIO_BYPASS_VADDR && + va - XCHAL_KIO_BYPASS_VADDR < XCHAL_KIO_SIZE)) + return; - vunmap(addr); + generic_iounmap(addr); } -EXPORT_SYMBOL(xtensa_iounmap); +EXPORT_SYMBOL(iounmap); From 426b313f356a314af335899c51250dc0f49cd4a7 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:15 +0800 Subject: [PATCH 152/489] parisc: mm: convert to GENERIC_IOREMAP By taking GENERIC_IOREMAP method, the generic generic_ioremap_prot(), generic_iounmap(), and their generic wrapper ioremap_prot(), ioremap() and iounmap() are all visible and available to arch. Arch needs to provide wrapper functions to override the generic versions if there's arch specific handling in its ioremap_prot(), ioremap() or iounmap(). 
This change will simplify implementation by removing duplicated code with generic_ioremap_prot() and generic_iounmap(), and has the equivalent functionality as before. Here, add wrapper function ioremap_prot() for parisc's special operation when ioremap(). Link: https://lkml.kernel.org/r/20230706154520.11257-15-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Reviewed-by: Mike Rapoport (IBM) Acked-by: Helge Deller Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Niklas Schnelle Cc: Rich Felker Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/parisc/Kconfig | 1 + arch/parisc/include/asm/io.h | 15 ++++++--- arch/parisc/mm/ioremap.c | 62 +++--------------------------------- 3 files changed, 15 insertions(+), 63 deletions(-) diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 4cb46d5c64a270..4cda9f0a3277cf 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -36,6 +36,7 @@ config PARISC select GENERIC_ATOMIC64 if !64BIT select GENERIC_IRQ_PROBE select GENERIC_PCI_IOMAP + select GENERIC_IOREMAP select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_SMP_IDLE_THREAD select GENERIC_ARCH_TOPOLOGY if SMP diff --git a/arch/parisc/include/asm/io.h b/arch/parisc/include/asm/io.h index c05e781be2f5bd..366537042465a1 100644 --- a/arch/parisc/include/asm/io.h +++ b/arch/parisc/include/asm/io.h @@ -125,12 +125,17 @@ static inline void gsc_writeq(unsigned long long val, unsigned long addr) /* * The standard PCI ioremap interfaces */ -void __iomem 
*ioremap(unsigned long offset, unsigned long size); -#define ioremap_wc ioremap -#define ioremap_uc ioremap -#define pci_iounmap pci_iounmap +#define ioremap_prot ioremap_prot + +#define _PAGE_IOREMAP (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | \ + _PAGE_ACCESSED | _PAGE_NO_CACHE) -extern void iounmap(const volatile void __iomem *addr); +#define ioremap_wc(addr, size) \ + ioremap_prot((addr), (size), _PAGE_IOREMAP) +#define ioremap_uc(addr, size) \ + ioremap_prot((addr), (size), _PAGE_IOREMAP) + +#define pci_iounmap pci_iounmap void memset_io(volatile void __iomem *addr, unsigned char val, int count); void memcpy_fromio(void *dst, const volatile void __iomem *src, int count); diff --git a/arch/parisc/mm/ioremap.c b/arch/parisc/mm/ioremap.c index 345ff0b6649935..fd996472dfe72b 100644 --- a/arch/parisc/mm/ioremap.c +++ b/arch/parisc/mm/ioremap.c @@ -13,25 +13,9 @@ #include #include -/* - * Generic mapping function (not visible outside): - */ - -/* - * Remap an arbitrary physical address space into the kernel virtual - * address space. - * - * NOTE! We need to allow non-page-aligned mappings too: we will obviously - * have to convert them into an offset in a page-aligned mapping, but the - * caller shouldn't need to know that small detail. - */ -void __iomem *ioremap(unsigned long phys_addr, unsigned long size) +void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, + unsigned long prot) { - void __iomem *addr; - struct vm_struct *area; - unsigned long offset, last_addr; - pgprot_t pgprot; - #ifdef CONFIG_EISA unsigned long end = phys_addr + size - 1; /* Support EISA addresses */ @@ -40,11 +24,6 @@ void __iomem *ioremap(unsigned long phys_addr, unsigned long size) phys_addr |= F_EXTEND(0xfc000000); #endif - /* Don't allow wraparound or zero size */ - last_addr = phys_addr + size - 1; - if (!size || last_addr < phys_addr) - return NULL; - /* * Don't allow anybody to remap normal RAM that we're using.. 
*/ @@ -62,39 +41,6 @@ void __iomem *ioremap(unsigned long phys_addr, unsigned long size) } } - pgprot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | - _PAGE_ACCESSED | _PAGE_NO_CACHE); - - /* - * Mappings have to be page-aligned - */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr + 1) - phys_addr; - - /* - * Ok, go for it.. - */ - area = get_vm_area(size, VM_IOREMAP); - if (!area) - return NULL; - - addr = (void __iomem *) area->addr; - if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size, - phys_addr, pgprot)) { - vunmap(addr); - return NULL; - } - - return (void __iomem *) (offset + (char __iomem *)addr); -} -EXPORT_SYMBOL(ioremap); - -void iounmap(const volatile void __iomem *io_addr) -{ - unsigned long addr = (unsigned long)io_addr & PAGE_MASK; - - if (is_vmalloc_addr((void *)addr)) - vunmap((void *)addr); + return generic_ioremap_prot(phys_addr, size, __pgprot(prot)); } -EXPORT_SYMBOL(iounmap); +EXPORT_SYMBOL(ioremap_prot); From ab1cd02083d046a25b48a2cad71ace6d5ddf0e9e Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 6 Jul 2023 23:45:16 +0800 Subject: [PATCH 153/489] mm/ioremap: consider IOREMAP space in generic ioremap Architectures like powerpc have a dedicated space for IOREMAP mappings. If so, use it in generic_ioremap_prot(). Link: https://lkml.kernel.org/r/20230706154520.11257-16-bhe@redhat.com Signed-off-by: Christophe Leroy Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. 
Bottomley" Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Mike Rapoport (IBM) Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Niklas Schnelle Cc: Rich Felker Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- mm/ioremap.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/mm/ioremap.c b/mm/ioremap.c index 86b82ec27d2bb4..68d9895144ad07 100644 --- a/mm/ioremap.c +++ b/mm/ioremap.c @@ -11,6 +11,15 @@ #include #include +/* + * Ioremap often, but not always uses the generic vmalloc area. E.g on + * Power ARCH, it could have different ioremap space. + */ +#ifndef IOREMAP_START +#define IOREMAP_START VMALLOC_START +#define IOREMAP_END VMALLOC_END +#endif + void __iomem *generic_ioremap_prot(phys_addr_t phys_addr, size_t size, pgprot_t prot) { @@ -35,8 +44,8 @@ void __iomem *generic_ioremap_prot(phys_addr_t phys_addr, size_t size, if (!ioremap_allowed(phys_addr, size, pgprot_val(prot))) return NULL; - area = get_vm_area_caller(size, VM_IOREMAP, - __builtin_return_address(0)); + area = __get_vm_area_caller(size, VM_IOREMAP, IOREMAP_START, + IOREMAP_END, __builtin_return_address(0)); if (!area) return NULL; vaddr = (unsigned long)area->addr; @@ -66,7 +75,7 @@ void generic_iounmap(volatile void __iomem *addr) if (!iounmap_allowed(vaddr)) return; - if (is_vmalloc_addr(vaddr)) + if (is_ioremap_addr(vaddr)) vunmap(vaddr); } From 016fec91013cfb27c9f5a101a87ae8266537fed1 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:17 +0800 Subject: [PATCH 154/489] mm: move is_ioremap_addr() into new header file Now is_ioremap_addr() is only used in kernel/iomem.c and is going to be used in mm/ioremap.c. Move it into its own new header file linux/ioremap.h. 
Link: https://lkml.kernel.org/r/20230706154520.11257-17-bhe@redhat.com Suggested-by: Christoph Hellwig Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Mike Rapoport (IBM) Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Niklas Schnelle Cc: Rich Felker Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/pgtable.h | 10 ---------- include/linux/ioremap.h | 30 ++++++++++++++++++++++++++++++ include/linux/mm.h | 5 ----- kernel/iomem.c | 1 + mm/ioremap.c | 10 +--------- 5 files changed, 32 insertions(+), 24 deletions(-) create mode 100644 include/linux/ioremap.h diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 6a88bfdaa69b6f..445a22987aa3a3 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -157,16 +157,6 @@ static inline pgtable_t pmd_pgtable(pmd_t pmd) return (pgtable_t)pmd_page_vaddr(pmd); } -#ifdef CONFIG_PPC64 -#define is_ioremap_addr is_ioremap_addr -static inline bool is_ioremap_addr(const void *x) -{ - unsigned long addr = (unsigned long)x; - - return addr >= IOREMAP_BASE && addr < IOREMAP_END; -} -#endif /* CONFIG_PPC64 */ - #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_PGTABLE_H */ diff --git a/include/linux/ioremap.h b/include/linux/ioremap.h new file mode 100644 index 00000000000000..f0e99fc7dd8b23 --- /dev/null +++ b/include/linux/ioremap.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_IOREMAP_H +#define _LINUX_IOREMAP_H + 
+#include +#include + +#if defined(CONFIG_HAS_IOMEM) || defined(CONFIG_GENERIC_IOREMAP) +/* + * Ioremap often, but not always uses the generic vmalloc area. E.g on + * Power ARCH, it could have different ioremap space. + */ +#ifndef IOREMAP_START +#define IOREMAP_START VMALLOC_START +#define IOREMAP_END VMALLOC_END +#endif +static inline bool is_ioremap_addr(const void *x) +{ + unsigned long addr = (unsigned long)kasan_reset_tag(x); + + return addr >= IOREMAP_START && addr < IOREMAP_END; +} +#else +static inline bool is_ioremap_addr(const void *x) +{ + return false; +} +#endif + +#endif /* _LINUX_IOREMAP_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index bfb46483108cdd..0ae5654f665b7a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1055,11 +1055,6 @@ unsigned long vmalloc_to_pfn(const void *addr); * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there * is no special casing required. */ - -#ifndef is_ioremap_addr -#define is_ioremap_addr(x) is_vmalloc_addr(x) -#endif - #ifdef CONFIG_MMU extern bool is_vmalloc_addr(const void *x); extern int is_vmalloc_or_module_addr(const void *x); diff --git a/kernel/iomem.c b/kernel/iomem.c index 62c92e43aa0d44..9682471e647171 100644 --- a/kernel/iomem.c +++ b/kernel/iomem.c @@ -3,6 +3,7 @@ #include #include #include +#include #ifndef ioremap_cache /* temporary while we convert existing ioremap_cache users to memremap */ diff --git a/mm/ioremap.c b/mm/ioremap.c index 68d9895144ad07..a21a6c9fa5abd2 100644 --- a/mm/ioremap.c +++ b/mm/ioremap.c @@ -10,15 +10,7 @@ #include #include #include - -/* - * Ioremap often, but not always uses the generic vmalloc area. E.g on - * Power ARCH, it could have different ioremap space. 
- */ -#ifndef IOREMAP_START -#define IOREMAP_START VMALLOC_START -#define IOREMAP_END VMALLOC_END -#endif +#include void __iomem *generic_ioremap_prot(phys_addr_t phys_addr, size_t size, pgprot_t prot) From 8d05554dca2af6a77dc38a152f4e84fdf7e35179 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 6 Jul 2023 23:45:18 +0800 Subject: [PATCH 155/489] powerpc: mm: convert to GENERIC_IOREMAP By taking GENERIC_IOREMAP method, the generic generic_ioremap_prot(), generic_iounmap(), and their generic wrapper ioremap_prot(), ioremap() and iounmap() are all visible and available to arch. Arch needs to provide wrapper functions to override the generic versions if there's arch specific handling in its ioremap_prot(), ioremap() or iounmap(). This change will simplify implementation by removing duplicated code with generic_ioremap_prot() and generic_iounmap(), and has the equivalent functionality as before. Here, add wrapper functions ioremap_prot() and iounmap() for powerpc's special operation when ioremap() and iounmap(). Link: https://lkml.kernel.org/r/20230706154520.11257-18-bhe@redhat.com Signed-off-by: Christophe Leroy Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Reviewed-by: Mike Rapoport (IBM) Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. 
Bottomley" Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Max Filippov Cc: Nathan Chancellor Cc: Niklas Schnelle Cc: Rich Felker Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/io.h | 8 +++----- arch/powerpc/mm/ioremap.c | 26 +------------------------- arch/powerpc/mm/ioremap_32.c | 19 +++++++++---------- arch/powerpc/mm/ioremap_64.c | 12 ++---------- 5 files changed, 16 insertions(+), 50 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 0b1172cbeccb30..9222c138c45702 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -193,6 +193,7 @@ config PPC select GENERIC_CPU_VULNERABILITIES if PPC_BARRIER_NOSPEC select GENERIC_EARLY_IOREMAP select GENERIC_GETTIMEOFDAY + select GENERIC_IOREMAP select GENERIC_IRQ_SHOW select GENERIC_IRQ_SHOW_LEVEL select GENERIC_PCI_IOMAP if PCI diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index 67a3fb6de498ef..0732b743e09962 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ -889,8 +889,8 @@ static inline void iosync(void) * */ extern void __iomem *ioremap(phys_addr_t address, unsigned long size); -extern void __iomem *ioremap_prot(phys_addr_t address, unsigned long size, - unsigned long flags); +#define ioremap ioremap +#define ioremap_prot ioremap_prot extern void __iomem *ioremap_wc(phys_addr_t address, unsigned long size); #define ioremap_wc ioremap_wc @@ -904,14 +904,12 @@ void __iomem *ioremap_coherent(phys_addr_t address, unsigned long size); #define ioremap_cache(addr, size) \ ioremap_prot((addr), (size), pgprot_val(PAGE_KERNEL)) -extern void iounmap(volatile void __iomem *addr); +#define iounmap iounmap void __iomem *ioremap_phb(phys_addr_t paddr, unsigned long size); int early_ioremap_range(unsigned long ea, phys_addr_t pa, unsigned 
long size, pgprot_t prot); -void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size, - pgprot_t prot, void *caller); extern void __iomem *__ioremap_caller(phys_addr_t, unsigned long size, pgprot_t prot, void *caller); diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c index 4f12504fb405b1..705e8e8ffde4d7 100644 --- a/arch/powerpc/mm/ioremap.c +++ b/arch/powerpc/mm/ioremap.c @@ -41,7 +41,7 @@ void __iomem *ioremap_coherent(phys_addr_t addr, unsigned long size) return __ioremap_caller(addr, size, prot, caller); } -void __iomem *ioremap_prot(phys_addr_t addr, unsigned long size, unsigned long flags) +void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long flags) { pte_t pte = __pte(flags); void *caller = __builtin_return_address(0); @@ -74,27 +74,3 @@ int early_ioremap_range(unsigned long ea, phys_addr_t pa, return 0; } - -void __iomem *do_ioremap(phys_addr_t pa, phys_addr_t offset, unsigned long size, - pgprot_t prot, void *caller) -{ - struct vm_struct *area; - int ret; - unsigned long va; - - area = __get_vm_area_caller(size, VM_IOREMAP, IOREMAP_START, IOREMAP_END, caller); - if (area == NULL) - return NULL; - - area->phys_addr = pa; - va = (unsigned long)area->addr; - - ret = ioremap_page_range(va, va + size, pa, prot); - if (!ret) - return (void __iomem *)area->addr + offset; - - vunmap_range(va, va + size); - free_vm_area(area); - - return NULL; -} diff --git a/arch/powerpc/mm/ioremap_32.c b/arch/powerpc/mm/ioremap_32.c index 9d13143b8be496..ca5bc6be3e6f99 100644 --- a/arch/powerpc/mm/ioremap_32.c +++ b/arch/powerpc/mm/ioremap_32.c @@ -21,6 +21,13 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call phys_addr_t p, offset; int err; + /* + * If the address lies within the first 16 MB, assume it's in ISA + * memory space + */ + if (addr < SZ_16M) + addr += _ISA_MEM_BASE; + /* * Choose an address to map it to. * Once the vmalloc system is running, we use it. 
@@ -31,13 +38,6 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call offset = addr & ~PAGE_MASK; size = PAGE_ALIGN(addr + size) - p; - /* - * If the address lies within the first 16 MB, assume it's in ISA - * memory space - */ - if (p < 16 * 1024 * 1024) - p += _ISA_MEM_BASE; - #ifndef CONFIG_CRASH_DUMP /* * Don't allow anybody to remap normal RAM that we're using. @@ -63,7 +63,7 @@ __ioremap_caller(phys_addr_t addr, unsigned long size, pgprot_t prot, void *call return (void __iomem *)v + offset; if (slab_is_available()) - return do_ioremap(p, offset, size, prot, caller); + return generic_ioremap_prot(addr, size, prot); /* * Should check if it is a candidate for a BAT mapping @@ -87,7 +87,6 @@ void iounmap(volatile void __iomem *addr) if (v_block_mapped((unsigned long)addr)) return; - if (addr > high_memory && (unsigned long)addr < ioremap_bot) - vunmap((void *)(PAGE_MASK & (unsigned long)addr)); + generic_iounmap(addr); } EXPORT_SYMBOL(iounmap); diff --git a/arch/powerpc/mm/ioremap_64.c b/arch/powerpc/mm/ioremap_64.c index 3acece00b33e87..d24e5f1667237d 100644 --- a/arch/powerpc/mm/ioremap_64.c +++ b/arch/powerpc/mm/ioremap_64.c @@ -29,7 +29,7 @@ void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size, return NULL; if (slab_is_available()) - return do_ioremap(paligned, offset, size, prot, caller); + return generic_ioremap_prot(addr, size, prot); pr_warn("ioremap() called early from %pS. 
Use early_ioremap() instead\n", caller); @@ -49,17 +49,9 @@ void __iomem *__ioremap_caller(phys_addr_t addr, unsigned long size, */ void iounmap(volatile void __iomem *token) { - void *addr; - if (!slab_is_available()) return; - addr = (void *)((unsigned long __force)PCI_FIX_ADDR(token) & PAGE_MASK); - - if ((unsigned long)addr < ioremap_bot) { - pr_warn("Attempt to iounmap early bolted mapping at 0x%p\n", addr); - return; - } - vunmap(addr); + generic_iounmap(PCI_FIX_ADDR(token)); } EXPORT_SYMBOL(iounmap); From 8f03d74f716313892908819a389ff9ede483c3a5 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:19 +0800 Subject: [PATCH 156/489] arm64 : mm: add wrapper function ioremap_prot() Since hook functions ioremap_allowed() and iounmap_allowed() will be obsoleted, add wrapper function ioremap_prot() to contain the specific handling in addition to generic_ioremap_prot() invocation. Link: https://lkml.kernel.org/r/20230706154520.11257-19-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Reviewed-by: Kefeng Wang Reviewed-by: Mike Rapoport (IBM) Acked-by: Catalin Marinas Cc: Catalin Marinas Cc: Will Deacon Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Brian Cain Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. 
Bottomley" Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Niklas Schnelle Cc: Rich Felker Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/arm64/include/asm/io.h | 3 +-- arch/arm64/mm/ioremap.c | 10 ++++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 51d92abf945edd..3b694511b98f83 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -139,8 +139,7 @@ extern void __memset_io(volatile void __iomem *, int, size_t); * I/O memory mapping functions. */ -bool ioremap_allowed(phys_addr_t phys_addr, size_t size, unsigned long prot); -#define ioremap_allowed ioremap_allowed +#define ioremap_prot ioremap_prot #define _PAGE_IOREMAP PROT_DEVICE_nGnRE diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c index c5af103d4ad468..269f2f63ab7dc4 100644 --- a/arch/arm64/mm/ioremap.c +++ b/arch/arm64/mm/ioremap.c @@ -3,20 +3,22 @@ #include #include -bool ioremap_allowed(phys_addr_t phys_addr, size_t size, unsigned long prot) +void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, + unsigned long prot) { unsigned long last_addr = phys_addr + size - 1; /* Don't allow outside PHYS_MASK */ if (last_addr & ~PHYS_MASK) - return false; + return NULL; /* Don't allow RAM to be mapped. 
*/ if (WARN_ON(pfn_is_map_memory(__phys_to_pfn(phys_addr)))) - return false; + return NULL; - return true; + return generic_ioremap_prot(phys_addr, size, __pgprot(prot)); } +EXPORT_SYMBOL(ioremap_prot); /* * Must be called after early_fixmap_init From 95da27c4c6dd735c4f2798aca9095c086cf48faf Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 6 Jul 2023 23:45:20 +0800 Subject: [PATCH 157/489] mm: ioremap: remove unneeded ioremap_allowed and iounmap_allowed Now there are no users of ioremap_allowed and iounmap_allowed, clean them up. Link: https://lkml.kernel.org/r/20230706154520.11257-20-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Christoph Hellwig Reviewed-by: Kefeng Wang Reviewed-by: Mike Rapoport (IBM) Cc: Alexander Gordeev Cc: Arnd Bergmann Cc: Brian Cain Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Chris Zankel Cc: David Laight Cc: Geert Uytterhoeven Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Max Filippov Cc: Michael Ellerman Cc: Nathan Chancellor Cc: Nicholas Piggin Cc: Niklas Schnelle Cc: Rich Felker Cc: Stafford Horne Cc: Stefan Kristiansson Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/asm-generic/io.h | 26 -------------------------- mm/ioremap.c | 6 ------ 2 files changed, 32 deletions(-) diff --git a/include/asm-generic/io.h b/include/asm-generic/io.h index 39244c3ee79768..bac63e874c7bf9 100644 --- a/include/asm-generic/io.h +++ b/include/asm-generic/io.h @@ -1047,32 +1047,6 @@ static inline void iounmap(volatile void __iomem *addr) #elif defined(CONFIG_GENERIC_IOREMAP) #include -/* - * Arch code can implement the following two hooks when using GENERIC_IOREMAP - * ioremap_allowed() return a bool, - * - true means continue to remap - * - false means skip remap and return directly - * iounmap_allowed() return a bool, - * - true 
means continue to vunmap - * - false means skip vunmap and return directly - */ -#ifndef ioremap_allowed -#define ioremap_allowed ioremap_allowed -static inline bool ioremap_allowed(phys_addr_t phys_addr, size_t size, - unsigned long prot) -{ - return true; -} -#endif - -#ifndef iounmap_allowed -#define iounmap_allowed iounmap_allowed -static inline bool iounmap_allowed(void *addr) -{ - return true; -} -#endif - void __iomem *generic_ioremap_prot(phys_addr_t phys_addr, size_t size, pgprot_t prot); diff --git a/mm/ioremap.c b/mm/ioremap.c index a21a6c9fa5abd2..3e049dfb28bd0d 100644 --- a/mm/ioremap.c +++ b/mm/ioremap.c @@ -33,9 +33,6 @@ void __iomem *generic_ioremap_prot(phys_addr_t phys_addr, size_t size, phys_addr -= offset; size = PAGE_ALIGN(size + offset); - if (!ioremap_allowed(phys_addr, size, pgprot_val(prot))) - return NULL; - area = __get_vm_area_caller(size, VM_IOREMAP, IOREMAP_START, IOREMAP_END, __builtin_return_address(0)); if (!area) @@ -64,9 +61,6 @@ void generic_iounmap(volatile void __iomem *addr) { void *vaddr = (void *)((unsigned long)addr & PAGE_MASK); - if (!iounmap_allowed(vaddr)) - return; - if (is_ioremap_addr(vaddr)) vunmap(vaddr); } From 65c8d30e679bdffeeaa0b84b7094a3c719aa6585 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Mon, 17 Jul 2023 21:10:01 +0800 Subject: [PATCH 158/489] mm/tlbbatch: introduce arch_tlbbatch_should_defer() Patch series "arm64: support batched/deferred tlb shootdown during page reclamation/migration", v11. Though ARM64 has the hardware to do tlb shootdown, the hardware broadcasting is not free. A simplest micro benchmark shows even on snapdragon 888 with only 8 cores, the overhead for ptep_clear_flush is huge even for paging out one page mapped by only one process: 5.36% a.out [kernel.kallsyms] [k] ptep_clear_flush While pages are mapped by multiple processes or HW has more CPUs, the cost should become even higher due to the bad scalability of tlb shootdown. 
The same benchmark can result in 16.99% CPU consumption on ARM64 server with around 100 cores according to the test on patch 4/4. This patchset leverages the existing BATCHED_UNMAP_TLB_FLUSH by 1. only send tlbi instructions in the first stage - arch_tlbbatch_add_mm() 2. wait for the completion of tlbi by dsb while doing tlbbatch sync in arch_tlbbatch_flush() Testing on snapdragon shows the overhead of ptep_clear_flush is removed by the patchset. The micro benchmark becomes 5% faster even for one page mapped by single process on snapdragon 888. Since BATCHED_UNMAP_TLB_FLUSH is implemented only on x86, the patchset does some renaming/extension for the current implementation first (Patch 1-3), then add the support on arm64 (Patch 4). This patch (of 4): The entire scheme of deferred TLB flush in reclaim path rests on the fact that the cost to refill TLB entries is less than flushing out individual entries by sending IPI to remote CPUs. But architecture can have different ways to evaluate that. Hence apart from checking TTU_BATCH_FLUSH in the TTU flags, rest of the decision should be architecture specific. 
[yangyicong@hisilicon.com: rebase and fix incorrect return value type] Link: https://lkml.kernel.org/r/20230717131004.12662-1-yangyicong@huawei.com Link: https://lkml.kernel.org/r/20230717131004.12662-2-yangyicong@huawei.com Signed-off-by: Anshuman Khandual [https://lore.kernel.org/linuxppc-dev/20171101101735.2318-2-khandual@linux.vnet.ibm.com/] Signed-off-by: Yicong Yang Reviewed-by: Kefeng Wang Reviewed-by: Anshuman Khandual Reviewed-by: Barry Song Reviewed-by: Xin Hao Tested-by: Punit Agrawal Reviewed-by: Catalin Marinas Cc: Arnd Bergmann Cc: Darren Hart Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: lipeifeng Cc: Mark Rutland Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Steven Miao Cc: Will Deacon Cc: Zeng Tao Cc: Barry Song Cc: Mel Gorman Cc: Nadav Amit Signed-off-by: Andrew Morton --- arch/x86/include/asm/tlbflush.h | 12 ++++++++++++ mm/rmap.c | 9 +-------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 80450e1d5385aa..cf2a1de5d38873 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -253,6 +253,18 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false); } +static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm) +{ + bool should_defer = false; + + /* If remote CPUs need to be flushed then defer batch the flush */ + if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids) + should_defer = true; + put_cpu(); + + return should_defer; +} + static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) { /* diff --git a/mm/rmap.c b/mm/rmap.c index 2668f5ea353428..7a479e22d288c8 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -688,17 +688,10 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval) */ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) { - bool should_defer = false; - if (!(flags & TTU_BATCH_FLUSH)) 
return false; - /* If remote CPUs need to be flushed then defer batch the flush */ - if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids) - should_defer = true; - put_cpu(); - - return should_defer; + return arch_tlbbatch_should_defer(mm); } /* From f73419bb89d606de9be2043febf0957d56627a5b Mon Sep 17 00:00:00 2001 From: Barry Song Date: Mon, 17 Jul 2023 21:10:02 +0800 Subject: [PATCH 159/489] mm/tlbbatch: rename and extend some functions This patch does some preparation works to extend batched TLB flush to arm64. Including: - Extend set_tlb_ubc_flush_pending() and arch_tlbbatch_add_mm() to accept an additional argument for address, architectures like arm64 may need this for tlbi. - Rename arch_tlbbatch_add_mm() to arch_tlbbatch_add_pending() to match its current function since we don't need to handle mm on architectures like arm64 and add_mm is not proper, add_pending will make sense to both as on x86 we're pending the TLB flush operations while on arm64 we're pending the synchronize operations. This intends no functional changes on x86. 
Link: https://lkml.kernel.org/r/20230717131004.12662-3-yangyicong@huawei.com Tested-by: Yicong Yang Tested-by: Xin Hao Tested-by: Punit Agrawal Signed-off-by: Barry Song Signed-off-by: Yicong Yang Reviewed-by: Kefeng Wang Reviewed-by: Xin Hao Reviewed-by: Anshuman Khandual Reviewed-by: Catalin Marinas Cc: Jonathan Corbet Cc: Nadav Amit Cc: Mel Gorman Cc: Anshuman Khandual Cc: Arnd Bergmann Cc: Barry Song Cc: Darren Hart Cc: Jonathan Cameron Cc: lipeifeng Cc: Mark Rutland Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Steven Miao Cc: Will Deacon Cc: Zeng Tao Signed-off-by: Andrew Morton --- arch/x86/include/asm/tlbflush.h | 5 +++-- include/linux/mm_types_task.h | 4 ++-- mm/rmap.c | 12 +++++++----- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index cf2a1de5d38873..1c7d3a36e16cca 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -276,8 +276,9 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) return atomic64_inc_return(&mm->context.tlb_gen); } -static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, - struct mm_struct *mm) +static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch, + struct mm_struct *mm, + unsigned long uaddr) { inc_mm_tlb_gen(mm); cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h index 5414b5c6a10312..aa44fff8bb9da2 100644 --- a/include/linux/mm_types_task.h +++ b/include/linux/mm_types_task.h @@ -52,8 +52,8 @@ struct tlbflush_unmap_batch { #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH /* * The arch code makes the following promise: generic code can modify a - * PTE, then call arch_tlbbatch_add_mm() (which internally provides all - * needed barriers), then call arch_tlbbatch_flush(), and the entries + * PTE, then call arch_tlbbatch_add_pending() (which internally provides + * all needed 
barriers), then call arch_tlbbatch_flush(), and the entries * will be flushed on all CPUs by the time that arch_tlbbatch_flush() * returns. */ diff --git a/mm/rmap.c b/mm/rmap.c index 7a479e22d288c8..f6fb821d56a82c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -642,7 +642,8 @@ void try_to_unmap_flush_dirty(void) #define TLB_FLUSH_BATCH_PENDING_LARGE \ (TLB_FLUSH_BATCH_PENDING_MASK / 2) -static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval) +static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval, + unsigned long uaddr) { struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; int batch; @@ -651,7 +652,7 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval) if (!pte_accessible(mm, pteval)) return; - arch_tlbbatch_add_mm(&tlb_ubc->arch, mm); + arch_tlbbatch_add_pending(&tlb_ubc->arch, mm, uaddr); tlb_ubc->flush_required = true; /* @@ -726,7 +727,8 @@ void flush_tlb_batched_pending(struct mm_struct *mm) } } #else -static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval) +static void set_tlb_ubc_flush_pending(struct mm_struct *mm, pte_t pteval, + unsigned long uaddr) { } @@ -1579,7 +1581,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, */ pteval = ptep_get_and_clear(mm, address, pvmw.pte); - set_tlb_ubc_flush_pending(mm, pteval); + set_tlb_ubc_flush_pending(mm, pteval, address); } else { pteval = ptep_clear_flush(vma, address, pvmw.pte); } @@ -1962,7 +1964,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, */ pteval = ptep_get_and_clear(mm, address, pvmw.pte); - set_tlb_ubc_flush_pending(mm, pteval); + set_tlb_ubc_flush_pending(mm, pteval, address); } else { pteval = ptep_clear_flush(vma, address, pvmw.pte); } From db6c1f6f236dbcd271d51d37675bbccfcea7c7be Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Mon, 17 Jul 2023 21:10:03 +0800 Subject: [PATCH 160/489] mm/tlbbatch: introduce arch_flush_tlb_batched_pending() Currently we'll 
flush the mm in flush_tlb_batched_pending() to avoid race between reclaim unmaps pages by batched TLB flush and mprotect/munmap/etc. Other architectures like arm64 may only need a synchronization barrier(dsb) here rather than a full mm flush. So add arch_flush_tlb_batched_pending() to allow an arch-specific implementation here. This intends no functional changes on x86 since still a full mm flush for x86. Link: https://lkml.kernel.org/r/20230717131004.12662-4-yangyicong@huawei.com Signed-off-by: Yicong Yang Reviewed-by: Catalin Marinas Cc: Anshuman Khandual Cc: Anshuman Khandual Cc: Arnd Bergmann Cc: Barry Song Cc: Barry Song Cc: Darren Hart Cc: Jonathan Cameron Cc: Jonathan Corbet Cc: Kefeng Wang Cc: lipeifeng Cc: Mark Rutland Cc: Mel Gorman Cc: Nadav Amit Cc: Peter Zijlstra Cc: Punit Agrawal Cc: Ryan Roberts Cc: Steven Miao Cc: Will Deacon Cc: Xin Hao Cc: Zeng Tao Signed-off-by: Andrew Morton --- arch/x86/include/asm/tlbflush.h | 5 +++++ mm/rmap.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 1c7d3a36e16cca..837e4a50281a06 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -284,6 +284,11 @@ static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *b cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); } +static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm) +{ + flush_tlb_mm(mm); +} + extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch); static inline bool pte_flags_need_flush(unsigned long oldflags, diff --git a/mm/rmap.c b/mm/rmap.c index f6fb821d56a82c..5717517e404089 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -717,7 +717,7 @@ void flush_tlb_batched_pending(struct mm_struct *mm) int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT; if (pending != flushed) { - flush_tlb_mm(mm); + arch_flush_tlb_batched_pending(mm); /* * If the new TLB flushing is pending during 
flushing, leave * mm->tlb_flush_batched as is, to avoid losing flushing. From 43b3dfdd04553171488cb11d46d21948b6b90e27 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Mon, 17 Jul 2023 21:10:04 +0800 Subject: [PATCH 161/489] arm64: support batched/deferred tlb shootdown during page reclamation/migration On x86, batched and deferred tlb shootdown has led to 90% performance increase on tlb shootdown. On arm64, HW can do tlb shootdown without software IPI. But sync tlbi is still quite expensive. Even running a simplest program which requires swapout can prove this is true, #include #include #include #include int main() { #define SIZE (1 * 1024 * 1024) volatile unsigned char *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); memset(p, 0x88, SIZE); for (int k = 0; k < 10000; k++) { /* swap in */ for (int i = 0; i < SIZE; i += 4096) { (void)p[i]; } /* swap out */ madvise(p, SIZE, MADV_PAGEOUT); } } Perf result on snapdragon 888 with 8 cores by using zRAM as the swap block device. ~ # perf record taskset -c 4 ./a.out [ perf record: Woken up 10 times to write data ] [ perf record: Captured and wrote 2.297 MB perf.data (60084 samples) ] ~ # perf report # To display the perf.data header info, please use --header/--header-only options. # To display the perf.data header info, please use --header/--header-only options. # # # Total Lost Samples: 0 # # Samples: 60K of event 'cycles' # Event count (approx.): 35706225414 # # Overhead Command Shared Object Symbol # ........ ....... ................. ...... 
# 21.07% a.out [kernel.kallsyms] [k] _raw_spin_unlock_irq 8.23% a.out [kernel.kallsyms] [k] _raw_spin_unlock_irqrestore 6.67% a.out [kernel.kallsyms] [k] filemap_map_pages 6.16% a.out [kernel.kallsyms] [k] __zram_bvec_write 5.36% a.out [kernel.kallsyms] [k] ptep_clear_flush 3.71% a.out [kernel.kallsyms] [k] _raw_spin_lock 3.49% a.out [kernel.kallsyms] [k] memset64 1.63% a.out [kernel.kallsyms] [k] clear_page 1.42% a.out [kernel.kallsyms] [k] _raw_spin_unlock 1.26% a.out [kernel.kallsyms] [k] mod_zone_state.llvm.8525150236079521930 1.23% a.out [kernel.kallsyms] [k] xas_load 1.15% a.out [kernel.kallsyms] [k] zram_slot_lock ptep_clear_flush() takes 5.36% CPU in the micro-benchmark swapping in/out a page mapped by only one process. If the page is mapped by multiple processes, typically, like more than 100 on a phone, the overhead would be much higher as we have to run tlb flush 100 times for one single page. Plus, tlb flush overhead will increase with the number of CPU cores due to the bad scalability of tlb shootdown in HW, so those ARM64 servers should expect much higher overhead. Further perf annotate shows 95% cpu time of ptep_clear_flush is actually used by the final dsb() to wait for the completion of tlb flush. This provides us a very good chance to leverage the existing batched tlb in kernel. The minimum modification is that we only send async tlbi in the first stage and we send dsb while we have to sync in the second stage. With the above simplest micro benchmark, elapsed time to finish the program decreases around 5%. Typical elapsed time w/o patch: ~ # time taskset -c 4 ./a.out 0.21user 14.34system 0:14.69elapsed w/ patch: ~ # time taskset -c 4 ./a.out 0.22user 13.45system 0:13.80elapsed Also tested with benchmark in the commit on Kunpeng920 arm64 server and observed an improvement around 12.5% with command `time ./swap_bench`. 
w/o w/ real 0m13.460s 0m11.771s user 0m0.248s 0m0.279s sys 0m12.039s 0m11.458s Originally it's noticed a 16.99% overhead of ptep_clear_flush() which has been eliminated by this patch: [root@localhost yang]# perf record -- ./swap_bench && perf report [...] 16.99% swap_bench [kernel.kallsyms] [k] ptep_clear_flush It is tested on 4,8,128 CPU platforms and shows to be beneficial on large systems but may not have improvement on small systems like on a 4 CPU platform. Also this patch improve the performance of page migration. Using pmbench and tries to migrate the pages of pmbench between node 0 and node 1 for 100 times for 1G memory, this patch decrease the time used around 20% (prev 18.338318910 sec after 13.981866350 sec) and saved the time used by ptep_clear_flush(). Link: https://lkml.kernel.org/r/20230717131004.12662-5-yangyicong@huawei.com Tested-by: Yicong Yang Tested-by: Xin Hao Tested-by: Punit Agrawal Signed-off-by: Barry Song Signed-off-by: Yicong Yang Reviewed-by: Kefeng Wang Reviewed-by: Xin Hao Reviewed-by: Anshuman Khandual Reviewed-by: Catalin Marinas Cc: Anshuman Khandual Cc: Jonathan Corbet Cc: Nadav Amit Cc: Mel Gorman Cc: Anshuman Khandual Cc: Arnd Bergmann Cc: Barry Song Cc: Darren Hart Cc: Jonathan Cameron Cc: lipeifeng Cc: Mark Rutland Cc: Peter Zijlstra Cc: Ryan Roberts Cc: Steven Miao Cc: Will Deacon Cc: Zeng Tao Signed-off-by: Andrew Morton --- .../features/vm/TLB/arch-support.txt | 2 +- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/tlbbatch.h | 12 +++++ arch/arm64/include/asm/tlbflush.h | 44 +++++++++++++++++-- 4 files changed, 55 insertions(+), 4 deletions(-) create mode 100644 arch/arm64/include/asm/tlbbatch.h diff --git a/Documentation/features/vm/TLB/arch-support.txt b/Documentation/features/vm/TLB/arch-support.txt index 7f049c251a79e4..76208db88f3bef 100644 --- a/Documentation/features/vm/TLB/arch-support.txt +++ b/Documentation/features/vm/TLB/arch-support.txt @@ -9,7 +9,7 @@ | alpha: | TODO | | arc: | TODO | | arm: | TODO | - | arm64: 
| N/A | + | arm64: | ok | | csky: | TODO | | hexagon: | TODO | | ia64: | TODO | diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index a2511b30d0f676..751d8c8821dbe0 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -96,6 +96,7 @@ config ARM64 select ARCH_SUPPORTS_NUMA_BALANCING select ARCH_SUPPORTS_PAGE_TABLE_CHECK select ARCH_SUPPORTS_PER_VMA_LOCK + select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT select ARCH_WANT_DEFAULT_BPF_JIT select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT diff --git a/arch/arm64/include/asm/tlbbatch.h b/arch/arm64/include/asm/tlbbatch.h new file mode 100644 index 00000000000000..fedb0b87b8db45 --- /dev/null +++ b/arch/arm64/include/asm/tlbbatch.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ARCH_ARM64_TLBBATCH_H +#define _ARCH_ARM64_TLBBATCH_H + +struct arch_tlbflush_unmap_batch { + /* + * For arm64, HW can do tlb shootdown, so we don't + * need to record cpumask for sending IPI + */ +}; + +#endif /* _ARCH_ARM64_TLBBATCH_H */ diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 412a3b9a3c25dc..3456866c6a1df7 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -254,17 +254,23 @@ static inline void flush_tlb_mm(struct mm_struct *mm) dsb(ish); } -static inline void flush_tlb_page_nosync(struct vm_area_struct *vma, - unsigned long uaddr) +static inline void __flush_tlb_page_nosync(struct mm_struct *mm, + unsigned long uaddr) { unsigned long addr; dsb(ishst); - addr = __TLBI_VADDR(uaddr, ASID(vma->vm_mm)); + addr = __TLBI_VADDR(uaddr, ASID(mm)); __tlbi(vale1is, addr); __tlbi_user(vale1is, addr); } +static inline void flush_tlb_page_nosync(struct vm_area_struct *vma, + unsigned long uaddr) +{ + return __flush_tlb_page_nosync(vma->vm_mm, uaddr); +} + static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr) { @@ -272,6 +278,38 @@ static inline void flush_tlb_page(struct 
vm_area_struct *vma, dsb(ish); } +static inline bool arch_tlbbatch_should_defer(struct mm_struct *mm) +{ +#ifdef CONFIG_ARM64_WORKAROUND_REPEAT_TLBI + /* + * TLB flush deferral is not required on systems which are affected by + * ARM64_WORKAROUND_REPEAT_TLBI, as __tlbi()/__tlbi_user() implementation + * will have two consecutive TLBI instructions with a dsb(ish) in between + * defeating the purpose (i.e save overall 'dsb ish' cost). + */ + if (unlikely(cpus_have_const_cap(ARM64_WORKAROUND_REPEAT_TLBI))) + return false; +#endif + return true; +} + +static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *batch, + struct mm_struct *mm, + unsigned long uaddr) +{ + __flush_tlb_page_nosync(mm, uaddr); +} + +static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm) +{ + dsb(ish); +} + +static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) +{ + dsb(ish); +} + /* * This is meant to avoid soft lock-ups on large TLB flushing ranges and not * necessarily a performance improvement. From 58f341f772bb48b3d7b13dd6c0f9705ebdd02592 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 17 Jul 2023 19:36:44 +0800 Subject: [PATCH 162/489] mm/memcg: minor cleanup for mc_handle_present_pte() When pagetable lock is held, the page will always be page_mapped(). So remove unneeded page_mapped() check. Also the page can't be freed from under us in this case. So use get_page() to get extra page reference to simplify the code. No functional change intended. 
Link: https://lkml.kernel.org/r/20230717113644.3026478-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/memcontrol.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 93e3cc581b51d2..51772df1abc522 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5639,7 +5639,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma, { struct page *page = vm_normal_page(vma, addr, ptent); - if (!page || !page_mapped(page)) + if (!page) return NULL; if (PageAnon(page)) { if (!(mc.flags & MOVE_ANON)) @@ -5648,8 +5648,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma, if (!(mc.flags & MOVE_FILE)) return NULL; } - if (!get_page_unless_zero(page)) - return NULL; + get_page(page); return page; } From 0792e47d566244e150e320708e0be708a9db1a93 Mon Sep 17 00:00:00 2001 From: Haifeng Xu Date: Mon, 17 Jul 2023 06:58:11 +0000 Subject: [PATCH 163/489] mm/mm_init.c: drop node_start_pfn from adjust_zone_range_for_zone_movable() node_start_pfn is not used in adjust_zone_range_for_zone_movable(), so it is pointless to waste a function argument. Drop the parameter. 
Link: https://lkml.kernel.org/r/20230717065811.1262-1-haifeng.xu@shopee.com Signed-off-by: Haifeng Xu Reviewed-by: David Hildenbrand Reviewed-by: Mike Rapoport (IBM) Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton --- mm/mm_init.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/mm_init.c b/mm/mm_init.c index 2daae1dd575553..7e3fcdbe997b08 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1105,7 +1105,6 @@ void __ref memmap_init_zone_device(struct zone *zone, */ static void __init adjust_zone_range_for_zone_movable(int nid, unsigned long zone_type, - unsigned long node_start_pfn, unsigned long node_end_pfn, unsigned long *zone_start_pfn, unsigned long *zone_end_pfn) @@ -1222,9 +1221,8 @@ static unsigned long __init zone_spanned_pages_in_node(int nid, /* Get the start and end of the zone */ *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); - adjust_zone_range_for_zone_movable(nid, zone_type, - node_start_pfn, node_end_pfn, - zone_start_pfn, zone_end_pfn); + adjust_zone_range_for_zone_movable(nid, zone_type, node_end_pfn, + zone_start_pfn, zone_end_pfn); /* Check that this node has pages within the zone's required range */ if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn) From aee79d4e5271cee4ffa89ed830189929a6272eb8 Mon Sep 17 00:00:00 2001 From: "Zhu, Lipeng" Date: Sun, 16 Jul 2023 22:56:54 +0800 Subject: [PATCH 164/489] fs/address_space: add alignment padding for i_mmap and i_mmap_rwsem to mitigate false sharing. When running UnixBench/Shell Scripts, we observed high false sharing for accessing i_mmap against i_mmap_rwsem. UnixBench/Shell Scripts are typical load/execute command test scenarios, which concurrently launch->execute->exit a lot of shell commands.
A lot of processes invoke vma_interval_tree_remove which touch "i_mmap", the call stack: ----vma_interval_tree_remove |----unlink_file_vma | free_pgtables | |----exit_mmap | | mmput | | |----begin_new_exec | | | load_elf_binary | | | bprm_execve Meanwhile, there are a lot of processes touch 'i_mmap_rwsem' to acquire the semaphore in order to access 'i_mmap'. In existing 'address_space' layout, 'i_mmap' and 'i_mmap_rwsem' are in the same cacheline. The patch places the i_mmap and i_mmap_rwsem in separate cache lines to avoid this false sharing problem. With this patch, based on kernel v6.4.0, on Intel Sapphire Rapids 112C/224T platform, the score improves by ~5.3%. And perf c2c tool shows the false sharing is resolved as expected, the symbol vma_interval_tree_remove disappeared in cache line 0 after this change. Baseline: ================================================= Shared Cache Line Distribution Pareto ================================================= ------------------------------------------------------------- 0 3729 5791 0 0 0xff19b3818445c740 ------------------------------------------------------------- 3.27% 3.02% 0.00% 0.00% 0x18 0 1 0xffffffffa194403b 604 483 389 692 203 [k] vma_interval_tree_insert [kernel.kallsyms] vma_interval_tree_insert+75 0 1 4.13% 3.63% 0.00% 0.00% 0x20 0 1 0xffffffffa19440a2 553 413 415 962 215 [k] vma_interval_tree_remove [kernel.kallsyms] vma_interval_tree_remove+18 0 1 2.04% 1.35% 0.00% 0.00% 0x28 0 1 0xffffffffa219a1d6 1210 855 460 1229 222 [k] rwsem_down_write_slowpath [kernel.kallsyms] rwsem_down_write_slowpath+678 0 1 0.62% 1.85% 0.00% 0.00% 0x28 0 1 0xffffffffa219a1bf 762 329 577 527 198 [k] rwsem_down_write_slowpath [kernel.kallsyms] rwsem_down_write_slowpath+655 0 1 0.48% 0.31% 0.00% 0.00% 0x28 0 1 0xffffffffa219a58c 1677 1476 733 1544 224 [k] down_write [kernel.kallsyms] down_write+28 0 1 0.05% 0.07% 0.00% 0.00% 0x28 0 1 0xffffffffa219a21d 1040 819 689 33 27 [k] rwsem_down_write_slowpath [kernel.kallsyms] 
rwsem_down_write_slowpath+749 0 1 0.00% 0.05% 0.00% 0.00% 0x28 0 1 0xffffffffa17707db 0 1005 786 1373 223 [k] up_write [kernel.kallsyms] up_write+27 0 1 0.00% 0.02% 0.00% 0.00% 0x28 0 1 0xffffffffa219a064 0 233 778 32 30 [k] rwsem_down_write_slowpath [kernel.kallsyms] rwsem_down_write_slowpath+308 0 1 33.82% 34.10% 0.00% 0.00% 0x30 0 1 0xffffffffa1770945 779 495 534 6011 224 [k] rwsem_spin_on_owner [kernel.kallsyms] rwsem_spin_on_owner+53 0 1 17.06% 15.28% 0.00% 0.00% 0x30 0 1 0xffffffffa1770915 593 438 468 2715 224 [k] rwsem_spin_on_owner [kernel.kallsyms] rwsem_spin_on_owner+5 0 1 3.54% 3.52% 0.00% 0.00% 0x30 0 1 0xffffffffa2199f84 881 601 583 1421 223 [k] rwsem_down_write_slowpath [kernel.kallsyms] rwsem_down_write_slowpath+84 0 1 With this change: ------------------------------------------------------------- 0 556 838 0 0 0xff2780d7965d2780 ------------------------------------------------------------- 0.18% 0.60% 0.00% 0.00% 0x8 0 1 0xffffffffafff27b8 503 453 569 14 13 [k] do_dentry_open [kernel.kallsyms] do_dentry_open+456 0 1 0.54% 0.12% 0.00% 0.00% 0x8 0 1 0xffffffffaffc51ac 510 199 428 15 12 [k] hugepage_vma_check [kernel.kallsyms] hugepage_vma_check+252 0 1 1.80% 2.15% 0.00% 0.00% 0x18 0 1 0xffffffffb079a1d6 1778 799 343 215 136 [k] rwsem_down_write_slowpath [kernel.kallsyms] rwsem_down_write_slowpath+678 0 1 0.54% 1.31% 0.00% 0.00% 0x18 0 1 0xffffffffb079a1bf 547 296 528 91 71 [k] rwsem_down_write_slowpath [kernel.kallsyms] rwsem_down_write_slowpath+655 0 1 0.72% 0.72% 0.00% 0.00% 0x18 0 1 0xffffffffb079a58c 1479 1534 676 288 163 [k] down_write [kernel.kallsyms] down_write+28 0 1 0.00% 0.12% 0.00% 0.00% 0x18 0 1 0xffffffffafd707db 0 2381 744 282 158 [k] up_write [kernel.kallsyms] up_write+27 0 1 0.00% 0.12% 0.00% 0.00% 0x18 0 1 0xffffffffb079a064 0 239 518 6 6 [k] rwsem_down_write_slowpath [kernel.kallsyms] rwsem_down_write_slowpath+308 0 1 46.58% 47.02% 0.00% 0.00% 0x20 0 1 0xffffffffafd70945 704 403 499 1137 219 [k] rwsem_spin_on_owner [kernel.kallsyms] 
rwsem_spin_on_owner+53 0 1 23.92% 25.78% 0.00% 0.00% 0x20 0 1 0xffffffffafd70915 558 413 500 542 185 [k] rwsem_spin_on_owner [kernel.kallsyms] rwsem_spin_on_owner+5 0 1 v1->v2: change padding to exchange fields. Link: https://lkml.kernel.org/r/20230716145653.20122-1-lipeng.zhu@intel.com Signed-off-by: Lipeng Zhu Reviewed-by: Tim Chen Cc: Alexander Viro Cc: Christian Brauner Cc: Yu Ma Signed-off-by: Andrew Morton --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 6867512907d6e5..bb00d37a6ca020 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -447,11 +447,11 @@ struct address_space { atomic_t nr_thps; #endif struct rb_root_cached i_mmap; - struct rw_semaphore i_mmap_rwsem; unsigned long nrpages; pgoff_t writeback_index; const struct address_space_operations *a_ops; unsigned long flags; + struct rw_semaphore i_mmap_rwsem; errseq_t wb_err; spinlock_t private_lock; struct list_head private_list; From 4445e58264aea8ec6bb1287add79606f0e3f3988 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Sat, 15 Jul 2023 17:39:20 +0300 Subject: [PATCH 165/489] maple_tree: mtree_insert*: fix typo in kernel-doc description Replace "Insert and entry at a give index" with "Insert an entry at a given index" Link: https://lkml.kernel.org/r/20230715143920.994812-1-rppt@kernel.org Signed-off-by: Mike Rapoport (IBM) Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index cef47ce8edddf0..616ec7f3be8123 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -6274,7 +6274,7 @@ int mtree_store(struct maple_tree *mt, unsigned long index, void *entry, EXPORT_SYMBOL(mtree_store); /** - * mtree_insert_range() - Insert an entry at a give range if there is no value. + * mtree_insert_range() - Insert an entry at a given range if there is no value. 
* @mt: The maple tree * @first: The start of the range * @last: The end of the range @@ -6310,7 +6310,7 @@ int mtree_insert_range(struct maple_tree *mt, unsigned long first, EXPORT_SYMBOL(mtree_insert_range); /** - * mtree_insert() - Insert an entry at a give index if there is no value. + * mtree_insert() - Insert an entry at a given index if there is no value. * @mt: The maple tree * @index : The index to store the value * @entry: The entry to store From 4ae6944d15727c50ff1c0bb3fe38b9b412520d85 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (IBM)" Date: Sat, 15 Jul 2023 11:40:38 +0300 Subject: [PATCH 166/489] maple_tree: mtree_insert: fix typo in kernel-doc description of GFP flags Replace FGP_FLAGS with GFP_FLAGS Link: https://lkml.kernel.org/r/20230715084038.987955-1-rppt@kernel.org Signed-off-by: Mike Rapoport (IBM) Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 616ec7f3be8123..b6b3973897f061 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -6314,7 +6314,7 @@ EXPORT_SYMBOL(mtree_insert_range); * @mt: The maple tree * @index : The index to store the value * @entry: The entry to store - * @gfp: The FGP_FLAGS to use for allocations. + * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 on success, -EEXISTS if the range is occupied, -EINVAL on invalid * request, -ENOMEM if memory could not be allocated. From 8d3a7d797c1a18a24f602a7398103ee894c5bc3b Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 15 Jul 2023 11:51:11 +0800 Subject: [PATCH 167/489] memory tier: use helper macro __ATTR_RW() Use helper macro __ATTR_RW to define numa demotion attributes. Minor readability improvement. 
Link: https://lkml.kernel.org/r/20230715035111.2656784-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton --- mm/memory-tiers.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index c49ab03f49b1ed..37a4f59d9585b9 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -672,16 +672,16 @@ bool numa_demotion_enabled = false; #ifdef CONFIG_MIGRATION #ifdef CONFIG_SYSFS -static ssize_t numa_demotion_enabled_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) +static ssize_t demotion_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) { return sysfs_emit(buf, "%s\n", numa_demotion_enabled ? "true" : "false"); } -static ssize_t numa_demotion_enabled_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) +static ssize_t demotion_enabled_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) { ssize_t ret; @@ -693,8 +693,7 @@ static ssize_t numa_demotion_enabled_store(struct kobject *kobj, } static struct kobj_attribute numa_demotion_enabled_attr = - __ATTR(demotion_enabled, 0644, numa_demotion_enabled_show, - numa_demotion_enabled_store); + __ATTR_RW(demotion_enabled); static struct attribute *numa_attrs[] = { &numa_demotion_enabled_attr.attr, From cabdf74e6b319c989eb8e812f1854291ae0af1c0 Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Tue, 18 Jul 2023 15:30:19 +0800 Subject: [PATCH 168/489] mm: kfence: allocate kfence_metadata at runtime kfence_metadata is currently a static array. For the purpose of allocating scalable __kfence_pool, we first change it to runtime allocation of metadata. Since the size of an object of kfence_metadata is 1160 bytes, we can save at least 72 pages (with default 256 objects) without enabling kfence. 
[akpm@linux-foundation.org: restore newline, per Marco] Link: https://lkml.kernel.org/r/20230718073019.52513-1-zhangpeng.00@bytedance.com Signed-off-by: Peng Zhang Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/kfence.h | 11 ++-- mm/kfence/core.c | 123 ++++++++++++++++++++++++++++------------- mm/kfence/kfence.h | 5 +- mm/mm_init.c | 2 +- 4 files changed, 97 insertions(+), 44 deletions(-) diff --git a/include/linux/kfence.h b/include/linux/kfence.h index 726857a4b68054..401af475751413 100644 --- a/include/linux/kfence.h +++ b/include/linux/kfence.h @@ -59,15 +59,16 @@ static __always_inline bool is_kfence_address(const void *addr) } /** - * kfence_alloc_pool() - allocate the KFENCE pool via memblock + * kfence_alloc_pool_and_metadata() - allocate the KFENCE pool and KFENCE + * metadata via memblock */ -void __init kfence_alloc_pool(void); +void __init kfence_alloc_pool_and_metadata(void); /** * kfence_init() - perform KFENCE initialization at boot time * - * Requires that kfence_alloc_pool() was called before. This sets up the - * allocation gate timer, and requires that workqueues are available. + * Requires that kfence_alloc_pool_and_metadata() was called before. This sets + * up the allocation gate timer, and requires that workqueues are available. 
*/ void __init kfence_init(void); @@ -223,7 +224,7 @@ bool __kfence_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *sla #else /* CONFIG_KFENCE */ static inline bool is_kfence_address(const void *addr) { return false; } -static inline void kfence_alloc_pool(void) { } +static inline void kfence_alloc_pool_and_metadata(void) { } static inline void kfence_init(void) { } static inline void kfence_shutdown_cache(struct kmem_cache *s) { } static inline void *kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) { return NULL; } diff --git a/mm/kfence/core.c b/mm/kfence/core.c index dad3c0eb70a01d..96fd0411f5c586 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -116,7 +116,15 @@ EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */ * backing pages (in __kfence_pool). */ static_assert(CONFIG_KFENCE_NUM_OBJECTS > 0); -struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS]; +struct kfence_metadata *kfence_metadata __read_mostly; + +/* + * If kfence_metadata is not NULL, it may be accessed by kfence_shutdown_cache(). + * So introduce kfence_metadata_init to initialize metadata, and then make + * kfence_metadata visible after initialization is successful. This prevents + * potential UAF or access to uninitialized metadata. + */ +static struct kfence_metadata *kfence_metadata_init __read_mostly; /* Freelist with available objects. 
*/ static struct list_head kfence_freelist = LIST_HEAD_INIT(kfence_freelist); @@ -591,7 +599,7 @@ static unsigned long kfence_init_pool(void) __folio_set_slab(slab_folio(slab)); #ifdef CONFIG_MEMCG - slab->memcg_data = (unsigned long)&kfence_metadata[i / 2 - 1].objcg | + slab->memcg_data = (unsigned long)&kfence_metadata_init[i / 2 - 1].objcg | MEMCG_DATA_OBJCGS; #endif } @@ -610,7 +618,7 @@ static unsigned long kfence_init_pool(void) } for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { - struct kfence_metadata *meta = &kfence_metadata[i]; + struct kfence_metadata *meta = &kfence_metadata_init[i]; /* Initialize metadata. */ INIT_LIST_HEAD(&meta->list); @@ -626,6 +634,12 @@ static unsigned long kfence_init_pool(void) addr += 2 * PAGE_SIZE; } + /* + * Make kfence_metadata visible only when initialization is successful. + * Otherwise, if the initialization fails and kfence_metadata is freed, + * it may cause UAF in kfence_shutdown_cache(). + */ + smp_store_release(&kfence_metadata, kfence_metadata_init); return 0; reset_slab: @@ -672,26 +686,10 @@ static bool __init kfence_init_pool_early(void) */ memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool)); __kfence_pool = NULL; - return false; -} - -static bool kfence_init_pool_late(void) -{ - unsigned long addr, free_size; - addr = kfence_init_pool(); - - if (!addr) - return true; + memblock_free_late(__pa(kfence_metadata_init), KFENCE_METADATA_SIZE); + kfence_metadata_init = NULL; - /* Same as above. 
*/ - free_size = KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool); -#ifdef CONFIG_CONTIG_ALLOC - free_contig_range(page_to_pfn(virt_to_page((void *)addr)), free_size / PAGE_SIZE); -#else - free_pages_exact((void *)addr, free_size); -#endif - __kfence_pool = NULL; return false; } @@ -841,19 +839,30 @@ static void toggle_allocation_gate(struct work_struct *work) /* === Public interface ===================================================== */ -void __init kfence_alloc_pool(void) +void __init kfence_alloc_pool_and_metadata(void) { if (!kfence_sample_interval) return; - /* if the pool has already been initialized by arch, skip the below. */ - if (__kfence_pool) - return; - - __kfence_pool = memblock_alloc(KFENCE_POOL_SIZE, PAGE_SIZE); - + /* + * If the pool has already been initialized by arch, there is no need to + * re-allocate the memory pool. + */ if (!__kfence_pool) + __kfence_pool = memblock_alloc(KFENCE_POOL_SIZE, PAGE_SIZE); + + if (!__kfence_pool) { pr_err("failed to allocate pool\n"); + return; + } + + /* The memory allocated by memblock has been zeroed out. 
*/ + kfence_metadata_init = memblock_alloc(KFENCE_METADATA_SIZE, PAGE_SIZE); + if (!kfence_metadata_init) { + pr_err("failed to allocate metadata\n"); + memblock_free(__kfence_pool, KFENCE_POOL_SIZE); + __kfence_pool = NULL; + } } static void kfence_init_enable(void) @@ -895,33 +904,69 @@ void __init kfence_init(void) static int kfence_init_late(void) { - const unsigned long nr_pages = KFENCE_POOL_SIZE / PAGE_SIZE; + const unsigned long nr_pages_pool = KFENCE_POOL_SIZE / PAGE_SIZE; + const unsigned long nr_pages_meta = KFENCE_METADATA_SIZE / PAGE_SIZE; + unsigned long addr = (unsigned long)__kfence_pool; + unsigned long free_size = KFENCE_POOL_SIZE; + int err = -ENOMEM; + #ifdef CONFIG_CONTIG_ALLOC struct page *pages; - pages = alloc_contig_pages(nr_pages, GFP_KERNEL, first_online_node, NULL); + pages = alloc_contig_pages(nr_pages_pool, GFP_KERNEL, first_online_node, + NULL); if (!pages) return -ENOMEM; + __kfence_pool = page_to_virt(pages); + pages = alloc_contig_pages(nr_pages_meta, GFP_KERNEL, first_online_node, + NULL); + if (pages) + kfence_metadata_init = page_to_virt(pages); #else - if (nr_pages > MAX_ORDER_NR_PAGES) { + if (nr_pages_pool > MAX_ORDER_NR_PAGES || + nr_pages_meta > MAX_ORDER_NR_PAGES) { pr_warn("KFENCE_NUM_OBJECTS too large for buddy allocator\n"); return -EINVAL; } + __kfence_pool = alloc_pages_exact(KFENCE_POOL_SIZE, GFP_KERNEL); if (!__kfence_pool) return -ENOMEM; + + kfence_metadata_init = alloc_pages_exact(KFENCE_METADATA_SIZE, GFP_KERNEL); #endif - if (!kfence_init_pool_late()) { - pr_err("%s failed\n", __func__); - return -EBUSY; + if (!kfence_metadata_init) + goto free_pool; + + memzero_explicit(kfence_metadata_init, KFENCE_METADATA_SIZE); + addr = kfence_init_pool(); + if (!addr) { + kfence_init_enable(); + kfence_debugfs_init(); + return 0; } - kfence_init_enable(); - kfence_debugfs_init(); + pr_err("%s failed\n", __func__); + free_size = KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool); + err = -EBUSY; - return 0; +#ifdef 
CONFIG_CONTIG_ALLOC + free_contig_range(page_to_pfn(virt_to_page((void *)kfence_metadata_init)), + nr_pages_meta); +free_pool: + free_contig_range(page_to_pfn(virt_to_page((void *)addr)), + free_size / PAGE_SIZE); +#else + free_pages_exact((void *)kfence_metadata_init, KFENCE_METADATA_SIZE); +free_pool: + free_pages_exact((void *)addr, free_size); +#endif + + kfence_metadata_init = NULL; + __kfence_pool = NULL; + return err; } static int kfence_enable_late(void) @@ -941,6 +986,10 @@ void kfence_shutdown_cache(struct kmem_cache *s) struct kfence_metadata *meta; int i; + /* Pairs with release in kfence_init_pool(). */ + if (!smp_load_acquire(&kfence_metadata)) + return; + for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) { bool in_use; diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h index 392fb273e7bd92..f46fbb03062b9c 100644 --- a/mm/kfence/kfence.h +++ b/mm/kfence/kfence.h @@ -102,7 +102,10 @@ struct kfence_metadata { #endif }; -extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS]; +#define KFENCE_METADATA_SIZE PAGE_ALIGN(sizeof(struct kfence_metadata) * \ + CONFIG_KFENCE_NUM_OBJECTS) + +extern struct kfence_metadata *kfence_metadata; static inline struct kfence_metadata *addr_to_metadata(unsigned long addr) { diff --git a/mm/mm_init.c b/mm/mm_init.c index 7e3fcdbe997b08..acb0ac19467255 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -2767,7 +2767,7 @@ void __init mm_core_init(void) */ page_ext_init_flatmem(); mem_debugging_and_hardening_init(); - kfence_alloc_pool(); + kfence_alloc_pool_and_metadata(); report_meminit(); kmsan_init_shadow(); stack_depot_early_init(); From 89be82b4fed258b63a201d92fca95e7c55913c23 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 18 Jul 2023 17:21:36 +0800 Subject: [PATCH 169/489] mm/rmap: correct stale comment of rmap_walk_anon and rmap_walk_file 1. update page to folio in comment 2. 
add comment for the newly added @locked Link: https://lkml.kernel.org/r/20230718092136.1935789-1-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: David Hildenbrand Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/rmap.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 5717517e404089..1355bf686fae9e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -2397,11 +2397,12 @@ static struct anon_vma *rmap_walk_anon_lock(struct folio *folio, /* * rmap_walk_anon - do something to anonymous page using the object-based * rmap method - * @page: the page to be handled + * @folio: the folio to be handled * @rwc: control variable according to each walk type + * @locked: caller holds relevant rmap lock * - * Find all the mappings of a page using the mapping pointer and the vma chains - * contained in the anon_vma struct it points to. + * Find all the mappings of a folio using the mapping pointer and the vma + * chains contained in the anon_vma struct it points to. */ static void rmap_walk_anon(struct folio *folio, struct rmap_walk_control *rwc, bool locked) @@ -2445,10 +2446,11 @@ static void rmap_walk_anon(struct folio *folio, /* * rmap_walk_file - do something to file page using the object-based rmap method - * @page: the page to be handled + * @folio: the folio to be handled * @rwc: control variable according to each walk type + * @locked: caller holds relevant rmap lock * - * Find all the mappings of a page using the mapping pointer and the vma chains + * Find all the mappings of a folio using the mapping pointer and the vma chains * contained in the address_space struct it points to.
*/ static void rmap_walk_file(struct folio *folio, From affd26b1fbd67fceea70d9ceac40ff4815afbeb5 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 19 Jul 2023 11:41:45 -0700 Subject: [PATCH 170/489] mm/hugetlb: get rid of page_hstate() Convert the last page_hstate() user to use folio_hstate() so page_hstate() can be safely removed. Link: https://lkml.kernel.org/r/20230719184145.301911-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Mike Kravetz Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 10 ---------- mm/hugetlb.c | 6 +++--- mm/page_isolation.c | 8 ++++---- 3 files changed, 7 insertions(+), 17 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 9f4bac3df59e4b..0a393bc02f25b4 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -841,11 +841,6 @@ static inline struct hstate *folio_hstate(struct folio *folio) return size_to_hstate(folio_size(folio)); } -static inline struct hstate *page_hstate(struct page *page) -{ - return folio_hstate(page_folio(page)); -} - static inline unsigned hstate_index_to_shift(unsigned index) { return hstates[index].order + PAGE_SHIFT; @@ -1062,11 +1057,6 @@ static inline struct hstate *folio_hstate(struct folio *folio) return NULL; } -static inline struct hstate *page_hstate(struct page *page) -{ - return NULL; -} - static inline struct hstate *size_to_hstate(unsigned long size) { return NULL; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7b076eb07a290e..412a3eec081cfe 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1785,10 +1785,10 @@ static void free_hpage_workfn(struct work_struct *work) node = node->next; page->mapping = NULL; /* - * The VM_BUG_ON_PAGE(!PageHuge(page), page) in page_hstate() - * is going to trigger because a previous call to + * The VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio) in + * folio_hstate() is going to trigger because a previous call to * remove_hugetlb_folio() will 
call folio_set_compound_dtor - * (folio, NULL_COMPOUND_DTOR), so do not use page_hstate() + * (folio, NULL_COMPOUND_DTOR), so do not use folio_hstate() * directly. */ h = size_to_hstate(page_size(page)); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 6599cc965e216a..bcf99ba747a05a 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -79,17 +79,17 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e * handle each tail page individually in migration. */ if (PageHuge(page) || PageTransCompound(page)) { - struct page *head = compound_head(page); + struct folio *folio = page_folio(page); unsigned int skip_pages; if (PageHuge(page)) { - if (!hugepage_migration_supported(page_hstate(head))) + if (!hugepage_migration_supported(folio_hstate(folio))) return page; - } else if (!PageLRU(head) && !__PageMovable(head)) { + } else if (!folio_test_lru(folio) && !__folio_test_movable(folio)) { return page; } - skip_pages = compound_nr(head) - (page - head); + skip_pages = folio_nr_pages(folio) - folio_page_idx(folio, page); pfn += skip_pages - 1; continue; } From 2574d5e4df32a7eed7176665a5ea08eadb704d70 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 14 Jul 2023 15:55:48 -0400 Subject: [PATCH 171/489] mm/mmap: clean up validate_mm() calls Patch series "More strict maple tree lockdep", v2. Linus asked for more strict maple tree lockdep checking [1] and for them to resume the normal path through Andrew's tree. This series of patches adds checks to ensure the lock is held in write mode during the write path of the maple tree instead of checking if it's held at all. It also reduces the validate_mm() calls by consolidating into commonly used functions (patch 0001), and removes the necessity of holding the lock on the detached tree during munmap() operations. This patch (of 4): validate_mm() calls are too spread out and duplicated in numerous locations.
Also, now that the stack write is done under the write lock, it is not necessary to validate the mm prior to write operations. Add a validate_mm() to the stack expansions, and to vma_complete() so that numerous others may be dropped. Note that vma_link() (and also insert_vm_struct() by call path) already call validate_mm(). vma_merge() also had an unnecessary call to vma_iter_free() since the logic change to abort earlier if no merging is necessary. Drop extra validate_mm() calls at the start of functions and error paths which won't write to the tree. Relocate the validate_mm() call in the do_brk_flags() to avoid re-running the same test when vma_complete() is used. The call within the error path of mmap_region() is left intentionally because of the complexity of the function and the potential of drivers modifying the tree. Link: https://lkml.kernel.org/r/20230714195551.894800-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20230714195551.894800-2-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Linus Torvalds Cc: Oliver Sang Signed-off-by: Andrew Morton --- mm/mmap.c | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index e1586b2f938e5e..cdbeff089049af 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -594,6 +594,7 @@ static inline void vma_complete(struct vma_prepare *vp, } if (vp->insert && vp->file) uprobe_mmap(vp->insert); + validate_mm(mm); } /* @@ -676,7 +677,6 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_iter_store(vmi, vma); vma_complete(&vp, vmi, vma->vm_mm); - validate_mm(vma->vm_mm); return 0; nomem: @@ -716,7 +716,6 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, vma->vm_end = end; vma->vm_pgoff = pgoff; vma_complete(&vp, vmi, vma->vm_mm); - validate_mm(vma->vm_mm); return 0; } @@ -889,7 +888,6 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, pgoff_t pglen = (end - addr) >> PAGE_SHIFT; long adj_start = 0; - validate_mm(mm); /* * We later require that vma->vm_flags == vm_flags, * so this tests vma->vm_flags & VM_SPECIAL, too. 
@@ -1016,10 +1014,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, } vma_complete(&vp, vmi, mm); - vma_iter_free(vmi); - validate_mm(mm); khugepaged_enter_vma(res, vm_flags); - return res; } @@ -1194,7 +1189,6 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags_t vm_flags; int pkey = 0; - validate_mm(mm); *populate = 0; if (!len) @@ -2023,6 +2017,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address) anon_vma_unlock_write(vma->anon_vma); khugepaged_enter_vma(vma, vma->vm_flags); mas_destroy(&mas); + validate_mm(mm); return error; } #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ @@ -2113,6 +2108,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) anon_vma_unlock_write(vma->anon_vma); khugepaged_enter_vma(vma, vma->vm_flags); mas_destroy(&mas); + validate_mm(mm); return error; } @@ -2290,7 +2286,6 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas) remove_vma(vma, false); } vm_unacct_memory(nr_accounted); - validate_mm(mm); } /* @@ -2327,8 +2322,6 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct *new; int err; - validate_mm(vma->vm_mm); - WARN_ON(vma->vm_start >= addr); WARN_ON(vma->vm_end <= addr); @@ -2385,7 +2378,6 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, /* Success. */ if (new_below) vma_next(vmi); - validate_mm(vma->vm_mm); return 0; out_free_mpol: @@ -2394,7 +2386,6 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, vma_iter_free(vmi); out_free_vma: vm_area_free(new); - validate_mm(vma->vm_mm); return err; } @@ -3045,7 +3036,6 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, struct mm_struct *mm = current->mm; struct vma_prepare vp; - validate_mm(mm); /* * Check against address space limits by the changed size * Note: This happens *after* clearing old mappings in some code paths. 
@@ -3097,6 +3087,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, goto mas_store_fail; mm->map_count++; + validate_mm(mm); ksm_add_vma(vma); out: perf_event_mmap(vma); @@ -3105,7 +3096,6 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, if (flags & VM_LOCKED) mm->locked_vm += (len >> PAGE_SHIFT); vm_flags_set(vma, VM_SOFTDIRTY); - validate_mm(mm); return 0; mas_store_fail: @@ -3286,7 +3276,6 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, bool faulted_in_anon_vma = true; VMA_ITERATOR(vmi, mm, addr); - validate_mm(mm); /* * If anonymous vma has not yet been faulted, update new pgoff * to match new location, to increase its chance of merging. @@ -3345,7 +3334,6 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, goto out_vma_link; *need_rmap_locks = false; } - validate_mm(mm); return new_vma; out_vma_link: @@ -3361,7 +3349,6 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, out_free_vma: vm_area_free(new_vma); out: - validate_mm(mm); return NULL; } @@ -3498,7 +3485,6 @@ static struct vm_area_struct *__install_special_mapping( int ret; struct vm_area_struct *vma; - validate_mm(mm); vma = vm_area_alloc(mm); if (unlikely(vma == NULL)) return ERR_PTR(-ENOMEM); @@ -3521,12 +3507,10 @@ static struct vm_area_struct *__install_special_mapping( perf_event_mmap(vma); - validate_mm(mm); return vma; out: vm_area_free(vma); - validate_mm(mm); return ERR_PTR(ret); } From 134d153c9346fc1b2842cacec8720da3a9667a11 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 14 Jul 2023 15:55:49 -0400 Subject: [PATCH 172/489] maple_tree: relax lockdep checks for on-stack trees To support early release of the maple tree locks, do not lockdep check the lock if it is set to NULL. This is intended for the special case on-stack use of tracking entries and not for general use. Link: https://lkml.kernel.org/r/20230714195551.894800-3-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Linus Torvalds Cc: Oliver Sang Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 7769270b85e89e..6618c151288676 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -182,7 +182,9 @@ enum maple_type { #ifdef CONFIG_LOCKDEP typedef struct lockdep_map *lockdep_map_p; -#define mt_lock_is_held(mt) lock_is_held(mt->ma_external_lock) +#define mt_lock_is_held(mt) \ + (!(mt)->ma_external_lock || lock_is_held((mt)->ma_external_lock)) + #define mt_set_external_lock(mt, lock) \ (mt)->ma_external_lock = &(lock)->dep_map #else From 02fdb25fb41c563a3afbd4a97469c527a6c5abbf Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 5 Jul 2023 14:47:49 -0400 Subject: [PATCH 173/489] mm/mmap: change detached vma locking scheme Don't set the lock to the mm lock so that the detached VMA tree does not complain about being unlocked when the mmap_lock is dropped prior to freeing the tree. Introduce mt_on_stack() for setting the external lock to NULL only when LOCKDEP is used. Move the destroying of the detached tree outside the mmap lock all together. Link: https://lkml.kernel.org/r/20230719183142.ktgcmuj2pnlr3h3s@revolver Signed-off-by: Liam R. 
Howlett Cc: Linus Torvalds Cc: Oliver Sang Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 3 +++ mm/mmap.c | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 6618c151288676..e278b95984282e 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -187,10 +187,13 @@ typedef struct lockdep_map *lockdep_map_p; #define mt_set_external_lock(mt, lock) \ (mt)->ma_external_lock = &(lock)->dep_map + +#define mt_on_stack(mt) (mt).ma_external_lock = NULL #else typedef struct { /* nothing */ } lockdep_map_p; #define mt_lock_is_held(mt) 1 #define mt_set_external_lock(mt, lock) do { } while (0) +#define mt_on_stack(mt) do { } while (0) #endif /* diff --git a/mm/mmap.c b/mm/mmap.c index cdbeff089049af..7bd1caa09dddf0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2428,7 +2428,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long locked_vm = 0; MA_STATE(mas_detach, &mt_detach, 0, 0); mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); - mt_set_external_lock(&mt_detach, &mm->mmap_lock); + mt_on_stack(mt_detach); /* * If we need to split any vma, do it now to save pain later. @@ -2546,11 +2546,11 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, /* Statistics and freeing VMAs */ mas_set(&mas_detach, start); remove_mt(mm, &mas_detach); - __mt_destroy(&mt_detach); validate_mm(mm); if (unlock) mmap_read_unlock(mm); + __mt_destroy(&mt_detach); return 0; clear_tree_failed: From 19a462f06eb5a78e0c3ebe4fd4fbdc71620b8788 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 14 Jul 2023 15:55:51 -0400 Subject: [PATCH 174/489] maple_tree: Be more strict about locking Use lockdep to check the write path in the maple tree holds the lock in write mode. Introduce mt_write_lock_is_held() to check if the lock is held for writing. 
Update the necessary checks for rcu_dereference_protected() to use the new write lock check. Link: https://lkml.kernel.org/r/20230714195551.894800-5-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Linus Torvalds Cc: Oliver Sang Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 7 ++++++- lib/maple_tree.c | 10 ++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index e278b95984282e..949f911bf955c0 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -185,13 +185,18 @@ typedef struct lockdep_map *lockdep_map_p; #define mt_lock_is_held(mt) \ (!(mt)->ma_external_lock || lock_is_held((mt)->ma_external_lock)) +#define mt_write_lock_is_held(mt) \ + (!(mt)->ma_external_lock || \ + lock_is_held_type((mt)->ma_external_lock, 0)) + #define mt_set_external_lock(mt, lock) \ (mt)->ma_external_lock = &(lock)->dep_map #define mt_on_stack(mt) (mt).ma_external_lock = NULL #else typedef struct { /* nothing */ } lockdep_map_p; -#define mt_lock_is_held(mt) 1 +#define mt_lock_is_held(mt) 1 +#define mt_write_lock_is_held(mt) 1 #define mt_set_external_lock(mt, lock) do { } while (0) #define mt_on_stack(mt) do { } while (0) #endif diff --git a/lib/maple_tree.c b/lib/maple_tree.c index b6b3973897f061..3b6f8c8dac6501 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -777,6 +777,12 @@ static inline void __rcu **ma_slots(struct maple_node *mn, enum maple_type mt) } } +static inline bool mt_write_locked(const struct maple_tree *mt) +{ + return mt_external_lock(mt) ? mt_write_lock_is_held(mt) : + lockdep_is_held(&mt->ma_lock); +} + static inline bool mt_locked(const struct maple_tree *mt) { return mt_external_lock(mt) ? 
mt_lock_is_held(mt) : @@ -792,7 +798,7 @@ static inline void *mt_slot(const struct maple_tree *mt, static inline void *mt_slot_locked(struct maple_tree *mt, void __rcu **slots, unsigned char offset) { - return rcu_dereference_protected(slots[offset], mt_locked(mt)); + return rcu_dereference_protected(slots[offset], mt_write_locked(mt)); } /* * mas_slot_locked() - Get the slot value when holding the maple tree lock. @@ -835,7 +841,7 @@ static inline void *mas_root(struct ma_state *mas) static inline void *mt_root_locked(struct maple_tree *mt) { - return rcu_dereference_protected(mt->ma_root, mt_locked(mt)); + return rcu_dereference_protected(mt->ma_root, mt_write_locked(mt)); } /* From 38b14e2e3de9ff77ff00642ebef46e391ccf0aaf Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Tue, 25 Jul 2023 23:42:03 +1000 Subject: [PATCH 175/489] arm64/smmu: use TLBI ASID when invalidating entire range Patch series "Invalidate secondary IOMMU TLB on permission upgrade", v4. The main change is to move secondary TLB invalidation mmu notifier callbacks into the architecture specific TLB flushing functions. This makes secondary TLB invalidation mostly match CPU invalidation while still allowing efficient range based invalidations based on the existing TLB batching code. This patch (of 5): The ARM SMMU has a specific command for invalidating the TLB for an entire ASID. Currently this is used for the IO_PGTABLE API but not for ATS when called from the MMU notifier. The current implementation of notifiers does not attempt to invalidate such a large address range, instead walking each VMA and invalidating each range individually during mmap removal. However in future SMMU TLB invalidations are going to be sent as part of the normal flush_tlb_*() kernel calls. To better deal with that add handling to use TLBI ASID when invalidating the entire address space. 
Link: https://lkml.kernel.org/r/cover.1eca029b8603ef4eebe5b41eae51facfc5920c41.1690292440.git-series.apopple@nvidia.com Link: https://lkml.kernel.org/r/ba5f0ec5fbc2ab188797524d3687e075e2412a2b.1690292440.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Reviewed-by: Jason Gunthorpe Cc: Andrew Donnellan Cc: Catalin Marinas Cc: Chaitanya Kumar Borah Cc: Frederic Barrat Cc: John Hubbard Cc: Kevin Tian Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Nicolin Chen Cc: Robin Murphy Cc: Sean Christopherson Cc: Tvrtko Ursulin Cc: Will Deacon Cc: Zhi Wang Cc: Jason Gunthorpe Cc: SeongJae Park Signed-off-by: Andrew Morton --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index a5a63b1c947eb1..2a19784b698e96 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -200,10 +200,20 @@ static void arm_smmu_mm_invalidate_range(struct mmu_notifier *mn, * range. So do a simple translation here by calculating size correctly. 
*/ size = end - start; + if (size == ULONG_MAX) + size = 0; + + if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_BTM)) { + if (!size) + arm_smmu_tlb_inv_asid(smmu_domain->smmu, + smmu_mn->cd->asid); + else + arm_smmu_tlb_inv_range_asid(start, size, + smmu_mn->cd->asid, + PAGE_SIZE, false, + smmu_domain); + } - if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_BTM)) - arm_smmu_tlb_inv_range_asid(start, size, smmu_mn->cd->asid, - PAGE_SIZE, false, smmu_domain); arm_smmu_atc_inv_domain(smmu_domain, mm->pasid, start, size); } From 57b037dbbadc0d8de44bf06a62ad2d9265ef31d3 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Tue, 25 Jul 2023 23:42:04 +1000 Subject: [PATCH 176/489] mmu_notifiers: fixup comment in mmu_interval_read_begin() The comment in mmu_interval_read_begin() refers to a function that doesn't exist and uses the wrong call-back name. The op for mmu interval notifiers is mmu_interval_notifier_ops->invalidate() so fix the comment up to reflect that. Link: https://lkml.kernel.org/r/e7a09081b3ac82a03c189409f1262fc2df91071e.1690292440.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Reviewed-by: Jason Gunthorpe Cc: Andrew Donnellan Cc: Catalin Marinas Cc: Chaitanya Kumar Borah Cc: Frederic Barrat Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kevin Tian Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Nicolin Chen Cc: Robin Murphy Cc: Sean Christopherson Cc: SeongJae Park Cc: Tvrtko Ursulin Cc: Will Deacon Cc: Zhi Wang Signed-off-by: Andrew Morton --- mm/mmu_notifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 50c0dde1354f45..b7ad1559c72fda 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -199,7 +199,7 @@ mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub) * invalidate_start/end and is colliding. 
* * The locking looks broadly like this: - * mn_tree_invalidate_start(): mmu_interval_read_begin(): + * mn_itree_inv_start(): mmu_interval_read_begin(): * spin_lock * seq = READ_ONCE(interval_sub->invalidate_seq); * seq == subs->invalidate_seq @@ -207,7 +207,7 @@ mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub) * spin_lock * seq = ++subscriptions->invalidate_seq * spin_unlock - * op->invalidate_range(): + * op->invalidate(): * user_lock * mmu_interval_set_seq() * interval_sub->invalidate_seq = seq From 6bbd42e2df8f90b022fa0b82986d58ecb30b9dcc Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Tue, 25 Jul 2023 23:42:05 +1000 Subject: [PATCH 177/489] mmu_notifiers: call invalidate_range() when invalidating TLBs The invalidate_range() is going to become an architecture specific mmu notifier used to keep the TLB of secondary MMUs such as an IOMMU in sync with the CPU page tables. Currently it is called from separate code paths to the main CPU TLB invalidations. This can lead to a secondary TLB not getting invalidated when required and makes it hard to reason about when exactly the secondary TLB is invalidated. To fix this move the notifier call to the architecture specific TLB maintenance functions for architectures that have secondary MMUs requiring explicit software invalidations. This fixes a SMMU bug on ARM64. On ARM64 PTE permission upgrades require a TLB invalidation. This invalidation is done by the architecture specific ptep_set_access_flags() which calls flush_tlb_page() if required. However this doesn't call the notifier resulting in infinite faults being generated by devices using the SMMU if it has previously cached a read-only PTE in its TLB. Moving the invalidations into the TLB invalidation functions ensures all invalidations happen at the same time as the CPU invalidation. The architecture specific flush_tlb_all() routines do not call the notifier as none of the IOMMUs require this. 
Link: https://lkml.kernel.org/r/0287ae32d91393a582897d6c4db6f7456b1001f2.1690292440.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Suggested-by: Jason Gunthorpe Tested-by: SeongJae Park Acked-by: Catalin Marinas Reviewed-by: Jason Gunthorpe Tested-by: Luis Chamberlain Cc: Andrew Donnellan Cc: Chaitanya Kumar Borah Cc: Frederic Barrat Cc: John Hubbard Cc: Kevin Tian Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Nicolin Chen Cc: Robin Murphy Cc: Sean Christopherson Cc: Tvrtko Ursulin Cc: Will Deacon Cc: Zhi Wang Signed-off-by: Andrew Morton --- arch/arm64/include/asm/tlbflush.h | 5 +++++ arch/powerpc/include/asm/book3s/64/tlbflush.h | 1 + arch/powerpc/mm/book3s64/radix_hugetlbpage.c | 1 + arch/powerpc/mm/book3s64/radix_tlb.c | 4 ++++ arch/x86/include/asm/tlbflush.h | 2 ++ arch/x86/mm/tlb.c | 2 ++ include/asm-generic/tlb.h | 1 - 7 files changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 3456866c6a1df7..a99349d10c0ea5 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -252,6 +253,7 @@ static inline void flush_tlb_mm(struct mm_struct *mm) __tlbi(aside1is, asid); __tlbi_user(aside1is, asid); dsb(ish); + mmu_notifier_invalidate_range(mm, 0, -1UL); } static inline void __flush_tlb_page_nosync(struct mm_struct *mm, @@ -263,6 +265,8 @@ static inline void __flush_tlb_page_nosync(struct mm_struct *mm, addr = __TLBI_VADDR(uaddr, ASID(mm)); __tlbi(vale1is, addr); __tlbi_user(vale1is, addr); + mmu_notifier_invalidate_range(mm, uaddr & PAGE_MASK, + (uaddr & PAGE_MASK) + PAGE_SIZE); } static inline void flush_tlb_page_nosync(struct vm_area_struct *vma, @@ -396,6 +400,7 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, scale++; } dsb(ish); + mmu_notifier_invalidate_range(vma->vm_mm, start, end); } static inline void flush_tlb_range(struct vm_area_struct *vma, diff 
--git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h index 0d0c1447ecf045..dca0477b07093a 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h @@ -5,6 +5,7 @@ #define MMU_NO_CONTEXT ~0UL #include +#include #include #include diff --git a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c index 5e3195568525bd..f3fb49fd32fe95 100644 --- a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c +++ b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c @@ -39,6 +39,7 @@ void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long st radix__flush_tlb_pwc_range_psize(vma->vm_mm, start, end, psize); else radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize); + mmu_notifier_invalidate_range(vma->vm_mm, start, end); } void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma, diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index 0bd4866d98241d..4d44902a4962fa 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -987,6 +987,7 @@ void radix__flush_tlb_mm(struct mm_struct *mm) } } preempt_enable(); + mmu_notifier_invalidate_range(mm, 0, -1UL); } EXPORT_SYMBOL(radix__flush_tlb_mm); @@ -1020,6 +1021,7 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm) _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL); } preempt_enable(); + mmu_notifier_invalidate_range(mm, 0, -1UL); } void radix__flush_all_mm(struct mm_struct *mm) @@ -1228,6 +1230,7 @@ static inline void __radix__flush_tlb_range(struct mm_struct *mm, } out: preempt_enable(); + mmu_notifier_invalidate_range(mm, start, end); } void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, @@ -1392,6 +1395,7 @@ static void __radix__flush_tlb_range_psize(struct mm_struct *mm, } out: preempt_enable(); + mmu_notifier_invalidate_range(mm, start, end); } void 
radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 837e4a50281a06..0a5432364c5a55 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -3,6 +3,7 @@ #define _ASM_X86_TLBFLUSH_H #include +#include #include #include @@ -282,6 +283,7 @@ static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *b { inc_mm_tlb_gen(mm); cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); + mmu_notifier_invalidate_range(mm, 0, -1UL); } static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 267acf27480afb..93b2f81f09dad5 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -1036,6 +1037,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, put_flush_tlb_info(); put_cpu(); + mmu_notifier_invalidate_range(mm, start, end); } diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index b46617207c9393..bc32a2284c5649 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -456,7 +456,6 @@ static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) return; tlb_flush(tlb); - mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end); __tlb_reset_range(tlb); } From ec8832d007cb7b50229ad5745eec35b847cc9120 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Tue, 25 Jul 2023 23:42:06 +1000 Subject: [PATCH 178/489] mmu_notifiers: don't invalidate secondary TLBs as part of mmu_notifier_invalidate_range_end() Secondary TLBs are now invalidated from the architecture specific TLB invalidation functions. Therefore there is no need to explicitly notify or invalidate as part of the range end functions. This means we can remove mmu_notifier_invalidate_range_end_only() and some of the ptep_*_notify() functions. 
Link: https://lkml.kernel.org/r/90d749d03cbab256ca0edeb5287069599566d783.1690292440.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Reviewed-by: Jason Gunthorpe Cc: Andrew Donnellan Cc: Catalin Marinas Cc: Chaitanya Kumar Borah Cc: Frederic Barrat Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kevin Tian Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Nicolin Chen Cc: Robin Murphy Cc: Sean Christopherson Cc: SeongJae Park Cc: Tvrtko Ursulin Cc: Will Deacon Cc: Zhi Wang Signed-off-by: Andrew Morton --- include/linux/mmu_notifier.h | 56 ++---------------------------------- kernel/events/uprobes.c | 2 +- mm/huge_memory.c | 25 +++------------- mm/hugetlb.c | 1 - mm/memory.c | 8 ++---- mm/migrate_device.c | 9 ++---- mm/mmu_notifier.c | 25 ++-------------- mm/rmap.c | 40 -------------------------- 8 files changed, 14 insertions(+), 152 deletions(-) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 64a3e051c3c438..f2e9edc6aa4355 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -395,8 +395,7 @@ extern int __mmu_notifier_test_young(struct mm_struct *mm, extern void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte); extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r); -extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r, - bool only_end); +extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r); extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end); extern bool @@ -481,14 +480,7 @@ mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) might_sleep(); if (mm_has_notifiers(range->mm)) - __mmu_notifier_invalidate_range_end(range, false); -} - -static inline void -mmu_notifier_invalidate_range_only_end(struct mmu_notifier_range *range) -{ - if (mm_has_notifiers(range->mm)) - __mmu_notifier_invalidate_range_end(range, true); + 
__mmu_notifier_invalidate_range_end(range); } static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, @@ -582,45 +574,6 @@ static inline void mmu_notifier_range_init_owner( __young; \ }) -#define ptep_clear_flush_notify(__vma, __address, __ptep) \ -({ \ - unsigned long ___addr = __address & PAGE_MASK; \ - struct mm_struct *___mm = (__vma)->vm_mm; \ - pte_t ___pte; \ - \ - ___pte = ptep_clear_flush(__vma, __address, __ptep); \ - mmu_notifier_invalidate_range(___mm, ___addr, \ - ___addr + PAGE_SIZE); \ - \ - ___pte; \ -}) - -#define pmdp_huge_clear_flush_notify(__vma, __haddr, __pmd) \ -({ \ - unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \ - struct mm_struct *___mm = (__vma)->vm_mm; \ - pmd_t ___pmd; \ - \ - ___pmd = pmdp_huge_clear_flush(__vma, __haddr, __pmd); \ - mmu_notifier_invalidate_range(___mm, ___haddr, \ - ___haddr + HPAGE_PMD_SIZE); \ - \ - ___pmd; \ -}) - -#define pudp_huge_clear_flush_notify(__vma, __haddr, __pud) \ -({ \ - unsigned long ___haddr = __haddr & HPAGE_PUD_MASK; \ - struct mm_struct *___mm = (__vma)->vm_mm; \ - pud_t ___pud; \ - \ - ___pud = pudp_huge_clear_flush(__vma, __haddr, __pud); \ - mmu_notifier_invalidate_range(___mm, ___haddr, \ - ___haddr + HPAGE_PUD_SIZE); \ - \ - ___pud; \ -}) - /* * set_pte_at_notify() sets the pte _after_ running the notifier. 
* This is safe to start by updating the secondary MMUs, because the primary MMU @@ -711,11 +664,6 @@ void mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) { } -static inline void -mmu_notifier_invalidate_range_only_end(struct mmu_notifier_range *range) -{ -} - static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end) { diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f0ac5b87491959..3048589e2e8516 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -193,7 +193,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, } flush_cache_page(vma, addr, pte_pfn(ptep_get(pvmw.pte))); - ptep_clear_flush_notify(vma, addr, pvmw.pte); + ptep_clear_flush(vma, addr, pvmw.pte); if (new_page) set_pte_at_notify(mm, addr, pvmw.pte, mk_pte(new_page, vma->vm_page_prot)); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 762be2f4244cd9..3ece117de89850 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2003,7 +2003,7 @@ static void __split_huge_pud_locked(struct vm_area_struct *vma, pud_t *pud, count_vm_event(THP_SPLIT_PUD); - pudp_huge_clear_flush_notify(vma, haddr, pud); + pudp_huge_clear_flush(vma, haddr, pud); } void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, @@ -2023,11 +2023,7 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, out: spin_unlock(ptl); - /* - * No need to double call mmu_notifier->invalidate_range() callback as - * the above pudp_huge_clear_flush_notify() did already call it. 
- */ - mmu_notifier_invalidate_range_only_end(&range); + mmu_notifier_invalidate_range_end(&range); } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ @@ -2094,7 +2090,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, count_vm_event(THP_SPLIT_PMD); if (!vma_is_anonymous(vma)) { - old_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); + old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd); /* * We are going to unmap this huge page. So * just go ahead and zap it @@ -2304,20 +2300,7 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, out: spin_unlock(ptl); - /* - * No need to double call mmu_notifier->invalidate_range() callback. - * They are 3 cases to consider inside __split_huge_pmd_locked(): - * 1) pmdp_huge_clear_flush_notify() call invalidate_range() obvious - * 2) __split_huge_zero_page_pmd() read only zero page and any write - * fault will trigger a flush_notify before pointing to a new page - * (it is fine if the secondary mmu keeps pointing to the old zero - * page in the meantime) - * 3) Split a huge pmd into pte pointing to the same page. No need - * to invalidate secondary tlb entry they are all still valid. - * any further changes to individual pte will notify. 
So no need - * to call mmu_notifier->invalidate_range() - */ - mmu_notifier_invalidate_range_only_end(&range); + mmu_notifier_invalidate_range_end(&range); } void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 412a3eec081cfe..4672752b0b17f4 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5688,7 +5688,6 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, /* Break COW or unshare */ huge_ptep_clear_flush(vma, haddr, ptep); - mmu_notifier_invalidate_range(mm, range.start, range.end); page_remove_rmap(&old_folio->page, vma, true); hugepage_add_new_anon_rmap(new_folio, vma, haddr); if (huge_pte_uffd_wp(pte)) diff --git a/mm/memory.c b/mm/memory.c index 44d11812a88f2e..3e16f06373765f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3155,7 +3155,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * that left a window where the new PTE could be loaded into * some TLBs while the old PTE remains in others. */ - ptep_clear_flush_notify(vma, vmf->address, vmf->pte); + ptep_clear_flush(vma, vmf->address, vmf->pte); folio_add_new_anon_rmap(new_folio, vma, vmf->address); folio_add_lru_vma(new_folio, vma); /* @@ -3201,11 +3201,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) pte_unmap_unlock(vmf->pte, vmf->ptl); } - /* - * No need to double call mmu_notifier->invalidate_range() callback as - * the above ptep_clear_flush_notify() did already call it. 
- */ - mmu_notifier_invalidate_range_only_end(&range); + mmu_notifier_invalidate_range_end(&range); if (new_folio) folio_put(new_folio); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index e29626e1329e97..6c556b5876c61c 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -658,7 +658,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate, if (flush) { flush_cache_page(vma, addr, pte_pfn(orig_pte)); - ptep_clear_flush_notify(vma, addr, ptep); + ptep_clear_flush(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, entry); update_mmu_cache(vma, addr, ptep); } else { @@ -763,13 +763,8 @@ static void __migrate_device_pages(unsigned long *src_pfns, src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; } - /* - * No need to double call mmu_notifier->invalidate_range() callback as - * the above ptep_clear_flush_notify() inside migrate_vma_insert_page() - * did already call it. - */ if (notified) - mmu_notifier_invalidate_range_only_end(&range); + mmu_notifier_invalidate_range_end(&range); } /** diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index b7ad1559c72fda..453a156d93c013 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -551,7 +551,7 @@ int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) static void mn_hlist_invalidate_end(struct mmu_notifier_subscriptions *subscriptions, - struct mmu_notifier_range *range, bool only_end) + struct mmu_notifier_range *range) { struct mmu_notifier *subscription; int id; @@ -559,24 +559,6 @@ mn_hlist_invalidate_end(struct mmu_notifier_subscriptions *subscriptions, id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist, srcu_read_lock_held(&srcu)) { - /* - * Call invalidate_range here too to avoid the need for the - * subsystem of having to register an invalidate_range_end - * call-back when there is invalidate_range already. 
Usually a - * subsystem registers either invalidate_range_start()/end() or - * invalidate_range(), so this will be no additional overhead - * (besides the pointer check). - * - * We skip call to invalidate_range() if we know it is safe ie - * call site use mmu_notifier_invalidate_range_only_end() which - * is safe to do when we know that a call to invalidate_range() - * already happen under page table lock. - */ - if (!only_end && subscription->ops->invalidate_range) - subscription->ops->invalidate_range(subscription, - range->mm, - range->start, - range->end); if (subscription->ops->invalidate_range_end) { if (!mmu_notifier_range_blockable(range)) non_block_start(); @@ -589,8 +571,7 @@ mn_hlist_invalidate_end(struct mmu_notifier_subscriptions *subscriptions, srcu_read_unlock(&srcu, id); } -void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range, - bool only_end) +void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) { struct mmu_notifier_subscriptions *subscriptions = range->mm->notifier_subscriptions; @@ -600,7 +581,7 @@ void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range, mn_itree_inv_end(subscriptions); if (!hlist_empty(&subscriptions->list)) - mn_hlist_invalidate_end(subscriptions, range, only_end); + mn_hlist_invalidate_end(subscriptions, range); lock_map_release(&__mmu_notifier_invalidate_range_start_map); } diff --git a/mm/rmap.c b/mm/rmap.c index 1355bf686fae9e..51ec8aa5e61f2d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -985,13 +985,6 @@ static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw) #endif } - /* - * No need to call mmu_notifier_invalidate_range() as we are - * downgrading page table protection not changing it to point - * to a new page. 
- * - * See Documentation/mm/mmu_notifier.rst - */ if (ret) cleaned++; } @@ -1549,8 +1542,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, hugetlb_vma_unlock_write(vma); flush_tlb_range(vma, range.start, range.end); - mmu_notifier_invalidate_range(mm, - range.start, range.end); /* * The ref count of the PMD page was * dropped which is part of the way map @@ -1623,9 +1614,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * copied pages. */ dec_mm_counter(mm, mm_counter(&folio->page)); - /* We have to invalidate as we cleared the pte */ - mmu_notifier_invalidate_range(mm, address, - address + PAGE_SIZE); } else if (folio_test_anon(folio)) { swp_entry_t entry = { .val = page_private(subpage) }; pte_t swp_pte; @@ -1637,9 +1625,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, folio_test_swapcache(folio))) { WARN_ON_ONCE(1); ret = false; - /* We have to invalidate as we cleared the pte */ - mmu_notifier_invalidate_range(mm, address, - address + PAGE_SIZE); page_vma_mapped_walk_done(&pvmw); break; } @@ -1670,9 +1655,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, */ if (ref_count == 1 + map_count && !folio_test_dirty(folio)) { - /* Invalidate as we cleared the pte */ - mmu_notifier_invalidate_range(mm, - address, address + PAGE_SIZE); dec_mm_counter(mm, MM_ANONPAGES); goto discard; } @@ -1727,9 +1709,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, if (pte_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); set_pte_at(mm, address, pvmw.pte, swp_pte); - /* Invalidate as we cleared the pte */ - mmu_notifier_invalidate_range(mm, address, - address + PAGE_SIZE); } else { /* * This is a locked file-backed folio, @@ -1745,13 +1724,6 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, dec_mm_counter(mm, mm_counter_file(&folio->page)); } discard: - /* - * No need to call 
mmu_notifier_invalidate_range() it has be - * done above for all cases requiring it to happen under page - * table lock before mmu_notifier_invalidate_range_end() - * - * See Documentation/mm/mmu_notifier.rst - */ page_remove_rmap(subpage, vma, folio_test_hugetlb(folio)); if (vma->vm_flags & VM_LOCKED) mlock_drain_local(); @@ -1930,8 +1902,6 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, hugetlb_vma_unlock_write(vma); flush_tlb_range(vma, range.start, range.end); - mmu_notifier_invalidate_range(mm, - range.start, range.end); /* * The ref count of the PMD page was @@ -2036,9 +2006,6 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, * copied pages. */ dec_mm_counter(mm, mm_counter(&folio->page)); - /* We have to invalidate as we cleared the pte */ - mmu_notifier_invalidate_range(mm, address, - address + PAGE_SIZE); } else { swp_entry_t entry; pte_t swp_pte; @@ -2102,13 +2069,6 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, */ } - /* - * No need to call mmu_notifier_invalidate_range() it has be - * done above for all cases requiring it to happen under page - * table lock before mmu_notifier_invalidate_range_end() - * - * See Documentation/mm/mmu_notifier.rst - */ page_remove_rmap(subpage, vma, folio_test_hugetlb(folio)); if (vma->vm_flags & VM_LOCKED) mlock_drain_local(); From 1af5a8109904b7f00828e7f9f63f5695b42f8215 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Tue, 25 Jul 2023 23:42:07 +1000 Subject: [PATCH 179/489] mmu_notifiers: rename invalidate_range notifier There are two main use cases for mmu notifiers. One is by KVM which uses mmu_notifier_invalidate_range_start()/end() to manage a software TLB. The other is to manage hardware TLBs which need to use the invalidate_range() callback because HW can establish new TLB entries at any time. Hence using start/end() can lead to memory corruption as these callbacks happen too soon/late during page unmap. 
mmu notifier users should therefore either use the start()/end() callbacks or the invalidate_range() callbacks. To make this usage clearer rename the invalidate_range() callback to arch_invalidate_secondary_tlbs() and update documention. Link: https://lkml.kernel.org/r/6f77248cd25545c8020a54b4e567e8b72be4dca1.1690292440.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Suggested-by: Jason Gunthorpe Acked-by: Catalin Marinas Reviewed-by: Jason Gunthorpe Cc: Andrew Donnellan Cc: Chaitanya Kumar Borah Cc: Frederic Barrat Cc: Jason Gunthorpe Cc: John Hubbard Cc: Kevin Tian Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Nicolin Chen Cc: Robin Murphy Cc: Sean Christopherson Cc: SeongJae Park Cc: Tvrtko Ursulin Cc: Will Deacon Cc: Zhi Wang Signed-off-by: Andrew Morton --- arch/arm64/include/asm/tlbflush.h | 6 +-- arch/powerpc/mm/book3s64/radix_hugetlbpage.c | 2 +- arch/powerpc/mm/book3s64/radix_tlb.c | 8 ++-- arch/x86/include/asm/tlbflush.h | 2 +- arch/x86/mm/tlb.c | 2 +- drivers/iommu/amd/iommu_v2.c | 10 ++-- .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 13 ++--- drivers/iommu/intel/svm.c | 8 ++-- drivers/misc/ocxl/link.c | 8 ++-- include/linux/mmu_notifier.h | 48 +++++++++---------- mm/huge_memory.c | 4 +- mm/hugetlb.c | 7 +-- mm/mmu_notifier.c | 21 ++++++-- 13 files changed, 76 insertions(+), 63 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index a99349d10c0ea5..84a05a0bd2b67a 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -253,7 +253,7 @@ static inline void flush_tlb_mm(struct mm_struct *mm) __tlbi(aside1is, asid); __tlbi_user(aside1is, asid); dsb(ish); - mmu_notifier_invalidate_range(mm, 0, -1UL); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } static inline void __flush_tlb_page_nosync(struct mm_struct *mm, @@ -265,7 +265,7 @@ static inline void __flush_tlb_page_nosync(struct mm_struct *mm, addr = __TLBI_VADDR(uaddr, ASID(mm)); __tlbi(vale1is, addr); 
__tlbi_user(vale1is, addr); - mmu_notifier_invalidate_range(mm, uaddr & PAGE_MASK, + mmu_notifier_arch_invalidate_secondary_tlbs(mm, uaddr & PAGE_MASK, (uaddr & PAGE_MASK) + PAGE_SIZE); } @@ -400,7 +400,7 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, scale++; } dsb(ish); - mmu_notifier_invalidate_range(vma->vm_mm, start, end); + mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end); } static inline void flush_tlb_range(struct vm_area_struct *vma, diff --git a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c index f3fb49fd32fe95..17075c78d4bc3d 100644 --- a/arch/powerpc/mm/book3s64/radix_hugetlbpage.c +++ b/arch/powerpc/mm/book3s64/radix_hugetlbpage.c @@ -39,7 +39,7 @@ void radix__flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long st radix__flush_tlb_pwc_range_psize(vma->vm_mm, start, end, psize); else radix__flush_tlb_range_psize(vma->vm_mm, start, end, psize); - mmu_notifier_invalidate_range(vma->vm_mm, start, end); + mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end); } void radix__huge_ptep_modify_prot_commit(struct vm_area_struct *vma, diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index 4d44902a4962fa..06e647ef19d1d8 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -987,7 +987,7 @@ void radix__flush_tlb_mm(struct mm_struct *mm) } } preempt_enable(); - mmu_notifier_invalidate_range(mm, 0, -1UL); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } EXPORT_SYMBOL(radix__flush_tlb_mm); @@ -1021,7 +1021,7 @@ static void __flush_all_mm(struct mm_struct *mm, bool fullmm) _tlbiel_pid_multicast(mm, pid, RIC_FLUSH_ALL); } preempt_enable(); - mmu_notifier_invalidate_range(mm, 0, -1UL); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } void radix__flush_all_mm(struct mm_struct *mm) @@ -1230,7 +1230,7 @@ static inline void __radix__flush_tlb_range(struct 
mm_struct *mm, } out: preempt_enable(); - mmu_notifier_invalidate_range(mm, start, end); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, @@ -1395,7 +1395,7 @@ static void __radix__flush_tlb_range_psize(struct mm_struct *mm, } out: preempt_enable(); - mmu_notifier_invalidate_range(mm, start, end); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start, diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 0a5432364c5a55..6ab42caaa67a36 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -283,7 +283,7 @@ static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *b { inc_mm_tlb_gen(mm); cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); - mmu_notifier_invalidate_range(mm, 0, -1UL); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, 0, -1UL); } static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 93b2f81f09dad5..2d253919b3e8ad 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -1037,7 +1037,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, put_flush_tlb_info(); put_cpu(); - mmu_notifier_invalidate_range(mm, start, end); + mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } diff --git a/drivers/iommu/amd/iommu_v2.c b/drivers/iommu/amd/iommu_v2.c index 261352a2327162..2596466cd5a60a 100644 --- a/drivers/iommu/amd/iommu_v2.c +++ b/drivers/iommu/amd/iommu_v2.c @@ -355,9 +355,9 @@ static struct pasid_state *mn_to_state(struct mmu_notifier *mn) return container_of(mn, struct pasid_state, mn); } -static void mn_invalidate_range(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, unsigned long end) +static void mn_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, + 
struct mm_struct *mm, + unsigned long start, unsigned long end) { struct pasid_state *pasid_state; struct device_state *dev_state; @@ -391,8 +391,8 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm) } static const struct mmu_notifier_ops iommu_mn = { - .release = mn_release, - .invalidate_range = mn_invalidate_range, + .release = mn_release, + .arch_invalidate_secondary_tlbs = mn_arch_invalidate_secondary_tlbs, }; static void set_pri_tag_status(struct pasid_state *pasid_state, diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index 2a19784b698e96..dbc812a0e57e97 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -186,9 +186,10 @@ static void arm_smmu_free_shared_cd(struct arm_smmu_ctx_desc *cd) } } -static void arm_smmu_mm_invalidate_range(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, unsigned long end) +static void arm_smmu_mm_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) { struct arm_smmu_mmu_notifier *smmu_mn = mn_to_smmu(mn); struct arm_smmu_domain *smmu_domain = smmu_mn->domain; @@ -247,9 +248,9 @@ static void arm_smmu_mmu_notifier_free(struct mmu_notifier *mn) } static const struct mmu_notifier_ops arm_smmu_mmu_notifier_ops = { - .invalidate_range = arm_smmu_mm_invalidate_range, - .release = arm_smmu_mm_release, - .free_notifier = arm_smmu_mmu_notifier_free, + .arch_invalidate_secondary_tlbs = arm_smmu_mm_arch_invalidate_secondary_tlbs, + .release = arm_smmu_mm_release, + .free_notifier = arm_smmu_mmu_notifier_free, }; /* Allocate or get existing MMU notifier for this {domain, mm} pair */ diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index e95b339e9cdc08..8f6d68006ab6a9 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -219,9 +219,9 @@ static void 
intel_flush_svm_range(struct intel_svm *svm, unsigned long address, } /* Pages have been freed at this point */ -static void intel_invalidate_range(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, unsigned long end) +static void intel_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) { struct intel_svm *svm = container_of(mn, struct intel_svm, notifier); @@ -256,7 +256,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) static const struct mmu_notifier_ops intel_mmuops = { .release = intel_mm_release, - .invalidate_range = intel_invalidate_range, + .arch_invalidate_secondary_tlbs = intel_arch_invalidate_secondary_tlbs, }; static DEFINE_MUTEX(pasid_mutex); diff --git a/drivers/misc/ocxl/link.c b/drivers/misc/ocxl/link.c index 4cf4c55a5f0063..c06c699c0e7b1d 100644 --- a/drivers/misc/ocxl/link.c +++ b/drivers/misc/ocxl/link.c @@ -491,9 +491,9 @@ void ocxl_link_release(struct pci_dev *dev, void *link_handle) } EXPORT_SYMBOL_GPL(ocxl_link_release); -static void invalidate_range(struct mmu_notifier *mn, - struct mm_struct *mm, - unsigned long start, unsigned long end) +static void arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) { struct pe_data *pe_data = container_of(mn, struct pe_data, mmu_notifier); struct ocxl_link *link = pe_data->link; @@ -509,7 +509,7 @@ static void invalidate_range(struct mmu_notifier *mn, } static const struct mmu_notifier_ops ocxl_mmu_notifier_ops = { - .invalidate_range = invalidate_range, + .arch_invalidate_secondary_tlbs = arch_invalidate_secondary_tlbs, }; static u64 calculate_cfg_state(bool kernel) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index f2e9edc6aa4355..6e3c857606f19c 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -187,27 +187,27 @@ struct mmu_notifier_ops { const struct 
mmu_notifier_range *range); /* - * invalidate_range() is either called between - * invalidate_range_start() and invalidate_range_end() when the - * VM has to free pages that where unmapped, but before the - * pages are actually freed, or outside of _start()/_end() when - * a (remote) TLB is necessary. + * arch_invalidate_secondary_tlbs() is used to manage a non-CPU TLB + * which shares page-tables with the CPU. The + * invalidate_range_start()/end() callbacks should not be implemented as + * invalidate_secondary_tlbs() already catches the points in time when + * an external TLB needs to be flushed. * - * If invalidate_range() is used to manage a non-CPU TLB with - * shared page-tables, it not necessary to implement the - * invalidate_range_start()/end() notifiers, as - * invalidate_range() already catches the points in time when an - * external TLB range needs to be flushed. For more in depth - * discussion on this see Documentation/mm/mmu_notifier.rst + * This requires arch_invalidate_secondary_tlbs() to be called while + * holding the ptl spin-lock and therefore this callback is not allowed + * to sleep. * - * Note that this function might be called with just a sub-range - * of what was passed to invalidate_range_start()/end(), if - * called between those functions. + * This is called by architecture code whenever invalidating a TLB + * entry. It is assumed that any secondary TLB has the same rules for + * when invalidations are required. If this is not the case architecture + * code will need to call this explicitly when required for secondary + * TLB invalidation. 
*/ - void (*invalidate_range)(struct mmu_notifier *subscription, - struct mm_struct *mm, - unsigned long start, - unsigned long end); + void (*arch_invalidate_secondary_tlbs)( + struct mmu_notifier *subscription, + struct mm_struct *mm, + unsigned long start, + unsigned long end); /* * These callbacks are used with the get/put interface to manage the @@ -396,8 +396,8 @@ extern void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte); extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r); extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r); -extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, - unsigned long start, unsigned long end); +extern void __mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, + unsigned long start, unsigned long end); extern bool mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range); @@ -483,11 +483,11 @@ mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) __mmu_notifier_invalidate_range_end(range); } -static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, - unsigned long start, unsigned long end) +static inline void mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, + unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) - __mmu_notifier_invalidate_range(mm, start, end); + __mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); } static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm) @@ -664,7 +664,7 @@ void mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) { } -static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, +static inline void mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, unsigned long start, unsigned long end) { } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3ece117de89850..e0420de0e2e093 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2120,8 
+2120,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (is_huge_zero_pmd(*pmd)) { /* * FIXME: Do we want to invalidate secondary mmu by calling - * mmu_notifier_invalidate_range() see comments below inside - * __split_huge_pmd() ? + * mmu_notifier_arch_invalidate_secondary_tlbs() see comments below + * inside __split_huge_pmd() ? * * We are going from a zero huge page write protected to zero * small page also write protected so it does not seems useful diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4672752b0b17f4..5ef7bccda50ccf 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6649,8 +6649,9 @@ long hugetlb_change_protection(struct vm_area_struct *vma, else flush_hugetlb_tlb_range(vma, start, end); /* - * No need to call mmu_notifier_invalidate_range() we are downgrading - * page table protection not changing it to point to a new page. + * No need to call mmu_notifier_arch_invalidate_secondary_tlbs() we are + * downgrading page table protection not changing it to point to a new + * page. * * See Documentation/mm/mmu_notifier.rst */ @@ -7294,7 +7295,7 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma, i_mmap_unlock_write(vma->vm_file->f_mapping); hugetlb_vma_unlock_write(vma); /* - * No need to call mmu_notifier_invalidate_range(), see + * No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see * Documentation/mm/mmu_notifier.rst. 
*/ mmu_notifier_invalidate_range_end(&range); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 453a156d93c013..ec3b068cbbe6b1 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -585,8 +585,8 @@ void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) lock_map_release(&__mmu_notifier_invalidate_range_start_map); } -void __mmu_notifier_invalidate_range(struct mm_struct *mm, - unsigned long start, unsigned long end) +void __mmu_notifier_arch_invalidate_secondary_tlbs(struct mm_struct *mm, + unsigned long start, unsigned long end) { struct mmu_notifier *subscription; int id; @@ -595,9 +595,10 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm, hlist_for_each_entry_rcu(subscription, &mm->notifier_subscriptions->list, hlist, srcu_read_lock_held(&srcu)) { - if (subscription->ops->invalidate_range) - subscription->ops->invalidate_range(subscription, mm, - start, end); + if (subscription->ops->arch_invalidate_secondary_tlbs) + subscription->ops->arch_invalidate_secondary_tlbs( + subscription, mm, + start, end); } srcu_read_unlock(&srcu, id); } @@ -616,6 +617,16 @@ int __mmu_notifier_register(struct mmu_notifier *subscription, mmap_assert_write_locked(mm); BUG_ON(atomic_read(&mm->mm_users) <= 0); + /* + * Subsystems should only register for invalidate_secondary_tlbs() or + * invalidate_range_start()/end() callbacks, not both. 
+ */ + if (WARN_ON_ONCE(subscription && + (subscription->ops->arch_invalidate_secondary_tlbs && + (subscription->ops->invalidate_range_start || + subscription->ops->invalidate_range_end)))) + return -EINVAL; + if (!mm->notifier_subscriptions) { /* * kmalloc cannot be called under mm_take_all_locks(), but we From ea09800bf17561a0d20498923d766468c0d7a4a7 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 20 Jul 2023 19:28:06 +0800 Subject: [PATCH 180/489] mm: fix obsolete function name above debug_pagealloc_enabled_static() Since commit 04013513cc84 ("mm, page_alloc: do not rely on the order of page_poison and init_on_alloc/free parameters"), init_debug_pagealloc() is converted to init_mem_debugging_and_hardening(). Later it's renamed to mem_debugging_and_hardening_init() via commit f2fc4b44ec2b ("mm: move init_mem_debugging_and_hardening() to mm/mm_init.c"). Link: https://lkml.kernel.org/r/20230720112806.3851893-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 0ae5654f665b7a..84988d4ff8fb2b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3465,8 +3465,8 @@ static inline bool debug_pagealloc_enabled(void) } /* - * For use in fast paths after init_debug_pagealloc() has run, or when a - * false negative result is not harmful when called too early. + * For use in fast paths after mem_debugging_and_hardening_init() has run, + * or when a false negative result is not harmful when called too early. */ static inline bool debug_pagealloc_enabled_static(void) { From 58e2847ad2e6322a25dedf8b4549ff924baf8395 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 24 Jul 2023 09:25:15 +0100 Subject: [PATCH 181/489] selftests: line buffer test program's stdout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "selftests/mm fixes for arm64", v3. 
Given my on-going work on large anon folios and contpte mappings, I decided it would be a good idea to start running mm selftests to help guard against regressions. However, it soon became clear that I couldn't get the suite to run cleanly on arm64 with a vanilla v6.5-rc1 kernel (perhaps I'm just doing it wrong??), so I got stuck in a rabbit hole trying to debug and fix all the issues. Some were down to misconfigurations, but I also found a number of issues with the tests and even a couple of issues with the kernel. This patch (of 8): The selftests runner pipes the test program's stdout to tap_prefix. The presence of the pipe means that the test program sets its stdout to be fully buffered (as opposed to line buffered when directly connected to the terminal). The block buffering means that there is often content in the buffer at fork() time, which causes the output to end up duplicated. This was causing problems for mm:cow where test results were duplicated 20-30x. Solve this by using `stdbuf`, when available, to force the test program to use line buffered mode. This means previously printf'ed results are flushed out of the program before any fork(). Additionally, explicitly set line buffer mode in ksft_print_header(), which means that all test programs that use the ksft framework will benefit even if stdbuf is not present on the system. 
[ryan.roberts@arm.com: add setvbuf() to set buffering mode] Link: https://lkml.kernel.org/r/20230726070655.2713530-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20230724082522.1202616-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20230724082522.1202616-2-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Reviewed-by: Mark Brown Cc: David Hildenbrand Cc: Florent Revest Cc: Jérôme Glisse Cc: John Hubbard Cc: Peter Xu Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/include/nolibc/stdio.h | 24 +++++++++++++++++++++ tools/testing/selftests/kselftest.h | 9 ++++++++ tools/testing/selftests/kselftest/runner.sh | 7 ++++-- 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/tools/include/nolibc/stdio.h b/tools/include/nolibc/stdio.h index 0eef91daf2898b..a3778aff4fa926 100644 --- a/tools/include/nolibc/stdio.h +++ b/tools/include/nolibc/stdio.h @@ -21,6 +21,11 @@ #define EOF (-1) #endif +/* Buffering mode used by setvbuf. */ +#define _IOFBF 0 /* Fully buffered. */ +#define _IOLBF 1 /* Line buffered. */ +#define _IONBF 2 /* No buffering. */ + /* just define FILE as a non-empty type. The value of the pointer gives * the FD: FILE=~fd for fd>=0 or NULL for fd<0. This way positive FILE * are immediately identified as abnormal entries (i.e. possible copies @@ -350,6 +355,25 @@ void perror(const char *msg) fprintf(stderr, "%s%serrno=%d\n", (msg && *msg) ? msg : "", (msg && *msg) ? ": " : "", errno); } +static __attribute__((unused)) +int setvbuf(FILE *stream, char *buf, int mode, size_t size) +{ + /* + * nolibc does not support buffering so this is a nop. Just check mode + * is valid as required by the spec. 
+ */ + switch (mode) { + case _IOFBF: + case _IOLBF: + case _IONBF: + break; + default: + return EOF; + } + + return 0; +} + /* make sure to include all global symbols */ #include "nolibc.h" diff --git a/tools/testing/selftests/kselftest.h b/tools/testing/selftests/kselftest.h index 829be379545ada..529d29a359002c 100644 --- a/tools/testing/selftests/kselftest.h +++ b/tools/testing/selftests/kselftest.h @@ -113,6 +113,15 @@ static inline int ksft_get_error_cnt(void) { return ksft_cnt.ksft_error; } static inline void ksft_print_header(void) { + /* + * Force line buffering; If stdout is not connected to a terminal, it + * will otherwise default to fully buffered, which can cause output + * duplication if there is content in the buffer when fork()ing. If + * there is a crash, line buffering also means the most recent output + * line will be visible. + */ + setvbuf(stdout, NULL, _IOLBF, 0); + if (!(getenv("KSFT_TAP_LEVEL"))) printf("TAP version 13\n"); } diff --git a/tools/testing/selftests/kselftest/runner.sh b/tools/testing/selftests/kselftest/runner.sh index 1c952d1401d46b..261c73cab41b1d 100644 --- a/tools/testing/selftests/kselftest/runner.sh +++ b/tools/testing/selftests/kselftest/runner.sh @@ -105,15 +105,18 @@ run_one() echo "# Warning: file $TEST is missing!" echo "not ok $test_num $TEST_HDR_MSG" else + if [ -x /usr/bin/stdbuf ]; then + stdbuf="/usr/bin/stdbuf --output=L " + fi eval kselftest_cmd_args="\$${kselftest_cmd_args_ref:-}" - cmd="./$BASENAME_TEST $kselftest_cmd_args" + cmd="$stdbuf ./$BASENAME_TEST $kselftest_cmd_args" if [ ! -x "$TEST" ]; then echo "# Warning: file $TEST is not executable" if [ $(head -n 1 "$TEST" | cut -c -2) = "#!" 
] then interpreter=$(head -n 1 "$TEST" | cut -c 3-) - cmd="$interpreter ./$BASENAME_TEST" + cmd="$stdbuf $interpreter ./$BASENAME_TEST" else echo "not ok $test_num $TEST_HDR_MSG" return From f6dd4e223d8798319d0e2815a468b9fb0a276446 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 24 Jul 2023 09:25:16 +0100 Subject: [PATCH 182/489] selftests/mm: skip soft-dirty tests on arm64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit arm64 does not support the soft-dirty PTE bit. However, the `soft-dirty` test suite is currently run unconditionally and therefore generates spurious test failures on arm64. There are also some tests in `madv_populate` which assume it is supported. For `soft-dirty` let's disable the whole suite for arm64; it is no longer built and run_vmtests.sh will skip it if it's not present. For `madv_populate`, we need a runtime mechanism so that the remaining tests continue to be run. Unfortunately, the only way to determine if the soft-dirty bit is supported is to write to a page, then see if the bit is set in /proc/self/pagemap. But the tests that we want to conditionally execute are testing precisely this. So if we introduced this feature check, we could accidentally turn a real failure (on a system that claims to support soft-dirty) into a skip. So instead, do the check based on architecture; for arm64, we report that soft-dirty is not supported. 
Link: https://lkml.kernel.org/r/20230724082522.1202616-3-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Cc: Florent Revest Cc: Jérôme Glisse Cc: John Hubbard Cc: Mark Brown Cc: Peter Xu Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 5 ++++- tools/testing/selftests/mm/madv_populate.c | 26 ++++++++++++++++++++-- tools/testing/selftests/mm/run_vmtests.sh | 5 ++++- 3 files changed, 32 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index b7fce9073279ef..f35977a2bbf3fe 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -64,12 +64,15 @@ TEST_GEN_PROGS += thuge-gen TEST_GEN_PROGS += transhuge-stress TEST_GEN_PROGS += uffd-stress TEST_GEN_PROGS += uffd-unit-tests -TEST_GEN_PROGS += soft-dirty TEST_GEN_PROGS += split_huge_page_test TEST_GEN_PROGS += ksm_tests TEST_GEN_PROGS += ksm_functional_tests TEST_GEN_PROGS += mdwe_test +ifneq ($(ARCH),arm64) +TEST_GEN_PROGS += soft-dirty +endif + ifeq ($(ARCH),x86_64) CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_32bit_program.c -m32) CAN_BUILD_X86_64 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_64bit_program.c) diff --git a/tools/testing/selftests/mm/madv_populate.c b/tools/testing/selftests/mm/madv_populate.c index 60547245e479b1..17bcb07f19f349 100644 --- a/tools/testing/selftests/mm/madv_populate.c +++ b/tools/testing/selftests/mm/madv_populate.c @@ -264,14 +264,35 @@ static void test_softdirty(void) munmap(addr, SIZE); } +static int system_has_softdirty(void) +{ + /* + * There is no way to check if the kernel supports soft-dirty, other + * than by writing to a page and seeing if the bit was set. But the + * tests are intended to check that the bit gets set when it should, so + * doing that check would turn a potentially legitimate fail into a + * skip. 
Fortunately, we know for sure that arm64 does not support + * soft-dirty. So for now, let's just use the arch as a corse guide. + */ +#if defined(__aarch64__) + return 0; +#else + return 1; +#endif +} + int main(int argc, char **argv) { + int nr_tests = 16; int err; pagesize = getpagesize(); + if (system_has_softdirty()) + nr_tests += 5; + ksft_print_header(); - ksft_set_plan(21); + ksft_set_plan(nr_tests); sense_support(); test_prot_read(); @@ -279,7 +300,8 @@ int main(int argc, char **argv) test_holes(); test_populate_read(); test_populate_write(); - test_softdirty(); + if (system_has_softdirty()) + test_softdirty(); err = ksft_get_fail_cnt(); if (err) diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 9666c0c171ab86..4eaefc3d2e59ad 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -322,7 +322,10 @@ then CATEGORY="pkey" run_test ./protection_keys_64 fi -CATEGORY="soft_dirty" run_test ./soft-dirty +if [ -x ./soft-dirty ] +then + CATEGORY="soft_dirty" run_test ./soft-dirty +fi # COW tests CATEGORY="cow" run_test ./cow From e515bce98deb06b392ca79b5c342a849372be7ae Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 24 Jul 2023 09:25:17 +0100 Subject: [PATCH 183/489] selftests/mm: enable mrelease_test for arm64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mrelease_test defaults to defining __NR_pidfd_open and __NR_process_mrelease syscall numbers to -1, if they are not defined anywhere else, and the suite would then be marked as skipped as a result. arm64 (at least the stock debian toolchain that I'm using) requires including to pull in the defines for these syscalls. So let's add this header. With this in place, the test is passing on arm64. 
Link: https://lkml.kernel.org/r/20230724082522.1202616-4-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Reviewed-by: David Hildenbrand Cc: Florent Revest Cc: Jérôme Glisse Cc: John Hubbard Cc: Mark Brown Cc: Peter Xu Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mrelease_test.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/mm/mrelease_test.c b/tools/testing/selftests/mm/mrelease_test.c index dca21042b67927..d822004a374e9d 100644 --- a/tools/testing/selftests/mm/mrelease_test.c +++ b/tools/testing/selftests/mm/mrelease_test.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include From 6e16f5133501440699dcca3c5aba367cf6f9c227 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 24 Jul 2023 09:25:18 +0100 Subject: [PATCH 184/489] selftests/mm: fix thuge-gen test bugs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit thuge-gen was previously only munmapping part of the mmapped buffer, which caused us to run out of 1G huge pages for a later part of the test. Fix this by munmapping the whole buffer. Based on the code, it looks like a typo rather than an intention to keep some of the buffer mapped. thuge-gen was also calling mmap with SHM_HUGETLB flag (bit 11 set), which is actually MAP_DENYWRITE in mmap context. The man page says this flag is ignored in modern kernels. I'm pretty sure from the context that the author intended to pass the MAP_HUGETLB flag so I've fixed that up too. 
Link: https://lkml.kernel.org/r/20230724082522.1202616-5-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Reviewed-by: David Hildenbrand Cc: Florent Revest Cc: Jérôme Glisse Cc: John Hubbard Cc: Mark Brown Cc: Peter Xu Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/thuge-gen.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/mm/thuge-gen.c b/tools/testing/selftests/mm/thuge-gen.c index 380ab5f0a5340e..16ed4dfa735983 100644 --- a/tools/testing/selftests/mm/thuge-gen.c +++ b/tools/testing/selftests/mm/thuge-gen.c @@ -139,7 +139,7 @@ void test_mmap(unsigned long size, unsigned flags) before, after, before - after, size); assert(size == getpagesize() || (before - after) == NUM_PAGES); show(size); - err = munmap(map, size); + err = munmap(map, size * NUM_PAGES); assert(!err); } @@ -222,7 +222,7 @@ int main(void) test_mmap(ps, MAP_HUGETLB | arg); } printf("Testing default huge mmap\n"); - test_mmap(default_hps, SHM_HUGETLB); + test_mmap(default_hps, MAP_HUGETLB); puts("Testing non-huge shmget"); test_shmget(getpagesize(), 0); From 49f09526b16361db6af387726082ffdda796c101 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 24 Jul 2023 09:25:19 +0100 Subject: [PATCH 185/489] selftests/mm: va_high_addr_switch should skip unsupported arm64 configs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit va_high_addr_switch has a mechanism to determine if the tests should be run or skipped (supported_arch()). This currently returns unconditionally true for arm64. However, va_high_addr_switch also requires a large virtual address space for the tests to run, otherwise they spuriously fail. Since arm64 can only support VA > 48 bits when the page size is 64K, let's decide whether we should skip the test suite based on the page size. This reduces noise when running on 4K and 16K kernels. 
Link: https://lkml.kernel.org/r/20230724082522.1202616-6-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Reviewed-by: David Hildenbrand Cc: Florent Revest Cc: Jérôme Glisse Cc: John Hubbard Cc: Mark Brown Cc: Peter Xu Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/va_high_addr_switch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/va_high_addr_switch.c b/tools/testing/selftests/mm/va_high_addr_switch.c index 7cfaf4a74c571b..cfbc501290d3b6 100644 --- a/tools/testing/selftests/mm/va_high_addr_switch.c +++ b/tools/testing/selftests/mm/va_high_addr_switch.c @@ -292,7 +292,7 @@ static int supported_arch(void) #elif defined(__x86_64__) return 1; #elif defined(__aarch64__) - return 1; + return getpagesize() == PAGE_SIZE; #else return 0; #endif From 000303329752ffff08a6c120d21b8dc382bc9575 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 24 Jul 2023 09:25:20 +0100 Subject: [PATCH 186/489] selftests/mm: make migration test robust to failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `migration` test currently has a number of robustness problems that cause it to hang and leak resources. Timeout: There are 3 tests, which each previously ran for 60 seconds. However, the timeout in mm/settings for a single test binary was set to 45 seconds. So when run using run_kselftest.sh, the top level timeout would trigger before the test binary was finished. Solve this by meeting in the middle; each of the 3 tests now runs for 20 seconds (for a total of 60), and the top level timeout is set to 90 seconds. Leaking child processes: the `shared_anon` test fork()s some children but then an ASSERT() fires before the test kills those children. The assert causes immediate exit of the parent and leaking of the children. Furthermore, if run using the run_kselftest.sh wrapper, the wrapper would get stuck waiting for those children to exit, which never happens. 
Solve this by setting the "parent death signal" to SIGHUP in the child, so that the child is killed automatically if the parent dies. With these changes, the test binary now runs to completion on arm64, with 2 tests passing and the `shared_anon` test failing. Link: https://lkml.kernel.org/r/20230724082522.1202616-7-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Reviewed-by: David Hildenbrand Cc: Florent Revest Cc: Jérôme Glisse Cc: John Hubbard Cc: Mark Brown Cc: Peter Xu Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/migration.c | 12 +++++++++--- tools/testing/selftests/mm/settings | 2 +- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/mm/migration.c b/tools/testing/selftests/mm/migration.c index 379581567f272f..6908569ef4065d 100644 --- a/tools/testing/selftests/mm/migration.c +++ b/tools/testing/selftests/mm/migration.c @@ -10,12 +10,13 @@ #include #include #include +#include #include #include #include #define TWOMEG (2<<20) -#define RUNTIME (60) +#define RUNTIME (20) #define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1))) @@ -155,10 +156,15 @@ TEST_F_TIMEOUT(migration, shared_anon, 2*RUNTIME) memset(ptr, 0xde, TWOMEG); for (i = 0; i < self->nthreads - 1; i++) { pid = fork(); - if (!pid) + if (!pid) { + prctl(PR_SET_PDEATHSIG, SIGHUP); + /* Parent may have died before prctl so check now. 
*/ + if (getppid() == 1) + kill(getpid(), SIGHUP); access_mem(ptr); - else + } else { self->pids[i] = pid; + } } ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0); diff --git a/tools/testing/selftests/mm/settings b/tools/testing/selftests/mm/settings index 9abfc60e9e6f49..ba4d85f74cd6b9 100644 --- a/tools/testing/selftests/mm/settings +++ b/tools/testing/selftests/mm/settings @@ -1 +1 @@ -timeout=45 +timeout=90 From e170621027161a8cd00132a55fe92b41e37a9203 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 24 Jul 2023 09:25:21 +0100 Subject: [PATCH 187/489] selftests/mm: optionally pass duration to transhuge-stress MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Until now, transhuge-stress runs until it's explicitly killed, so when invoked by run_kselftest.sh, it would run until the test timeout, then it would be killed and the test would be marked as failed. Add a new, optional command line parameter that allows the user to specify the duration in seconds that the program should run. The program exits after this duration with a success (0) exit code. If the argument is omitted the old behavior remains. On its own, this doesn't quite solve our problem because run_kselftest.sh does not allow passing parameters to the program under test. But we will shortly move this to run_vmtests.sh, which does allow parameter passing. 
Link: https://lkml.kernel.org/r/20230724082522.1202616-8-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Cc: Florent Revest Cc: Jérôme Glisse Cc: John Hubbard Cc: Mark Brown Cc: Peter Xu Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/transhuge-stress.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/mm/transhuge-stress.c b/tools/testing/selftests/mm/transhuge-stress.c index ba9d37ad3a8957..c61fb9350b8c21 100644 --- a/tools/testing/selftests/mm/transhuge-stress.c +++ b/tools/testing/selftests/mm/transhuge-stress.c @@ -25,13 +25,14 @@ int main(int argc, char **argv) { size_t ram, len; void *ptr, *p; - struct timespec a, b; + struct timespec start, a, b; int i = 0; char *name = NULL; double s; uint8_t *map; size_t map_len; int pagemap_fd; + int duration = 0; ram = sysconf(_SC_PHYS_PAGES); if (ram > SIZE_MAX / psize() / 4) @@ -42,9 +43,11 @@ int main(int argc, char **argv) while (++i < argc) { if (!strcmp(argv[i], "-h")) - errx(1, "usage: %s [size in MiB]", argv[0]); + errx(1, "usage: %s [-f ] [-d ] [size in MiB]", argv[0]); else if (!strcmp(argv[i], "-f")) name = argv[++i]; + else if (!strcmp(argv[i], "-d")) + duration = atoi(argv[++i]); else len = atoll(argv[i]) << 20; } @@ -78,6 +81,8 @@ int main(int argc, char **argv) if (!map) errx(2, "map malloc"); + clock_gettime(CLOCK_MONOTONIC, &start); + while (1) { int nr_succeed = 0, nr_failed = 0, nr_pages = 0; @@ -118,5 +123,8 @@ int main(int argc, char **argv) "%4d succeed, %4d failed, %4d different pages", s, s * 1000 / (len >> HPAGE_SHIFT), len / s / (1 << 20), nr_succeed, nr_failed, nr_pages); + + if (duration > 0 && b.tv_sec - start.tv_sec >= duration) + return 0; } } From 05f1edac80095634638a2ba8ae8d0f3d2b3e653b Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 24 Jul 2023 09:25:22 +0100 Subject: [PATCH 188/489] selftests/mm: run all tests from run_vmtests.sh MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is very unclear to me how one is supposed to run all the mm selftests consistently and get clear results. Most of the test programs are launched by both run_vmtests.sh and run_kselftest.sh: hugepage-mmap hugepage-shm map_hugetlb hugepage-mremap hugepage-vmemmap hugetlb-madvise map_fixed_noreplace gup_test gup_longterm uffd-unit-tests uffd-stress compaction_test on-fault-limit map_populate mlock-random-test mlock2-tests mrelease_test mremap_test thuge-gen virtual_address_range va_high_addr_switch mremap_dontunmap hmm-tests madv_populate memfd_secret ksm_tests ksm_functional_tests soft-dirty cow However, of this set, when launched by run_vmtests.sh, some of the programs are invoked multiple times with different arguments. When invoked by run_kselftest.sh, they are invoked without arguments (and as a consequence, some fail immediately). Some test programs are only launched by run_vmtests.sh: test_vmalloc.sh And some test programs are only launched by run_kselftest.sh: khugepaged migration mkdirty transhuge-stress split_huge_page_test mdwe_test write_to_hugetlbfs Furthermore, run_vmtests.sh is invoked by run_kselftest.sh, so in this case all the test programs invoked by both scripts are run twice! Needless to say, this is a bit of a mess. In the absence of fully understanding the history here, it looks to me like the best solution is to launch ALL test programs from run_vmtests.sh, and ONLY invoke run_vmtests.sh from run_kselftest.sh. This way, we get full control over the parameters, each program is only invoked the intended number of times, and regardless of which script is used, the same tests get run in the same way. The only drawback is that if using run_kselftest.sh, its top-level tap result reporting reports only a single test and it fails if any of the contained tests fail. I don't see this as a big deal though since we still see all the nested reporting from multiple layers. 
The other issue with this is that all of run_vmtests.sh must execute within a single kselftest timeout period, so let's increase that to something more suitable. In the Makefile, TEST_GEN_PROGS will compile and install the tests and will add them to the list of tests that run_kselftest.sh will run. TEST_GEN_FILES will compile and install the tests but will not add them to the test list. So let's move all the programs from TEST_GEN_PROGS to TEST_GEN_FILES so that they are built but not executed by run_kselftest.sh. Note that run_vmtests.sh is added to TEST_PROGS, which means it ends up in the test list. (the lack of "_GEN" means it won't be compiled, but simply copied). Link: https://lkml.kernel.org/r/20230724082522.1202616-9-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Acked-by: Peter Xu Cc: Florent Revest Cc: Jérôme Glisse Cc: John Hubbard Cc: Mark Brown Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 79 ++++++++++++----------- tools/testing/selftests/mm/run_vmtests.sh | 23 +++++++ tools/testing/selftests/mm/settings | 2 +- 3 files changed, 64 insertions(+), 40 deletions(-) diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index f35977a2bbf3fe..6a9fc5693145f3 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -35,39 +35,39 @@ MAKEFLAGS += --no-builtin-rules CFLAGS = -Wall -I $(top_srcdir) $(EXTRA_CFLAGS) $(KHDR_INCLUDES) LDLIBS = -lrt -lpthread -TEST_GEN_PROGS = cow -TEST_GEN_PROGS += compaction_test -TEST_GEN_PROGS += gup_longterm -TEST_GEN_PROGS += gup_test -TEST_GEN_PROGS += hmm-tests -TEST_GEN_PROGS += hugetlb-madvise -TEST_GEN_PROGS += hugetlb-read-hwpoison -TEST_GEN_PROGS += hugepage-mmap -TEST_GEN_PROGS += hugepage-mremap -TEST_GEN_PROGS += hugepage-shm -TEST_GEN_PROGS += hugepage-vmemmap -TEST_GEN_PROGS += khugepaged -TEST_GEN_PROGS += madv_populate -TEST_GEN_PROGS += map_fixed_noreplace -TEST_GEN_PROGS += 
map_hugetlb -TEST_GEN_PROGS += map_populate -TEST_GEN_PROGS += memfd_secret -TEST_GEN_PROGS += migration -TEST_GEN_PROGS += mkdirty -TEST_GEN_PROGS += mlock-random-test -TEST_GEN_PROGS += mlock2-tests -TEST_GEN_PROGS += mrelease_test -TEST_GEN_PROGS += mremap_dontunmap -TEST_GEN_PROGS += mremap_test -TEST_GEN_PROGS += on-fault-limit -TEST_GEN_PROGS += thuge-gen -TEST_GEN_PROGS += transhuge-stress -TEST_GEN_PROGS += uffd-stress -TEST_GEN_PROGS += uffd-unit-tests -TEST_GEN_PROGS += split_huge_page_test -TEST_GEN_PROGS += ksm_tests -TEST_GEN_PROGS += ksm_functional_tests -TEST_GEN_PROGS += mdwe_test +TEST_GEN_FILES = cow +TEST_GEN_FILES += compaction_test +TEST_GEN_FILES += gup_longterm +TEST_GEN_FILES += gup_test +TEST_GEN_FILES += hmm-tests +TEST_GEN_FILES += hugetlb-madvise +TEST_GEN_FILES += hugetlb-read-hwpoison +TEST_GEN_FILES += hugepage-mmap +TEST_GEN_FILES += hugepage-mremap +TEST_GEN_FILES += hugepage-shm +TEST_GEN_FILES += hugepage-vmemmap +TEST_GEN_FILES += khugepaged +TEST_GEN_FILES += madv_populate +TEST_GEN_FILES += map_fixed_noreplace +TEST_GEN_FILES += map_hugetlb +TEST_GEN_FILES += map_populate +TEST_GEN_FILES += memfd_secret +TEST_GEN_FILES += migration +TEST_GEN_FILES += mkdirty +TEST_GEN_FILES += mlock-random-test +TEST_GEN_FILES += mlock2-tests +TEST_GEN_FILES += mrelease_test +TEST_GEN_FILES += mremap_dontunmap +TEST_GEN_FILES += mremap_test +TEST_GEN_FILES += on-fault-limit +TEST_GEN_FILES += thuge-gen +TEST_GEN_FILES += transhuge-stress +TEST_GEN_FILES += uffd-stress +TEST_GEN_FILES += uffd-unit-tests +TEST_GEN_FILES += split_huge_page_test +TEST_GEN_FILES += ksm_tests +TEST_GEN_FILES += ksm_functional_tests +TEST_GEN_FILES += mdwe_test ifneq ($(ARCH),arm64) TEST_GEN_PROGS += soft-dirty @@ -87,24 +87,24 @@ CFLAGS += -no-pie endif ifeq ($(CAN_BUILD_I386),1) -TEST_GEN_PROGS += $(BINARIES_32) +TEST_GEN_FILES += $(BINARIES_32) endif ifeq ($(CAN_BUILD_X86_64),1) -TEST_GEN_PROGS += $(BINARIES_64) +TEST_GEN_FILES += $(BINARIES_64) endif else ifneq 
(,$(findstring $(ARCH),ppc64)) -TEST_GEN_PROGS += protection_keys +TEST_GEN_FILES += protection_keys endif endif ifneq (,$(filter $(ARCH),arm64 ia64 mips64 parisc64 ppc64 riscv64 s390x sparc64 x86_64)) -TEST_GEN_PROGS += va_high_addr_switch -TEST_GEN_PROGS += virtual_address_range -TEST_GEN_PROGS += write_to_hugetlbfs +TEST_GEN_FILES += va_high_addr_switch +TEST_GEN_FILES += virtual_address_range +TEST_GEN_FILES += write_to_hugetlbfs endif TEST_PROGS := run_vmtests.sh @@ -116,6 +116,7 @@ TEST_FILES += va_high_addr_switch.sh include ../lib.mk $(TEST_GEN_PROGS): vm_util.c +$(TEST_GEN_FILES): vm_util.c $(OUTPUT)/uffd-stress: uffd-common.c $(OUTPUT)/uffd-unit-tests: uffd-common.c diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 4eaefc3d2e59ad..6de90c0adf92ce 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -58,6 +58,17 @@ separated by spaces: test soft dirty page bit semantics - cow test copy-on-write semantics +- thp + test transparent huge pages +- migration + invoke move_pages(2) to exercise the migration entry code + paths in the kernel +- mkdirty + test handling of code that might set PTE/PMD dirty in + read-only VMAs +- mdwe + test prctl(PR_SET_MDWE, ...) 
+ example: ./run_vmtests.sh -t "hmm mmap ksm" EOF exit 0 @@ -330,6 +341,18 @@ fi # COW tests CATEGORY="cow" run_test ./cow +CATEGORY="thp" run_test ./khugepaged + +CATEGORY="thp" run_test ./transhuge-stress -d 20 + +CATEGORY="thp" run_test ./split_huge_page_test + +CATEGORY="migration" run_test ./migration + +CATEGORY="mkdirty" run_test ./mkdirty + +CATEGORY="mdwe" run_test ./mdwe_test + echo "SUMMARY: PASS=${count_pass} SKIP=${count_skip} FAIL=${count_fail}" exit $exitcode diff --git a/tools/testing/selftests/mm/settings b/tools/testing/selftests/mm/settings index ba4d85f74cd6b9..a953c96aa16e1e 100644 --- a/tools/testing/selftests/mm/settings +++ b/tools/testing/selftests/mm/settings @@ -1 +1 @@ -timeout=90 +timeout=180 From eafcb7a972e2039e0f8a8c0d4fd3e90e485c8b9c Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sun, 23 Jul 2023 11:31:14 +0800 Subject: [PATCH 189/489] mm/mprotect: fix obsolete function name in change_pte_range() Since commit 79a1971c5f14 ("mm: move the copy_one_pte() pte_present check into the caller"), the explanation of preserving soft-dirtiness is moved into copy_nonpresent_pte(). Update corresponding comment. Link: https://lkml.kernel.org/r/20230723033114.3224409-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- mm/mprotect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index 5c3112d9246648..3f36c88a238e97 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -213,7 +213,7 @@ static long change_pte_range(struct mmu_gather *tlb, } else if (is_writable_device_private_entry(entry)) { /* * We do not preserve soft-dirtiness. See - * copy_one_pte() for explanation. + * copy_nonpresent_pte() for explanation. 
*/ entry = make_readable_device_private_entry( swp_offset(entry)); From e6bd14eca207bf822b7c743818ba6e04889348ec Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 21 Jul 2023 23:09:56 +0800 Subject: [PATCH 190/489] mm/compaction: correct comment of candidate pfn in fast_isolate_freepages Patch series "Two minor cleanups for compaction", v2. This series contains two random cleanups for compaction. This patch (of 2): If a preferred one was not found, we will use the candidate page with maximum pfn > min_pfn which is saved in high_pfn. Correct "minimum" to "maximum candidate" in comment. Link: https://lkml.kernel.org/r/20230721150957.2058634-1-shikemeng@huawei.com Link: https://lkml.kernel.org/r/20230721150957.2058634-2-shikemeng@huawei.com Signed-off-by: Kemeng Shi Cc: Baolin Wang Cc: David Hildenbrand Signed-off-by: Andrew Morton --- mm/compaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index 02239abed6ce40..480f29bcf823b0 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1554,7 +1554,7 @@ static void fast_isolate_freepages(struct compact_control *cc) break; } - /* Use a minimum pfn if a preferred one was not found */ + /* Use a maximum candidate pfn if a preferred one was not found */ if (!page && high_pfn) { page = pfn_to_page(high_pfn); From 3c099a2b0b53d98552cd69d19fd76049bcbafe38 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 21 Jul 2023 23:09:57 +0800 Subject: [PATCH 191/489] mm/compaction: avoid unneeded pageblock_end_pfn when no_set_skip_hint is set Move pageblock_end_pfn after no_set_skip_hint check to avoid unneeded pageblock_end_pfn if no_set_skip_hint is set. 
Link: https://lkml.kernel.org/r/20230721150957.2058634-3-shikemeng@huawei.com Signed-off-by: Kemeng Shi Reviewed-by: Baolin Wang Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- mm/compaction.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 480f29bcf823b0..e6ac0ef4c178f1 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -463,12 +463,12 @@ static void update_cached_migrate(struct compact_control *cc, unsigned long pfn) { struct zone *zone = cc->zone; - pfn = pageblock_end_pfn(pfn); - /* Set for isolation rather than compaction */ if (cc->no_set_skip_hint) return; + pfn = pageblock_end_pfn(pfn); + if (pfn > zone->compact_cached_migrate_pfn[0]) zone->compact_cached_migrate_pfn[0] = pfn; if (cc->mode != MIGRATE_ASYNC && From 479c33049116f2d138b4dfec328961881cc26b33 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 21 Jul 2023 11:44:42 +0800 Subject: [PATCH 192/489] mm/page_io: remove unneeded ClearPageUptodate() Patch series "Convert several functions in page_io.c to use a folio", v4. Convert several functions in page_io.c to use a folio, which can remove several implicit calls to compound_head(). This patch (of 10): The VM_BUG_ON_FOLIO in swap_readpage() ensures that the page is already !uptodate in __end_swap_bio_read() and sio_read_complete(). Just remove unneeded ClearPageUptodate(). 
Link: https://lkml.kernel.org/r/20230721034451.16412-1-zhangpeng362@huawei.com Link: https://lkml.kernel.org/r/20230721034451.16412-2-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Suggested-by: Matthew Wilcox (Oracle) Cc: Christoph Hellwig Cc: Kefeng Wang Cc: Nanyong Sun Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- mm/page_io.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index 684cd3c7b59b0c..4283aeeae0b7b5 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -62,7 +62,6 @@ static void __end_swap_bio_read(struct bio *bio) if (bio->bi_status) { SetPageError(page); - ClearPageUptodate(page); pr_alert_ratelimited("Read-error on swap-device (%u:%u:%llu)\n", MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), (unsigned long long)bio->bi_iter.bi_sector); @@ -417,7 +416,6 @@ static void sio_read_complete(struct kiocb *iocb, long ret) struct page *page = sio->bvec[p].bv_page; SetPageError(page); - ClearPageUptodate(page); unlock_page(page); } pr_alert_ratelimited("Read-error on swap-device\n"); From 9962ed64bd2154863ab3b63b15a2b55e39dc7117 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 21 Jul 2023 11:44:43 +0800 Subject: [PATCH 193/489] mm/page_io: remove unneeded SetPageError() Nobody checks the PageError()/folio_test_error() for the page/folio in __end_swap_bio_read/write() and sio_write_complete(). Therefore, we don't need to set the error flag. Just drop it. 
Link: https://lkml.kernel.org/r/20230721034451.16412-3-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Suggested-by: Matthew Wilcox (Oracle) Cc: Christoph Hellwig Cc: Kefeng Wang Cc: Nanyong Sun Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- mm/page_io.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index 4283aeeae0b7b5..a34f2cd608f75f 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -32,7 +32,6 @@ static void __end_swap_bio_write(struct bio *bio) struct page *page = bio_first_page_all(bio); if (bio->bi_status) { - SetPageError(page); /* * We failed to write the page out to swap-space. * Re-dirty the page in order to avoid it being reclaimed. @@ -61,7 +60,6 @@ static void __end_swap_bio_read(struct bio *bio) struct page *page = bio_first_page_all(bio); if (bio->bi_status) { - SetPageError(page); pr_alert_ratelimited("Read-error on swap-device (%u:%u:%llu)\n", MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), (unsigned long long)bio->bi_iter.bi_sector); @@ -415,7 +413,6 @@ static void sio_read_complete(struct kiocb *iocb, long ret) for (p = 0; p < sio->pages; p++) { struct page *page = sio->bvec[p].bv_page; - SetPageError(page); unlock_page(page); } pr_alert_ratelimited("Read-error on swap-device\n"); From 6d2790d95d7cffaf0def36270032ce5228dd43a5 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 21 Jul 2023 11:44:44 +0800 Subject: [PATCH 194/489] mm/page_io: introduce bio_first_folio_all() Introduce bio_first_folio_all() to return a folio, which makes it easier to use. 
Link: https://lkml.kernel.org/r/20230721034451.16412-4-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Suggested-by: Matthew Wilcox (Oracle) Cc: Christoph Hellwig Cc: Kefeng Wang Cc: Nanyong Sun Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- Documentation/block/biovecs.rst | 1 + include/linux/bio.h | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/Documentation/block/biovecs.rst b/Documentation/block/biovecs.rst index ddb867e0185b41..b9dc0c9dbee448 100644 --- a/Documentation/block/biovecs.rst +++ b/Documentation/block/biovecs.rst @@ -134,6 +134,7 @@ Usage of helpers: bio_for_each_bvec_all() bio_first_bvec_all() bio_first_page_all() + bio_first_folio_all() bio_last_bvec_all() * The following helpers iterate over single-page segment. The passed 'struct diff --git a/include/linux/bio.h b/include/linux/bio.h index c4f5b5228105fe..027ff9ab5d12d5 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -253,6 +253,11 @@ static inline struct page *bio_first_page_all(struct bio *bio) return bio_first_bvec_all(bio)->bv_page; } +static inline struct folio *bio_first_folio_all(struct bio *bio) +{ + return page_folio(bio_first_page_all(bio)); +} + static inline struct bio_vec *bio_last_bvec_all(struct bio *bio) { WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); From a3ed1e9b63a2703caab4fe63ddb560991a5f618c Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 21 Jul 2023 11:44:45 +0800 Subject: [PATCH 195/489] mm/page_io: use a folio in __end_swap_bio_write() Saves two implicit calls to compound_head(). 
Link: https://lkml.kernel.org/r/20230721034451.16412-5-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Cc: Christoph Hellwig Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Nanyong Sun Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- mm/page_io.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index a34f2cd608f75f..f2cc5ebb89b21b 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -29,7 +29,7 @@ static void __end_swap_bio_write(struct bio *bio) { - struct page *page = bio_first_page_all(bio); + struct folio *folio = bio_first_folio_all(bio); if (bio->bi_status) { /* @@ -40,13 +40,13 @@ static void __end_swap_bio_write(struct bio *bio) * * Also clear PG_reclaim to avoid folio_rotate_reclaimable() */ - set_page_dirty(page); + folio_mark_dirty(folio); pr_alert_ratelimited("Write-error on swap-device (%u:%u:%llu)\n", MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), (unsigned long long)bio->bi_iter.bi_sector); - ClearPageReclaim(page); + folio_clear_reclaim(folio); } - end_page_writeback(page); + folio_end_writeback(folio); } static void end_swap_bio_write(struct bio *bio) From bc74b53f29e1025a08e97f8e507968608a567f26 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 21 Jul 2023 11:44:46 +0800 Subject: [PATCH 196/489] mm/page_io: use a folio in __end_swap_bio_read() Saves one implicit call to compound_head(). 
Link: https://lkml.kernel.org/r/20230721034451.16412-6-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Cc: Christoph Hellwig Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Nanyong Sun Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- mm/page_io.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index f2cc5ebb89b21b..6faaaa9a6c3b53 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -57,16 +57,16 @@ static void end_swap_bio_write(struct bio *bio) static void __end_swap_bio_read(struct bio *bio) { - struct page *page = bio_first_page_all(bio); + struct folio *folio = bio_first_folio_all(bio); if (bio->bi_status) { pr_alert_ratelimited("Read-error on swap-device (%u:%u:%llu)\n", MAJOR(bio_dev(bio)), MINOR(bio_dev(bio)), (unsigned long long)bio->bi_iter.bi_sector); } else { - SetPageUptodate(page); + folio_mark_uptodate(folio); } - unlock_page(page); + folio_unlock(folio); } static void end_swap_bio_read(struct bio *bio) From 6a8c068774ad7634b43bebd97182141765398835 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 21 Jul 2023 11:44:47 +0800 Subject: [PATCH 197/489] mm/page_io: use a folio in sio_read_complete() Saves one implicit call to compound_head(). 
Link: https://lkml.kernel.org/r/20230721034451.16412-7-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Cc: Christoph Hellwig Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Nanyong Sun Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- mm/page_io.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index 6faaaa9a6c3b53..979d9ce7d9cde7 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -403,17 +403,17 @@ static void sio_read_complete(struct kiocb *iocb, long ret) if (ret == sio->len) { for (p = 0; p < sio->pages; p++) { - struct page *page = sio->bvec[p].bv_page; + struct folio *folio = page_folio(sio->bvec[p].bv_page); - SetPageUptodate(page); - unlock_page(page); + folio_mark_uptodate(folio); + folio_unlock(folio); } count_vm_events(PSWPIN, sio->pages); } else { for (p = 0; p < sio->pages; p++) { - struct page *page = sio->bvec[p].bv_page; + struct folio *folio = page_folio(sio->bvec[p].bv_page); - unlock_page(page); + folio_unlock(folio); } pr_alert_ratelimited("Read-error on swap-device\n"); } From f54fcaabd34b98921ec12501d0507e1fa1ae831b Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 21 Jul 2023 11:44:48 +0800 Subject: [PATCH 198/489] mm/page_io: use a folio in swap_writepage_bdev_sync() Saves one implicit call to compound_head(). 
Link: https://lkml.kernel.org/r/20230721034451.16412-8-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Matthew Wilcox (Oracle) Cc: Christoph Hellwig Cc: Kefeng Wang Cc: Nanyong Sun Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- mm/page_io.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index 979d9ce7d9cde7..fc6e1a41744c56 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -331,6 +331,7 @@ static void swap_writepage_bdev_sync(struct page *page, { struct bio_vec bv; struct bio bio; + struct folio *folio = page_folio(page); bio_init(&bio, sis->bdev, &bv, 1, REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc)); @@ -340,8 +341,8 @@ static void swap_writepage_bdev_sync(struct page *page, bio_associate_blkg_from_page(&bio, page); count_swpout_vm_event(page); - set_page_writeback(page); - unlock_page(page); + folio_start_writeback(folio); + folio_unlock(folio); submit_bio_wait(&bio); __end_swap_bio_write(&bio); From 2675251d5037c308a03f8ad1545b4169522cb950 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 21 Jul 2023 11:44:49 +0800 Subject: [PATCH 199/489] mm/page_io: use a folio in swap_writepage_bdev_async() Saves one implicit call to compound_head(). 
Link: https://lkml.kernel.org/r/20230721034451.16412-9-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Matthew Wilcox (Oracle) Cc: Christoph Hellwig Cc: Kefeng Wang Cc: Nanyong Sun Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- mm/page_io.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index fc6e1a41744c56..886addfabc70f8 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -352,6 +352,7 @@ static void swap_writepage_bdev_async(struct page *page, struct writeback_control *wbc, struct swap_info_struct *sis) { struct bio *bio; + struct folio *folio = page_folio(page); bio = bio_alloc(sis->bdev, 1, REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc), @@ -362,8 +363,8 @@ static void swap_writepage_bdev_async(struct page *page, bio_associate_blkg_from_page(bio, page); count_swpout_vm_event(page); - set_page_writeback(page); - unlock_page(page); + folio_start_writeback(folio); + folio_unlock(folio); submit_bio(bio); } From 9b72b134eedc6fbdf7b59c9c4764a57d14b2fea7 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 21 Jul 2023 11:44:50 +0800 Subject: [PATCH 200/489] mm/page_io: convert count_swpout_vm_event() to take in a folio Convert count_swpout_vm_event() to take in a folio. We can remove five implicit calls to compound_head() by taking in a folio. 
Link: https://lkml.kernel.org/r/20230721034451.16412-10-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Cc: Christoph Hellwig Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Nanyong Sun Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- mm/page_io.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index 886addfabc70f8..8b3845f4433102 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -205,13 +205,13 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) return 0; } -static inline void count_swpout_vm_event(struct page *page) +static inline void count_swpout_vm_event(struct folio *folio) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (unlikely(PageTransHuge(page))) + if (unlikely(folio_test_pmd_mappable(folio))) count_vm_event(THP_SWPOUT); #endif - count_vm_events(PSWPOUT, thp_nr_pages(page)); + count_vm_events(PSWPOUT, folio_nr_pages(folio)); } #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) @@ -280,7 +280,7 @@ static void sio_write_complete(struct kiocb *iocb, long ret) } } else { for (p = 0; p < sio->pages; p++) - count_swpout_vm_event(sio->bvec[p].bv_page); + count_swpout_vm_event(page_folio(sio->bvec[p].bv_page)); } for (p = 0; p < sio->pages; p++) @@ -339,7 +339,7 @@ static void swap_writepage_bdev_sync(struct page *page, __bio_add_page(&bio, page, thp_size(page), 0); bio_associate_blkg_from_page(&bio, page); - count_swpout_vm_event(page); + count_swpout_vm_event(folio); folio_start_writeback(folio); folio_unlock(folio); @@ -362,7 +362,7 @@ static void swap_writepage_bdev_async(struct page *page, __bio_add_page(bio, page, thp_size(page), 0); bio_associate_blkg_from_page(bio, page); - count_swpout_vm_event(page); + count_swpout_vm_event(folio); folio_start_writeback(folio); folio_unlock(folio); submit_bio(bio); From 98630cfdc4221e1455e13c1bd423d029c888dca6 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 21 Jul 2023 11:44:51 +0800 Subject: [PATCH 201/489] mm/page_io: convert 
bio_associate_blkg_from_page() to take in a folio Convert bio_associate_blkg_from_page() to take in a folio. We can remove two implicit calls to compound_head() by taking in a folio. Link: https://lkml.kernel.org/r/20230721034451.16412-11-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Matthew Wilcox (Oracle) Cc: Christoph Hellwig Cc: Kefeng Wang Cc: Nanyong Sun Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- mm/page_io.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index 8b3845f4433102..ff4156a44d5d72 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -215,12 +215,12 @@ static inline void count_swpout_vm_event(struct folio *folio) } #if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) -static void bio_associate_blkg_from_page(struct bio *bio, struct page *page) +static void bio_associate_blkg_from_page(struct bio *bio, struct folio *folio) { struct cgroup_subsys_state *css; struct mem_cgroup *memcg; - memcg = page_memcg(page); + memcg = folio_memcg(folio); if (!memcg) return; @@ -230,7 +230,7 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct page *page) rcu_read_unlock(); } #else -#define bio_associate_blkg_from_page(bio, page) do { } while (0) +#define bio_associate_blkg_from_page(bio, folio) do { } while (0) #endif /* CONFIG_MEMCG && CONFIG_BLK_CGROUP */ struct swap_iocb { @@ -338,7 +338,7 @@ static void swap_writepage_bdev_sync(struct page *page, bio.bi_iter.bi_sector = swap_page_sector(page); __bio_add_page(&bio, page, thp_size(page), 0); - bio_associate_blkg_from_page(&bio, page); + bio_associate_blkg_from_page(&bio, folio); count_swpout_vm_event(folio); folio_start_writeback(folio); @@ -361,7 +361,7 @@ static void swap_writepage_bdev_async(struct page *page, bio->bi_end_io = end_swap_bio_write; __bio_add_page(bio, page, thp_size(page), 0); - bio_associate_blkg_from_page(bio, page); + bio_associate_blkg_from_page(bio, folio); count_swpout_vm_event(folio); 
folio_start_writeback(folio); folio_unlock(folio); From 90717566f8f6b6761494ccfff43ea62af443fc9b Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Thu, 20 Jul 2023 21:34:36 +0200 Subject: [PATCH 202/489] mm: don't drop VMA locks in mm_drop_all_locks() Despite its name, mm_drop_all_locks() does not drop _all_ locks; the mmap lock is held write-locked by the caller, and the caller is responsible for dropping the mmap lock at a later point (which will also release the VMA locks). Calling vma_end_write_all() here is dangerous because the caller might have write-locked a VMA with the expectation that it will stay write-locked until the mmap_lock is released, as usual. This _almost_ becomes a problem in the following scenario: An anonymous VMA A and an SGX VMA B are mapped adjacent to each other. Userspace calls munmap() on a range starting at the start address of A and ending in the middle of B. Hypothetical call graph with additional notes in brackets: do_vmi_align_munmap [begin first for_each_vma_range loop] vma_start_write [on VMA A] vma_mark_detached [on VMA A] __split_vma [on VMA B] sgx_vma_open [== new->vm_ops->open] sgx_encl_mm_add __mmu_notifier_register [luckily THIS CAN'T ACTUALLY HAPPEN] mm_take_all_locks mm_drop_all_locks vma_end_write_all [drops VMA lock taken on VMA A before] vma_start_write [on VMA B] vma_mark_detached [on VMA B] [end first for_each_vma_range loop] vma_iter_clear_gfp [removes VMAs from maple tree] mmap_write_downgrade unmap_region mmap_read_unlock In this hypothetical scenario, while do_vmi_align_munmap() thinks it still holds a VMA write lock on VMA A, the VMA write lock has actually been invalidated inside __split_vma(). The call from sgx_encl_mm_add() to __mmu_notifier_register() can't actually happen here, as far as I understand, because we are duplicating an existing SGX VMA, but sgx_encl_mm_add() only calls __mmu_notifier_register() for the first SGX VMA created in a given process. So this could only happen in fork(), not on munmap(). 
But in my view it is just pure luck that this can't happen. Also, we wouldn't actually have any bad consequences from this in do_vmi_align_munmap(), because by the time the bug drops the lock on VMA A, we've already marked VMA A as detached, which makes it completely ineligible for any VMA-locked page faults. But again, that's just pure luck. So remove the vma_end_write_all(), so that VMA write locks are only ever released on mmap_write_unlock() or mmap_write_downgrade(). Also add comments to document the locking rules established by this patch. Link: https://lkml.kernel.org/r/20230720193436.454247-1-jannh@google.com Fixes: eeff9a5d47f8 ("mm/mmap: prevent pagefault handler from racing with mmu_notifier registration") Signed-off-by: Jann Horn Reviewed-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +++++ include/linux/mmap_lock.h | 8 ++++++++ mm/mmap.c | 7 ++++++- 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 84988d4ff8fb2b..93eb291181f799 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -691,6 +691,11 @@ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) return (vma->vm_lock_seq == *mm_lock_seq); } +/* + * Begin writing to a VMA. + * Exclude concurrent readers under the per-VMA lock until the currently + * write-locked mmap_lock is dropped or downgraded. + */ static inline void vma_start_write(struct vm_area_struct *vma) { int mm_lock_seq; diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index a5c63b6d7d46c1..8d38dcb6d044cb 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -73,6 +73,14 @@ static inline void mmap_assert_write_locked(struct mm_struct *mm) } #ifdef CONFIG_PER_VMA_LOCK +/* + * Drop all currently-held per-VMA locks. + * This is called from the mmap_lock implementation directly before releasing + * a write-locked mmap_lock (or downgrading it to read-locked). 
+ * This should normally NOT be called manually from other places. + * If you want to call this manually anyway, keep in mind that this will release + * *all* VMA write locks, including ones from further up the stack. + */ static inline void vma_end_write_all(struct mm_struct *mm) { mmap_assert_write_locked(mm); diff --git a/mm/mmap.c b/mm/mmap.c index 7bd1caa09dddf0..4a9466b7664837 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3642,6 +3642,12 @@ int mm_take_all_locks(struct mm_struct *mm) mutex_lock(&mm_all_locks_mutex); + /* + * vma_start_write() does not have a complement in mm_drop_all_locks() + * because vma_start_write() is always asymmetrical; it marks a VMA as + * being written to until mmap_write_unlock() or mmap_write_downgrade() + * is reached. + */ mas_for_each(&mas, vma, ULONG_MAX) { if (signal_pending(current)) goto out_unlock; @@ -3738,7 +3744,6 @@ void mm_drop_all_locks(struct mm_struct *mm) if (vma->vm_file && vma->vm_file->f_mapping) vm_unlock_mapping(vma->vm_file->f_mapping); } - vma_end_write_all(mm); mutex_unlock(&mm_all_locks_mutex); } From 361c678be709f67a5b609ec3666ff5fec7eb8baf Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 24 Jul 2023 14:31:43 -0400 Subject: [PATCH 203/489] maple_tree: add benchmarking for mas_for_each Patch series "Reduce preallocations for maple tree", v3. Initial work on preallocations showed no regression in performance during testing, but recently some users (both on [1] and off [android] list) have reported that preallocating the worst-case number of nodes has caused some slow down. This patch set addresses the number of allocations in a few ways. During munmap() most munmap() operations will remove a single VMA, so leverage the fact that the maple tree can place a single pointer at range 0 - 0 without allocating. This is done by changing the index of the VMAs to be indexed by the count, starting at 0. 
Re-introduce the entry argument to mas_preallocate() so that a more intelligent guess of the node count can be made. Implement the more intelligent guess of the node count, although there is more work to be done. During development of v2 of this patch set, I also noticed that the number of nodes being allocated for a rebalance was beyond what could possibly be needed. This is addressed in patch 0008. This patch (of 15): Add a way to test the speed of mas_for_each() to the testing code. Link: https://lkml.kernel.org/r/20230724183157.3939892-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20230724183157.3939892-2-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/test_maple_tree.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index 3207c21079184a..9c4cf5fb2b7fb4 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -44,6 +44,7 @@ atomic_t maple_tree_tests_passed; /* #define BENCH_WALK */ /* #define BENCH_MT_FOR_EACH */ /* #define BENCH_FORK */ +/* #define BENCH_MAS_FOR_EACH */ #ifdef __KERNEL__ #define mt_set_non_kernel(x) do {} while (0) @@ -1770,6 +1771,37 @@ static noinline void __init bench_mt_for_each(struct maple_tree *mt) } #endif +#if defined(BENCH_MAS_FOR_EACH) +static noinline void __init bench_mas_for_each(struct maple_tree *mt) +{ + int i, count = 1000000; + unsigned long max = 2500; + void *entry; + MA_STATE(mas, mt, 0, 0); + + for (i = 0; i < max; i += 5) { + int gap = 4; + + if (i % 30 == 0) + gap = 3; + mtree_store_range(mt, i, i + gap, xa_mk_value(i), GFP_KERNEL); + } + + rcu_read_lock(); + for (i = 0; i < count; i++) { + unsigned long j = 0; + + mas_for_each(&mas, entry, max) { + MT_BUG_ON(mt, entry != xa_mk_value(j)); + j += 5; + } + mas_set(&mas, 0); + } + rcu_read_unlock(); + +} +#endif + /* check_forking - simulate the kernel forking sequence with the tree. 
*/ static noinline void __init check_forking(struct maple_tree *mt) { @@ -3498,6 +3530,13 @@ static int __init maple_tree_seed(void) mtree_destroy(&tree); goto skip; #endif +#if defined(BENCH_MAS_FOR_EACH) +#define BENCH + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + bench_mas_for_each(&tree); + mtree_destroy(&tree); + goto skip; +#endif mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); check_iteration(&tree); From 8c314f3b55fbc42284ea1367bb2807f2accad8ae Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 24 Jul 2023 14:31:44 -0400 Subject: [PATCH 204/489] maple_tree: add benchmarking for mas_prev() Add some benchmarking functions in testing for mas_prev(). This is useful to ensure there are no regressions added during modifications. Link: https://lkml.kernel.org/r/20230724183157.3939892-3-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/test_maple_tree.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index 9c4cf5fb2b7fb4..0674aebd44230d 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -45,6 +45,7 @@ atomic_t maple_tree_tests_passed; /* #define BENCH_MT_FOR_EACH */ /* #define BENCH_FORK */ /* #define BENCH_MAS_FOR_EACH */ +/* #define BENCH_MAS_PREV */ #ifdef __KERNEL__ #define mt_set_non_kernel(x) do {} while (0) @@ -1801,7 +1802,36 @@ static noinline void __init bench_mas_for_each(struct maple_tree *mt) } #endif +#if defined(BENCH_MAS_PREV) +static noinline void __init bench_mas_prev(struct maple_tree *mt) +{ + int i, count = 1000000; + unsigned long max = 2500; + void *entry; + MA_STATE(mas, mt, 0, 0); + + for (i = 0; i < max; i += 5) { + int gap = 4; + + if (i % 30 == 0) + gap = 3; + mtree_store_range(mt, i, i + gap, xa_mk_value(i), GFP_KERNEL); + } + + rcu_read_lock(); + for (i = 0; i < count; i++) { + unsigned long j = 2495; + + mas_set(&mas, ULONG_MAX); + while ((entry = 
mas_prev(&mas, 0)) != NULL) { + MT_BUG_ON(mt, entry != xa_mk_value(j)); + j -= 5; + } + } + rcu_read_unlock(); +} +#endif /* check_forking - simulate the kernel forking sequence with the tree. */ static noinline void __init check_forking(struct maple_tree *mt) { @@ -3537,6 +3567,13 @@ static int __init maple_tree_seed(void) mtree_destroy(&tree); goto skip; #endif +#if defined(BENCH_MAS_PREV) +#define BENCH + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + bench_mas_prev(&tree); + mtree_destroy(&tree); + goto skip; +#endif mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); check_iteration(&tree); From fd892593d44d8b649caf30a67f0c7696d976d901 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 24 Jul 2023 14:31:45 -0400 Subject: [PATCH 205/489] mm: change do_vmi_align_munmap() tracking of VMAs to remove The majority of the calls to munmap a vm range are within a single vma. The maple tree is able to store a single entry at 0, with a size of 1 as a pointer and avoid any allocations. Change do_vmi_align_munmap() to store the VMAs being munmap()'ed into a tree indexed by the count. This will leverage the ability to store the first entry without a node allocation. Storing the entries into a tree by the count and not the vma start and end means changing the functions which iterate over the entries. Update unmap_vmas() and free_pgtables() to take a maple state and a tree end address to support this functionality. Passing through the same maple state to unmap_vmas() and free_pgtables() means the state needs to be reset between calls. This happens in the static unmap_region() and exit_mmap(). Link: https://lkml.kernel.org/r/20230724183157.3939892-4-Liam.Howlett@oracle.com Signed-off-by: Liam R.
Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 ++-- mm/internal.h | 2 +- mm/memory.c | 16 +++++++--------- mm/mmap.c | 41 ++++++++++++++++++++++++----------------- 4 files changed, 34 insertions(+), 29 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 93eb291181f799..ded514ee2588dc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2287,9 +2287,9 @@ static inline void zap_vma_pages(struct vm_area_struct *vma) zap_page_range_single(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL); } -void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, +void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *start_vma, unsigned long start, - unsigned long end, bool mm_wr_locked); + unsigned long end, unsigned long tree_end, bool mm_wr_locked); struct mmu_notifier_range; diff --git a/mm/internal.h b/mm/internal.h index 483add0bfb289d..7d11ebe5d11c47 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -109,7 +109,7 @@ bool __folio_end_writeback(struct folio *folio); void deactivate_file_folio(struct folio *folio); void folio_activate(struct folio *folio); -void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, +void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling, bool mm_wr_locked); void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte); diff --git a/mm/memory.c b/mm/memory.c index 3e16f06373765f..ed4807deec893b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -361,12 +361,10 @@ void free_pgd_range(struct mmu_gather *tlb, } while (pgd++, addr = next, addr != end); } -void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, +void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *vma, unsigned long floor, unsigned long ceiling, bool mm_wr_locked) { - MA_STATE(mas, mt, vma->vm_end, vma->vm_end); - do { unsigned 
long addr = vma->vm_start; struct vm_area_struct *next; @@ -375,7 +373,7 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, * Note: USER_PGTABLES_CEILING may be passed as ceiling and may * be 0. This will underflow and is okay. */ - next = mas_find(&mas, ceiling - 1); + next = mas_find(mas, ceiling - 1); /* * Hide vma from rmap and truncate_pagecache before freeing @@ -396,7 +394,7 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, while (next && next->vm_start <= vma->vm_end + PMD_SIZE && !is_vm_hugetlb_page(next)) { vma = next; - next = mas_find(&mas, ceiling - 1); + next = mas_find(mas, ceiling - 1); if (mm_wr_locked) vma_start_write(vma); unlink_anon_vmas(vma); @@ -1713,9 +1711,10 @@ static void unmap_single_vma(struct mmu_gather *tlb, * ensure that any thus-far unmapped pages are flushed before unmap_vmas() * drops the lock and schedules. */ -void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, +void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *vma, unsigned long start_addr, - unsigned long end_addr, bool mm_wr_locked) + unsigned long end_addr, unsigned long tree_end, + bool mm_wr_locked) { struct mmu_notifier_range range; struct zap_details details = { @@ -1723,7 +1722,6 @@ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, /* Careful - we need to zap private pages too! 
*/ .even_cows = true, }; - MA_STATE(mas, mt, vma->vm_end, vma->vm_end); mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm, start_addr, end_addr); @@ -1731,7 +1729,7 @@ void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, do { unmap_single_vma(tlb, vma, start_addr, end_addr, &details, mm_wr_locked); - } while ((vma = mas_find(&mas, end_addr - 1)) != NULL); + } while ((vma = mas_find(mas, tree_end - 1)) != NULL); mmu_notifier_invalidate_range_end(&range); } diff --git a/mm/mmap.c b/mm/mmap.c index 4a9466b7664837..5212a0b66b8f10 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -76,10 +76,10 @@ int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS; static bool ignore_rlimit_data; core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644); -static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, +static void unmap_region(struct mm_struct *mm, struct ma_state *mas, struct vm_area_struct *vma, struct vm_area_struct *prev, struct vm_area_struct *next, unsigned long start, - unsigned long end, bool mm_wr_locked); + unsigned long end, unsigned long tree_end, bool mm_wr_locked); static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) { @@ -2293,18 +2293,20 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas) * * Called with the mm semaphore held. */ -static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, +static void unmap_region(struct mm_struct *mm, struct ma_state *mas, struct vm_area_struct *vma, struct vm_area_struct *prev, - struct vm_area_struct *next, - unsigned long start, unsigned long end, bool mm_wr_locked) + struct vm_area_struct *next, unsigned long start, + unsigned long end, unsigned long tree_end, bool mm_wr_locked) { struct mmu_gather tlb; + unsigned long mt_start = mas->index; lru_add_drain(); tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); - unmap_vmas(&tlb, mt, vma, start, end, mm_wr_locked); - free_pgtables(&tlb, mt, vma, prev ? 
prev->vm_end : FIRST_USER_ADDRESS, + unmap_vmas(&tlb, mas, vma, start, end, tree_end, mm_wr_locked); + mas_set(mas, mt_start); + free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, next ? next->vm_start : USER_PGTABLES_CEILING, mm_wr_locked); tlb_finish_mmu(&tlb); @@ -2472,7 +2474,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, goto end_split_failed; } vma_start_write(next); - mas_set_range(&mas_detach, next->vm_start, next->vm_end - 1); + mas_set(&mas_detach, count); error = mas_store_gfp(&mas_detach, next, GFP_KERNEL); if (error) goto munmap_gather_failed; @@ -2511,17 +2513,17 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) /* Make sure no VMAs are about to be lost. */ { - MA_STATE(test, &mt_detach, start, end - 1); + MA_STATE(test, &mt_detach, 0, 0); struct vm_area_struct *vma_mas, *vma_test; int test_count = 0; vma_iter_set(vmi, start); rcu_read_lock(); - vma_test = mas_find(&test, end - 1); + vma_test = mas_find(&test, count - 1); for_each_vma_range(*vmi, vma_mas, end) { BUG_ON(vma_mas != vma_test); test_count++; - vma_test = mas_next(&test, end - 1); + vma_test = mas_next(&test, count - 1); } rcu_read_unlock(); BUG_ON(count != test_count); @@ -2542,9 +2544,11 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, * We can free page tables without write-locking mmap_lock because VMAs * were isolated before we downgraded mmap_lock. 
*/ - unmap_region(mm, &mt_detach, vma, prev, next, start, end, !unlock); + mas_set(&mas_detach, 1); + unmap_region(mm, &mas_detach, vma, prev, next, start, end, count, + !unlock); /* Statistics and freeing VMAs */ - mas_set(&mas_detach, start); + mas_set(&mas_detach, 0); remove_mt(mm, &mas_detach); validate_mm(mm); if (unlock) @@ -2864,9 +2868,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr, fput(vma->vm_file); vma->vm_file = NULL; + vma_iter_set(&vmi, vma->vm_end); /* Undo any partial mapping done by a device driver. */ - unmap_region(mm, &mm->mm_mt, vma, prev, next, vma->vm_start, - vma->vm_end, true); + unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start, + vma->vm_end, vma->vm_end, true); } if (file && (vm_flags & VM_SHARED)) mapping_unmap_writable(file->f_mapping); @@ -3185,7 +3190,7 @@ void exit_mmap(struct mm_struct *mm) tlb_gather_mmu_fullmm(&tlb, mm); /* update_hiwater_rss(mm) here? but nobody should be looking */ /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */ - unmap_vmas(&tlb, &mm->mm_mt, vma, 0, ULONG_MAX, false); + unmap_vmas(&tlb, &mas, vma, 0, ULONG_MAX, ULONG_MAX, false); mmap_read_unlock(mm); /* @@ -3195,7 +3200,8 @@ void exit_mmap(struct mm_struct *mm) set_bit(MMF_OOM_SKIP, &mm->flags); mmap_write_lock(mm); mt_clear_in_rcu(&mm->mm_mt); - free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS, + mas_set(&mas, vma->vm_end); + free_pgtables(&tlb, &mas, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING, true); tlb_finish_mmu(&tlb); @@ -3204,6 +3210,7 @@ void exit_mmap(struct mm_struct *mm) * enabled, without holding any MM locks besides the unreachable * mmap_write_lock. */ + mas_set(&mas, vma->vm_end); do { if (vma->vm_flags & VM_ACCOUNT) nr_accounted += vma_pages(vma); From 445a2ea0ef0e0e69812218b2c896a23443466625 Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Mon, 24 Jul 2023 14:31:46 -0400 Subject: [PATCH 206/489] mm: remove prev check from do_vmi_align_munmap() If prev does not exist, the vma iterator will be set to MAS_NONE, which will be treated as MAS_START when mas_next() or mas_find() is used. In this case, the next caller will be the vma iterator, which uses mas_find() under the hood and will now do what the user expects. Link: https://lkml.kernel.org/r/20230724183157.3939892-5-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/mmap.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 5212a0b66b8f10..5fbc7d71d60c60 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2459,8 +2459,6 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, } prev = vma_prev(vmi); - if (unlikely((!prev))) - vma_iter_set(vmi, start); /* * Detach a range of VMAs from the mm. Using next as a temp variable as From c1297987cc2ada57a7faea7985c2334548d110f9 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 24 Jul 2023 14:31:47 -0400 Subject: [PATCH 207/489] maple_tree: introduce __mas_set_range() mas_set_range() resets the node to MAS_START, which will cause a re-walk of the tree to the range. This is unnecessary when the maple state is already at the correct location of the write. Add a function that only sets the range to avoid unnecessary re-walking of the tree. Link: https://lkml.kernel.org/r/20230724183157.3939892-6-Liam.Howlett@oracle.com Signed-off-by: Liam R.
Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 949f911bf955c0..e10db656e31c6e 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -539,6 +539,22 @@ static inline void mas_reset(struct ma_state *mas) */ #define mas_for_each(__mas, __entry, __max) \ while (((__entry) = mas_find((__mas), (__max))) != NULL) +/** + * __mas_set_range() - Set up Maple Tree operation state to a sub-range of the + * current location. + * @mas: Maple Tree operation state. + * @start: New start of range in the Maple Tree. + * @last: New end of range in the Maple Tree. + * + * set the internal maple state values to a sub-range. + * Please use mas_set_range() if you do not know where you are in the tree. + */ +static inline void __mas_set_range(struct ma_state *mas, unsigned long start, + unsigned long last) +{ + mas->index = start; + mas->last = last; +} /** * mas_set_range() - Set up Maple Tree operation state for a different index. @@ -553,9 +569,8 @@ static inline void mas_reset(struct ma_state *mas) static inline void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) { - mas->index = start; - mas->last = last; - mas->node = MAS_START; + __mas_set_range(mas, start, last); + mas->node = MAS_START; } /** From 53bee98d004fcd7062f2fe056720704e947e6000 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 24 Jul 2023 14:31:48 -0400 Subject: [PATCH 208/489] mm: remove re-walk from mmap_region() Using vma_iter_set() will reset the tree and cause a re-walk. Use vma_iter_config() to set the write to a sub-set of the range. Change the file case to also use vma_iter_config() so that the end is correctly set. Link: https://lkml.kernel.org/r/20230724183157.3939892-7-Liam.Howlett@oracle.com Signed-off-by: Liam R.
Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/internal.h | 8 ++++++++ mm/mmap.c | 15 ++++++++++----- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 7d11ebe5d11c47..c5ba08f55deb8a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1041,6 +1041,14 @@ static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) return !(vma->vm_flags & VM_SOFTDIRTY); } +static inline void vma_iter_config(struct vma_iterator *vmi, + unsigned long index, unsigned long last) +{ + MAS_BUG_ON(&vmi->mas, vmi->mas.node != MAS_START && + (vmi->mas.index > index || vmi->mas.last < index)); + __mas_set_range(&vmi->mas, index, last - 1); +} + /* * VMA Iterator functions shared between nommu and mmap */ diff --git a/mm/mmap.c b/mm/mmap.c index 5fbc7d71d60c60..a1a59487390e53 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2676,8 +2676,11 @@ unsigned long mmap_region(struct file *file, unsigned long addr, next = vma_next(&vmi); prev = vma_prev(&vmi); - if (vm_flags & VM_SPECIAL) + if (vm_flags & VM_SPECIAL) { + if (prev) + vma_iter_next_range(&vmi); goto cannot_expand; + } /* Attempt to expand an old mapping */ /* Check next */ @@ -2698,6 +2701,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, merge_start = prev->vm_start; vma = prev; vm_pgoff = prev->vm_pgoff; + } else if (prev) { + vma_iter_next_range(&vmi); } @@ -2708,9 +2713,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, goto expanded; } + if (vma == prev) + vma_iter_set(&vmi, addr); cannot_expand: - if (prev) - vma_iter_next_range(&vmi); /* * Determine the object being mapped and call the appropriate @@ -2723,7 +2728,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, goto unacct_error; } - vma_iter_set(&vmi, addr); + vma_iter_config(&vmi, addr, end); vma->vm_start = addr; vma->vm_end = end; vm_flags_init(vma, vm_flags); @@ -2750,7 +2755,7 @@ unsigned long mmap_region(struct file *file, 
unsigned long addr, if (WARN_ON((addr != vma->vm_start))) goto close_and_free_vma; - vma_iter_set(&vmi, addr); + vma_iter_config(&vmi, addr, end); /* * If vm_flags changed after call_mmap(), we should try merge * vma again as we may succeed this time. From da0892547b101df6e13255b378380d077975368d Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 24 Jul 2023 14:31:49 -0400 Subject: [PATCH 209/489] maple_tree: re-introduce entry to mas_preallocate() arguments The current preallocation strategy is to preallocate the absolute worst-case allocation for a tree modification. The entry (or NULL) is needed to know how many nodes are needed to write to the tree. Start by adding the argument to the mas_preallocate() definition. Link: https://lkml.kernel.org/r/20230724183157.3939892-8-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 2 +- lib/maple_tree.c | 3 ++- mm/internal.h | 2 +- mm/mmap.c | 4 ++-- tools/testing/radix-tree/maple.c | 32 ++++++++++++++++---------------- 5 files changed, 22 insertions(+), 21 deletions(-) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index e10db656e31c6e..c962af1886813f 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -466,7 +466,7 @@ void *mas_find(struct ma_state *mas, unsigned long max); void *mas_find_range(struct ma_state *mas, unsigned long max); void *mas_find_rev(struct ma_state *mas, unsigned long min); void *mas_find_range_rev(struct ma_state *mas, unsigned long max); -int mas_preallocate(struct ma_state *mas, gfp_t gfp); +int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp); bool mas_is_err(struct ma_state *mas); bool mas_nomem(struct ma_state *mas, gfp_t gfp); diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 3b6f8c8dac6501..0d7e30c7d99934 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5529,11 +5529,12 @@ 
EXPORT_SYMBOL_GPL(mas_store_prealloc); /** * mas_preallocate() - Preallocate enough nodes for a store operation * @mas: The maple state + * @entry: The entry that will be stored * @gfp: The GFP_FLAGS to use for allocations. * * Return: 0 on success, -ENOMEM if memory could not be allocated. */ -int mas_preallocate(struct ma_state *mas, gfp_t gfp) +int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) { int ret; diff --git a/mm/internal.h b/mm/internal.h index c5ba08f55deb8a..65f646c2ccf3d1 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1054,7 +1054,7 @@ static inline void vma_iter_config(struct vma_iterator *vmi, */ static inline int vma_iter_prealloc(struct vma_iterator *vmi) { - return mas_preallocate(&vmi->mas, GFP_KERNEL); + return mas_preallocate(&vmi->mas, NULL, GFP_KERNEL); } static inline void vma_iter_clear(struct vma_iterator *vmi, diff --git a/mm/mmap.c b/mm/mmap.c index a1a59487390e53..05e051f8727317 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1960,7 +1960,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address) /* Check that both stack segments have the same anon_vma? */ } - if (mas_preallocate(&mas, GFP_KERNEL)) + if (mas_preallocate(&mas, vma, GFP_KERNEL)) return -ENOMEM; /* We must make sure the anon_vma is allocated. */ @@ -2050,7 +2050,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) return -ENOMEM; } - if (mas_preallocate(&mas, GFP_KERNEL)) + if (mas_preallocate(&mas, vma, GFP_KERNEL)) return -ENOMEM; /* We must make sure the anon_vma is allocated. 
*/ diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 9901ae821911a2..c6c1c5109deb91 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -35458,7 +35458,7 @@ static noinline void __init check_prealloc(struct maple_tree *mt) for (i = 0; i <= max; i++) mtree_test_store_range(mt, i * 10, i * 10 + 5, &i); - MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35467,18 +35467,18 @@ static noinline void __init check_prealloc(struct maple_tree *mt) allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); MT_BUG_ON(mt, allocated != 1 + height * 3); - MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); mas_destroy(&mas); allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35487,26 +35487,26 @@ static noinline void __init check_prealloc(struct maple_tree *mt) MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); - MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); mas_destroy(&mas); allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); 
MT_BUG_ON(mt, allocated == 0); MT_BUG_ON(mt, allocated != 1 + height * 3); mn = mas_pop_node(&mas); MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); - MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); mas_destroy(&mas); allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); - MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35515,12 +35515,12 @@ static noinline void __init check_prealloc(struct maple_tree *mt) MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); mas_push_node(&mas, mn); MT_BUG_ON(mt, mas_allocated(&mas) != allocated); - MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); mas_destroy(&mas); allocated = mas_allocated(&mas); MT_BUG_ON(mt, allocated != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35528,21 +35528,21 @@ static noinline void __init check_prealloc(struct maple_tree *mt) mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); MT_BUG_ON(mt, allocated != 1 + height * 3); mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); - MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); MT_BUG_ON(mt, allocated != 1 + height * 
3); mas_store_prealloc(&mas, ptr); - MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35550,14 +35550,14 @@ static noinline void __init check_prealloc(struct maple_tree *mt) mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); mt_set_non_kernel(1); - MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL & GFP_NOWAIT) == 0); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL & GFP_NOWAIT) == 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated != 0); mas_destroy(&mas); - MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated == 0); @@ -35565,7 +35565,7 @@ static noinline void __init check_prealloc(struct maple_tree *mt) mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); mt_set_non_kernel(1); - MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL & GFP_NOWAIT) == 0); + MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL & GFP_NOWAIT) == 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); MT_BUG_ON(mt, allocated != 0); From c108df767fb786586274b2435473885151d6f360 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 24 Jul 2023 14:31:50 -0400 Subject: [PATCH 210/489] maple_tree: adjust node allocation on mas_rebalance() mas_rebalance() is called to rebalance an insufficient node into a single node or two sufficient nodes. The preallocation estimate is always too many in this case as the height of the tree will never grow and there is no possibility to have a three way split in this case, so revise the node allocation count. Link: https://lkml.kernel.org/r/20230724183157.3939892-9-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 0d7e30c7d99934..494f884ef17fc4 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -3136,7 +3136,7 @@ static inline int mas_rebalance(struct ma_state *mas, * tries to combine the data in the same way. If one node contains the * entire range of the tree, then that node is used as a new root node. */ - mas_node_count(mas, 1 + empty_count * 3); + mas_node_count(mas, empty_count * 2 - 1); if (mas_is_err(mas)) return 0; From f72cf24a868675280bc555e95abc3639777f8d82 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 24 Jul 2023 14:31:51 -0400 Subject: [PATCH 211/489] mm: use vma_iter_clear_gfp() in nommu Move the definition of vma_iter_clear_gfp() from mmap.c to internal.h so it can be used in the nommu code. This will reduce node preallocations in nommu. Link: https://lkml.kernel.org/r/20230724183157.3939892-10-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/internal.h | 12 ++++++++++++ mm/mmap.c | 12 ------------ mm/nommu.c | 12 ++++-------- 3 files changed, 16 insertions(+), 20 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 65f646c2ccf3d1..2f35c0ef7b7f76 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1064,6 +1064,18 @@ static inline void vma_iter_clear(struct vma_iterator *vmi, mas_store_prealloc(&vmi->mas, NULL); } +static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, + unsigned long start, unsigned long end, gfp_t gfp) +{ + vmi->mas.index = start; + vmi->mas.last = end - 1; + mas_store_gfp(&vmi->mas, NULL, gfp); + if (unlikely(mas_is_err(&vmi->mas))) + return -ENOMEM; + + return 0; +} + static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi) { return mas_walk(&vmi->mas); diff --git a/mm/mmap.c b/mm/mmap.c index 05e051f8727317..1ddb024995df2e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -154,18 +154,6 @@ static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi, return mas_prev(&vmi->mas, min); } -static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, - unsigned long start, unsigned long end, gfp_t gfp) -{ - vmi->mas.index = start; - vmi->mas.last = end - 1; - mas_store_gfp(&vmi->mas, NULL, gfp); - if (unlikely(mas_is_err(&vmi->mas))) - return -ENOMEM; - - return 0; -} - /* * check_brk_limits() - Use platform specific check of range & verify mlock * limits. 
diff --git a/mm/nommu.c b/mm/nommu.c index 9826f6101a05c0..418cc0669c1f3b 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1396,17 +1396,13 @@ static int vmi_shrink_vma(struct vma_iterator *vmi, /* adjust the VMA's pointers, which may reposition it in the MM's tree * and list */ - if (vma_iter_prealloc(vmi)) { - pr_warn("Allocation of vma tree for process %d failed\n", - current->pid); - return -ENOMEM; - } - if (from > vma->vm_start) { - vma_iter_clear(vmi, from, vma->vm_end); + if (vma_iter_clear_gfp(vmi, from, vma->vm_end, GFP_KERNEL)) + return -ENOMEM; vma->vm_end = from; } else { - vma_iter_clear(vmi, vma->vm_start, to); + if (vma_iter_clear_gfp(vmi, vma->vm_start, to, GFP_KERNEL)) + return -ENOMEM; vma->vm_start = to; } From b5df09226450165c434084d346fcb6d4858b0d52 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 24 Jul 2023 14:31:52 -0400 Subject: [PATCH 212/489] mm: set up vma iterator for vma_iter_prealloc() calls Set the correct limits for vma_iter_prealloc() calls so that the maple tree can be smarter about how many nodes are needed. Link: https://lkml.kernel.org/r/20230724183157.3939892-11-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- fs/exec.c | 1 + mm/internal.h | 18 ++++++-------- mm/mmap.c | 69 +++++++++++++++++++++++++++++++-------------------- mm/nommu.c | 33 +++++++++++------------- 4 files changed, 64 insertions(+), 57 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 1a827d55ba9475..0b9484358a49d7 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -701,6 +701,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) if (vma != vma_next(&vmi)) return -EFAULT; + vma_iter_prev_range(&vmi); /* * cover the whole range: [new_start, old_end) */ diff --git a/mm/internal.h b/mm/internal.h index 2f35c0ef7b7f76..5a03bc4782a28b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1052,23 +1052,21 @@ static inline void vma_iter_config(struct vma_iterator *vmi, /* * VMA Iterator functions shared between nommu and mmap */ -static inline int vma_iter_prealloc(struct vma_iterator *vmi) +static inline int vma_iter_prealloc(struct vma_iterator *vmi, + struct vm_area_struct *vma) { - return mas_preallocate(&vmi->mas, NULL, GFP_KERNEL); + return mas_preallocate(&vmi->mas, vma, GFP_KERNEL); } -static inline void vma_iter_clear(struct vma_iterator *vmi, - unsigned long start, unsigned long end) +static inline void vma_iter_clear(struct vma_iterator *vmi) { - mas_set_range(&vmi->mas, start, end - 1); mas_store_prealloc(&vmi->mas, NULL); } static inline int vma_iter_clear_gfp(struct vma_iterator *vmi, unsigned long start, unsigned long end, gfp_t gfp) { - vmi->mas.index = start; - vmi->mas.last = end - 1; + __mas_set_range(&vmi->mas, start, end - 1); mas_store_gfp(&vmi->mas, NULL, gfp); if (unlikely(mas_is_err(&vmi->mas))) return -ENOMEM; @@ -1105,8 +1103,7 @@ static inline void vma_iter_store(struct vma_iterator *vmi, ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) vma_iter_invalidate(vmi); - vmi->mas.index = vma->vm_start; - vmi->mas.last = vma->vm_end - 1; + __mas_set_range(&vmi->mas, 
vma->vm_start, vma->vm_end - 1); mas_store_prealloc(&vmi->mas, vma); } @@ -1117,8 +1114,7 @@ static inline int vma_iter_store_gfp(struct vma_iterator *vmi, ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) vma_iter_invalidate(vmi); - vmi->mas.index = vma->vm_start; - vmi->mas.last = vma->vm_end - 1; + __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1); mas_store_gfp(&vmi->mas, vma, gfp); if (unlikely(mas_is_err(&vmi->mas))) return -ENOMEM; diff --git a/mm/mmap.c b/mm/mmap.c index 1ddb024995df2e..3f10e708ba7245 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -397,7 +397,8 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) VMA_ITERATOR(vmi, mm, 0); struct address_space *mapping = NULL; - if (vma_iter_prealloc(&vmi)) + vma_iter_config(&vmi, vma->vm_start, vma->vm_end); + if (vma_iter_prealloc(&vmi, vma)) return -ENOMEM; vma_iter_store(&vmi, vma); @@ -649,19 +650,16 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, /* Only handles expanding */ VM_WARN_ON(vma->vm_start < start || vma->vm_end > end); - if (vma_iter_prealloc(vmi)) + /* Note: vma iterator must be pointing to 'start' */ + vma_iter_config(vmi, start, end); + if (vma_iter_prealloc(vmi, vma)) goto nomem; vma_prepare(&vp); vma_adjust_trans_huge(vma, start, end, 0); - /* VMA iterator points to previous, so set to start if necessary */ - if (vma_iter_addr(vmi) != start) - vma_iter_set(vmi, start); - vma->vm_start = start; vma->vm_end = end; vma->vm_pgoff = pgoff; - /* Note: mas must be pointing to the expanding VMA */ vma_iter_store(vmi, vma); vma_complete(&vp, vmi, vma->vm_mm); @@ -687,19 +685,19 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, WARN_ON((vma->vm_start != start) && (vma->vm_end != end)); - if (vma_iter_prealloc(vmi)) + if (vma->vm_start < start) + vma_iter_config(vmi, vma->vm_start, start); + else + vma_iter_config(vmi, end, vma->vm_end); + + if (vma_iter_prealloc(vmi, NULL)) return -ENOMEM; 
init_vma_prep(&vp, vma); vma_prepare(&vp); vma_adjust_trans_huge(vma, start, end, 0); - if (vma->vm_start < start) - vma_iter_clear(vmi, vma->vm_start, start); - - if (vma->vm_end > end) - vma_iter_clear(vmi, end, vma->vm_end); - + vma_iter_clear(vmi); vma->vm_start = start; vma->vm_end = end; vma->vm_pgoff = pgoff; @@ -973,7 +971,17 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, if (err) return NULL; - if (vma_iter_prealloc(vmi)) + if (vma_start < vma->vm_start || vma_end > vma->vm_end) + vma_expanded = true; + + if (vma_expanded) { + vma_iter_config(vmi, vma_start, vma_end); + } else { + vma_iter_config(vmi, adjust->vm_start + adj_start, + adjust->vm_end); + } + + if (vma_iter_prealloc(vmi, vma)) return NULL; init_multi_vma_prep(&vp, vma, adjust, remove, remove2); @@ -982,8 +990,6 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, vma_prepare(&vp); vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start); - if (vma_start < vma->vm_start || vma_end > vma->vm_end) - vma_expanded = true; vma->vm_start = vma_start; vma->vm_end = vma_end; @@ -1923,7 +1929,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address) struct vm_area_struct *next; unsigned long gap_addr; int error = 0; - MA_STATE(mas, &mm->mm_mt, 0, 0); + MA_STATE(mas, &mm->mm_mt, vma->vm_start, address); if (!(vma->vm_flags & VM_GROWSUP)) return -EFAULT; @@ -1948,6 +1954,10 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address) /* Check that both stack segments have the same anon_vma? */ } + if (next) + mas_prev_range(&mas, address); + + __mas_set_range(&mas, vma->vm_start, address - 1); if (mas_preallocate(&mas, vma, GFP_KERNEL)) return -ENOMEM; @@ -1993,7 +2003,6 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address) anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; /* Overwrite old entry in mtree. 
*/ - mas_set_range(&mas, vma->vm_start, address - 1); mas_store_prealloc(&mas, vma); anon_vma_interval_tree_post_update_vma(vma); spin_unlock(&mm->page_table_lock); @@ -2038,6 +2047,10 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) return -ENOMEM; } + if (prev) + mas_next_range(&mas, vma->vm_start); + + __mas_set_range(&mas, address, vma->vm_end - 1); if (mas_preallocate(&mas, vma, GFP_KERNEL)) return -ENOMEM; @@ -2084,7 +2097,6 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) vma->vm_start = address; vma->vm_pgoff -= grow; /* Overwrite old entry in mtree. */ - mas_set_range(&mas, address, vma->vm_end - 1); mas_store_prealloc(&mas, vma); anon_vma_interval_tree_post_update_vma(vma); spin_unlock(&mm->page_table_lock); @@ -2325,10 +2337,6 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, if (!new) return -ENOMEM; - err = -ENOMEM; - if (vma_iter_prealloc(vmi)) - goto out_free_vma; - if (new_below) { new->vm_end = addr; } else { @@ -2336,6 +2344,11 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); } + err = -ENOMEM; + vma_iter_config(vmi, new->vm_start, new->vm_end); + if (vma_iter_prealloc(vmi, new)) + goto out_free_vma; + err = vma_dup_policy(vma, new); if (err) goto out_free_vmi; @@ -2693,7 +2706,6 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vma_iter_next_range(&vmi); } - /* Actually expand, if possible */ if (vma && !vma_expand(&vmi, vma, merge_start, merge_end, vm_pgoff, next)) { @@ -2790,7 +2802,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, goto close_and_free_vma; error = -ENOMEM; - if (vma_iter_prealloc(&vmi)) + if (vma_iter_prealloc(&vmi, vma)) goto close_and_free_vma; /* Lock the VMA since it is modified after insertion into VMA tree */ @@ -3053,7 +3065,8 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, if (vma && vma->vm_end 
== addr && !vma_policy(vma) && can_vma_merge_after(vma, flags, NULL, NULL, addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) { - if (vma_iter_prealloc(vmi)) + vma_iter_config(vmi, vma->vm_start, addr + len); + if (vma_iter_prealloc(vmi, vma)) goto unacct_fail; init_vma_prep(&vp, vma); @@ -3068,6 +3081,8 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, goto out; } + if (vma) + vma_iter_next_range(vmi); /* create a vma struct for an anonymous mapping */ vma = vm_area_alloc(mm); if (!vma) diff --git a/mm/nommu.c b/mm/nommu.c index 418cc0669c1f3b..1fe0ee2398600c 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -583,7 +583,8 @@ static int delete_vma_from_mm(struct vm_area_struct *vma) { VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_start); - if (vma_iter_prealloc(&vmi)) { + vma_iter_config(&vmi, vma->vm_start, vma->vm_end); + if (vma_iter_prealloc(&vmi, vma)) { pr_warn("Allocation of vma tree for process %d failed\n", current->pid); return -ENOMEM; @@ -591,7 +592,7 @@ static int delete_vma_from_mm(struct vm_area_struct *vma) cleanup_vma_from_mm(vma); /* remove from the MM's tree and list */ - vma_iter_clear(&vmi, vma->vm_start, vma->vm_end); + vma_iter_clear(&vmi); return 0; } /* @@ -1054,9 +1055,6 @@ unsigned long do_mmap(struct file *file, if (!vma) goto error_getting_vma; - if (vma_iter_prealloc(&vmi)) - goto error_vma_iter_prealloc; - region->vm_usage = 1; region->vm_flags = vm_flags; region->vm_pgoff = pgoff; @@ -1198,6 +1196,10 @@ unsigned long do_mmap(struct file *file, share: BUG_ON(!vma->vm_region); + vma_iter_config(&vmi, vma->vm_start, vma->vm_end); + if (vma_iter_prealloc(&vmi, vma)) + goto error_just_free; + setup_vma_to_mm(vma, current->mm); current->mm->map_count++; /* add the VMA to the tree */ @@ -1244,14 +1246,6 @@ unsigned long do_mmap(struct file *file, len, current->pid); show_mem(); return -ENOMEM; - -error_vma_iter_prealloc: - kmem_cache_free(vm_region_jar, region); - vm_area_free(vma); - pr_warn("Allocation of vma tree for process 
%d failed\n", current->pid); - show_mem(); - return -ENOMEM; - } unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, @@ -1336,12 +1330,6 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, if (!new) goto err_vma_dup; - if (vma_iter_prealloc(vmi)) { - pr_warn("Allocation of vma tree for process %d failed\n", - current->pid); - goto err_vmi_preallocate; - } - /* most fields are the same, copy all, and then fixup */ *region = *vma->vm_region; new->vm_region = region; @@ -1355,6 +1343,13 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, region->vm_pgoff = new->vm_pgoff += npages; } + vma_iter_config(vmi, new->vm_start, new->vm_end); + if (vma_iter_prealloc(vmi, vma)) { + pr_warn("Allocation of vma tree for process %d failed\n", + current->pid); + goto err_vmi_preallocate; + } + if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new); From a7496ad529dfd96e37219849bddcda121d133536 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 24 Jul 2023 14:31:53 -0400 Subject: [PATCH 213/489] maple_tree: move mas_wr_end_piv() below mas_wr_extend_null() Relocate it and call mas_wr_extend_null() from within mas_wr_end_piv(). Extending the NULL may affect the end pivot value so call mas_wr_extend_null() from within mas_wr_end_piv() to keep it all together. Link: https://lkml.kernel.org/r/20230724183157.3939892-12-Liam.Howlett@oracle.com Signed-off-by: Liam R.
Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 494f884ef17fc4..db61cdd8a649ff 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4180,18 +4180,6 @@ static inline bool mas_wr_slot_store(struct ma_wr_state *wr_mas) return true; } -static inline void mas_wr_end_piv(struct ma_wr_state *wr_mas) -{ - while ((wr_mas->offset_end < wr_mas->node_end) && - (wr_mas->mas->last > wr_mas->pivots[wr_mas->offset_end])) - wr_mas->offset_end++; - - if (wr_mas->offset_end < wr_mas->node_end) - wr_mas->end_piv = wr_mas->pivots[wr_mas->offset_end]; - else - wr_mas->end_piv = wr_mas->mas->max; -} - static inline void mas_wr_extend_null(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; @@ -4228,6 +4216,21 @@ static inline void mas_wr_extend_null(struct ma_wr_state *wr_mas) } } +static inline void mas_wr_end_piv(struct ma_wr_state *wr_mas) +{ + while ((wr_mas->offset_end < wr_mas->node_end) && + (wr_mas->mas->last > wr_mas->pivots[wr_mas->offset_end])) + wr_mas->offset_end++; + + if (wr_mas->offset_end < wr_mas->node_end) + wr_mas->end_piv = wr_mas->pivots[wr_mas->offset_end]; + else + wr_mas->end_piv = wr_mas->mas->max; + + if (!wr_mas->entry) + mas_wr_extend_null(wr_mas); +} + static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; @@ -4371,10 +4374,6 @@ static inline void *mas_wr_store_entry(struct ma_wr_state *wr_mas) /* At this point, we are at the leaf node that needs to be altered. */ mas_wr_end_piv(wr_mas); - - if (!wr_mas->entry) - mas_wr_extend_null(wr_mas); - /* New root for a single pointer */ if (unlikely(!mas->index && mas->last == ULONG_MAX)) { mas_new_root(mas, wr_mas->entry); From 0b8bb544b1a7051c9072c26ca03a7840fb9573ad Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Mon, 24 Jul 2023 14:31:54 -0400 Subject: [PATCH 214/489] maple_tree: update mas_preallocate() testing Since the mas_preallocate() calculation has been updated to be more precise, the testing must also be updated to check for what is expected. Link: https://lkml.kernel.org/r/20230724183157.3939892-13-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- tools/testing/radix-tree/maple.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index c6c1c5109deb91..e5da1cad70baf6 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -35458,6 +35458,8 @@ static noinline void __init check_prealloc(struct maple_tree *mt) for (i = 0; i <= max; i++) mtree_test_store_range(mt, i * 10, i * 10 + 5, &i); + /* Spanning store */ + mas_set_range(&mas, 470, 500); MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); @@ -35481,7 +35483,6 @@ static noinline void __init check_prealloc(struct maple_tree *mt) MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); - MT_BUG_ON(mt, allocated == 0); MT_BUG_ON(mt, allocated != 1 + height * 3); mn = mas_pop_node(&mas); MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); @@ -35495,7 +35496,6 @@ static noinline void __init check_prealloc(struct maple_tree *mt) MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); - MT_BUG_ON(mt, allocated == 0); MT_BUG_ON(mt, allocated != 1 + height * 3); mn = mas_pop_node(&mas); MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); @@ -35509,7 +35509,6 @@ static noinline void __init check_prealloc(struct maple_tree *mt) MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); 
allocated = mas_allocated(&mas); height = mas_mt_height(&mas); - MT_BUG_ON(mt, allocated == 0); MT_BUG_ON(mt, allocated != 1 + height * 3); mn = mas_pop_node(&mas); MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1); @@ -35523,33 +35522,37 @@ static noinline void __init check_prealloc(struct maple_tree *mt) MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); - MT_BUG_ON(mt, allocated == 0); MT_BUG_ON(mt, allocated != 1 + height * 3); mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); + /* Slot store does not need allocations */ + mas_set_range(&mas, 6, 9); MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); - height = mas_mt_height(&mas); - MT_BUG_ON(mt, allocated == 0); - MT_BUG_ON(mt, allocated != 1 + height * 3); + MT_BUG_ON(mt, allocated != 0); mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); + + mas_set_range(&mas, 6, 10); MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); - MT_BUG_ON(mt, allocated == 0); - MT_BUG_ON(mt, allocated != 1 + height * 3); + MT_BUG_ON(mt, allocated != 1); mas_store_prealloc(&mas, ptr); + MT_BUG_ON(mt, mas_allocated(&mas) != 0); + /* Split */ + mas_set_range(&mas, 54, 54); MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); - MT_BUG_ON(mt, allocated == 0); - MT_BUG_ON(mt, allocated != 1 + height * 3); + MT_BUG_ON(mt, allocated != 1 + height * 2); mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); mt_set_non_kernel(1); + /* Spanning store */ + mas_set_range(&mas, 1, 100); MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL & GFP_NOWAIT) == 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); @@ -35557,6 +35560,7 @@ static noinline void __init check_prealloc(struct maple_tree *mt) mas_destroy(&mas); + /* 
Spanning store */ MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL) != 0); allocated = mas_allocated(&mas); height = mas_mt_height(&mas); @@ -35564,6 +35568,7 @@ static noinline void __init check_prealloc(struct maple_tree *mt) MT_BUG_ON(mt, allocated != 1 + height * 3); mas_store_prealloc(&mas, ptr); MT_BUG_ON(mt, mas_allocated(&mas) != 0); + mas_set_range(&mas, 0, 200); mt_set_non_kernel(1); MT_BUG_ON(mt, mas_preallocate(&mas, ptr, GFP_KERNEL & GFP_NOWAIT) == 0); allocated = mas_allocated(&mas); From 17983dc617837a588a52848ab4034d8efa6c1fa6 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Mon, 24 Jul 2023 14:31:55 -0400 Subject: [PATCH 215/489] maple_tree: refine mas_preallocate() node calculations Calculate the number of nodes based on the pending write action instead of assuming the worst case. This addresses a performance regression introduced in platforms that have longer allocation timing. Link: https://lkml.kernel.org/r/20230724183157.3939892-14-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 44 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index db61cdd8a649ff..4a111785360fef 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5535,9 +5535,51 @@ EXPORT_SYMBOL_GPL(mas_store_prealloc); */ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) { + MA_WR_STATE(wr_mas, mas, entry); + unsigned char node_size; + int request = 1; int ret; - mas_node_count_gfp(mas, 1 + mas_mt_height(mas) * 3, gfp); + + if (unlikely(!mas->index && mas->last == ULONG_MAX)) + goto ask_now; + + mas_wr_store_setup(&wr_mas); + wr_mas.content = mas_start(mas); + /* Root expand */ + if (unlikely(mas_is_none(mas) || mas_is_ptr(mas))) + goto ask_now; + + if (unlikely(!mas_wr_walk(&wr_mas))) { + /* Spanning store, use worst case for now */ + request = 1 + mas_mt_height(mas) * 3; + goto ask_now; + } + + /* At this point, we are at the leaf node that needs to be altered. */ + /* Exact fit, no nodes needed. */ + if (wr_mas.r_min == mas->index && wr_mas.r_max == mas->last) + return 0; + + mas_wr_end_piv(&wr_mas); + node_size = mas_wr_new_end(&wr_mas); + if (node_size >= mt_slots[wr_mas.type]) { + /* Split, worst case for now. */ + request = 1 + mas_mt_height(mas) * 2; + goto ask_now; + } + + /* New root needs a singe node */ + if (unlikely(mte_is_root(mas->node))) + goto ask_now; + + /* Potential spanning rebalance collapsing a node, use worst-case */ + if (node_size - 1 <= mt_min_slots[wr_mas.type]) + request = mas_mt_height(mas) * 2 - 1; + + /* node store, slot store needs one node */ +ask_now: + mas_node_count_gfp(mas, request, gfp); mas->mas_flags |= MA_STATE_PREALLOC; if (likely(!mas_is_err(mas))) return 0; From fec29364348fec535c55708b1f4025b321aba572 Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Mon, 24 Jul 2023 14:31:56 -0400 Subject: [PATCH 216/489] maple_tree: reduce resets during store setup mas_preallocate() may walk partially down the tree before finding that a split or spanning store is needed. When the write occurs, relax the logic on resetting the walk so that partial walks will not restart, but walks that have gone too far (a store that affects beyond the current node) should be restarted. Link: https://lkml.kernel.org/r/20230724183157.3939892-15-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 4a111785360fef..a3d602cfd03029 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5424,19 +5424,34 @@ static inline void mte_destroy_walk(struct maple_enode *enode, static void mas_wr_store_setup(struct ma_wr_state *wr_mas) { + if (mas_is_start(wr_mas->mas)) + return; + if (unlikely(mas_is_paused(wr_mas->mas))) - mas_reset(wr_mas->mas); + goto reset; - if (!mas_is_start(wr_mas->mas)) { - if (mas_is_none(wr_mas->mas)) { - mas_reset(wr_mas->mas); - } else { - wr_mas->r_max = wr_mas->mas->max; - wr_mas->type = mte_node_type(wr_mas->mas->node); - if (mas_is_span_wr(wr_mas)) - mas_reset(wr_mas->mas); - } - } + if (unlikely(mas_is_none(wr_mas->mas))) + goto reset; + + /* + * A less strict version of mas_is_span_wr() where we allow spanning + * writes within this node. This is to stop partial walks in + * mas_prealloc() from being reset. + */ + if (wr_mas->mas->last > wr_mas->mas->max) + goto reset; + + if (wr_mas->entry) + return; + + if (mte_is_leaf(wr_mas->mas->node) && + wr_mas->mas->last == wr_mas->mas->max) + goto reset; + + return; + +reset: + mas_reset(wr_mas->mas); } /* Interface */ From 6935e052557caaa8e1ee0a7d85faeb55853d2e0e Mon Sep 17 00:00:00 2001 From: "Liam R.
Howlett" Date: Mon, 24 Jul 2023 14:31:57 -0400 Subject: [PATCH 217/489] mm/mmap: change vma iteration order in do_vmi_align_munmap() By delaying the setting of prev/next VMA until after the write of NULL, the probability of the prev/next VMA already being in the CPU cache is significantly increased, especially for larger munmap operations. It also means that prev/next will be loaded closer to when they are used. This requires changing the loop type when gathering the VMAs that will be freed. Since prev will be set later in the function, it is better to reverse the splitting direction of the start VMA (modify the new_below argument to __split_vma). Using vma_iter_prev_range() to walk back to the correct location in the tree will, for the most part, mean walking within the CPU cache. Usually, this is two steps vs a node reset and a tree re-walk. Link: https://lkml.kernel.org/r/20230724183157.3939892-16-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Peng Zhang Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/mmap.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 3f10e708ba7245..bc91d91261ab79 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2452,20 +2452,17 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) goto map_count_exceeded; - error = __split_vma(vmi, vma, start, 0); + error = __split_vma(vmi, vma, start, 1); if (error) goto start_split_failed; - - vma = vma_iter_load(vmi); } - prev = vma_prev(vmi); - /* * Detach a range of VMAs from the mm. Using next as a temp variable as * it is always overwritten. */ - for_each_vma_range(*vmi, next, end) { + next = vma; + do { /* Does it split the end? 
*/ if (next->vm_end > end) { error = __split_vma(vmi, next, end, 0); @@ -2501,13 +2498,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, BUG_ON(next->vm_start < start); BUG_ON(next->vm_start > end); #endif - } - - if (vma_iter_end(vmi) > end) - next = vma_iter_load(vmi); - - if (!next) - next = vma_next(vmi); + } for_each_vma_range(*vmi, next, end); #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) /* Make sure no VMAs are about to be lost. */ @@ -2528,7 +2519,10 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, BUG_ON(count != test_count); } #endif - vma_iter_set(vmi, start); + + while (vma_iter_addr(vmi) > start) + vma_iter_prev_range(vmi); + error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL); if (error) goto clear_tree_failed; @@ -2539,6 +2533,11 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, if (unlock) mmap_write_downgrade(mm); + prev = vma_iter_prev_range(vmi); + next = vma_next(vmi); + if (next) + vma_iter_prev_range(vmi); + /* * We can free page tables without write-locking mmap_lock because VMAs * were isolated before we downgraded mmap_lock. From 284e05920498788c5df1a7dd6424adb426498e1c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 24 Jul 2023 19:54:01 +0100 Subject: [PATCH 218/489] mm: remove CONFIG_PER_VMA_LOCK ifdefs Patch series "Handle most file-backed faults under the VMA lock", v3. This patchset adds the ability to handle page faults on parts of files which are already in the page cache without taking the mmap lock. This patch (of 10): Provide lock_vma_under_rcu() when CONFIG_PER_VMA_LOCK is not defined to eliminate ifdefs in the users. 
Link: https://lkml.kernel.org/r/20230724185410.1124082-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230724185410.1124082-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Cc: Punit Agrawal Cc: Arjun Roy Cc: Eric Dumazet Signed-off-by: Andrew Morton --- arch/arm64/mm/fault.c | 2 -- arch/powerpc/mm/fault.c | 4 ---- arch/riscv/mm/fault.c | 4 ---- arch/s390/mm/fault.c | 2 -- arch/x86/mm/fault.c | 4 ---- include/linux/mm.h | 6 ++++++ 6 files changed, 6 insertions(+), 16 deletions(-) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 3fe516b325772c..103fcbdc65526f 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -587,7 +587,6 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); -#ifdef CONFIG_PER_VMA_LOCK if (!(mm_flags & FAULT_FLAG_USER)) goto lock_mmap; @@ -615,7 +614,6 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, return 0; } lock_mmap: -#endif /* CONFIG_PER_VMA_LOCK */ retry: vma = lock_mm_and_find_vma(mm, addr, regs); diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 5bfdf6ecfa9650..fafce6bdeff0fd 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -469,7 +469,6 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, if (is_exec) flags |= FAULT_FLAG_INSTRUCTION; -#ifdef CONFIG_PER_VMA_LOCK if (!(flags & FAULT_FLAG_USER)) goto lock_mmap; @@ -501,7 +500,6 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, return user_mode(regs) ? 0 : SIGBUS; lock_mmap: -#endif /* CONFIG_PER_VMA_LOCK */ /* When running in the kernel we expect faults to occur only to * addresses in user space. 
All other faults represent errors in the @@ -551,9 +549,7 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, mmap_read_unlock(current->mm); -#ifdef CONFIG_PER_VMA_LOCK done: -#endif if (unlikely(fault & VM_FAULT_ERROR)) return mm_fault_error(regs, address, fault); diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 6ea2cce4cc17e1..046732fcb48ca3 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -283,7 +283,6 @@ void handle_page_fault(struct pt_regs *regs) flags |= FAULT_FLAG_WRITE; else if (cause == EXC_INST_PAGE_FAULT) flags |= FAULT_FLAG_INSTRUCTION; -#ifdef CONFIG_PER_VMA_LOCK if (!(flags & FAULT_FLAG_USER)) goto lock_mmap; @@ -311,7 +310,6 @@ void handle_page_fault(struct pt_regs *regs) return; } lock_mmap: -#endif /* CONFIG_PER_VMA_LOCK */ retry: vma = lock_mm_and_find_vma(mm, addr, regs); @@ -368,9 +366,7 @@ void handle_page_fault(struct pt_regs *regs) mmap_read_unlock(mm); -#ifdef CONFIG_PER_VMA_LOCK done: -#endif if (unlikely(fault & VM_FAULT_ERROR)) { tsk->thread.bad_cause = cause; mm_fault_error(regs, addr, fault); diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 2f123429a291b7..6f6b9881e55e6c 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -407,7 +407,6 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) access = VM_WRITE; if (access == VM_WRITE) flags |= FAULT_FLAG_WRITE; -#ifdef CONFIG_PER_VMA_LOCK if (!(flags & FAULT_FLAG_USER)) goto lock_mmap; vma = lock_vma_under_rcu(mm, address); @@ -432,7 +431,6 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) goto out; } lock_mmap: -#endif /* CONFIG_PER_VMA_LOCK */ mmap_read_lock(mm); gmap = NULL; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index e8711b2cafaf70..787da09d24f3fb 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1328,7 +1328,6 @@ void do_user_addr_fault(struct pt_regs *regs, } #endif -#ifdef CONFIG_PER_VMA_LOCK if (!(flags & FAULT_FLAG_USER)) 
goto lock_mmap; @@ -1358,7 +1357,6 @@ void do_user_addr_fault(struct pt_regs *regs, return; } lock_mmap: -#endif /* CONFIG_PER_VMA_LOCK */ retry: vma = lock_mm_and_find_vma(mm, address, regs); @@ -1418,9 +1416,7 @@ void do_user_addr_fault(struct pt_regs *regs, } mmap_read_unlock(mm); -#ifdef CONFIG_PER_VMA_LOCK done: -#endif if (likely(!(fault & VM_FAULT_ERROR))) return; diff --git a/include/linux/mm.h b/include/linux/mm.h index ded514ee2588dc..21299a0cfbca8a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -742,6 +742,12 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma) {} static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) {} +static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, + unsigned long address) +{ + return NULL; +} + #endif /* CONFIG_PER_VMA_LOCK */ /* From 350f6bbca1de515cd7519a33661cefc93ea06054 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 24 Jul 2023 19:54:02 +0100 Subject: [PATCH 219/489] mm: allow per-VMA locks on file-backed VMAs Remove the TCP layering violation by allowing per-VMA locks on all VMAs. The fault path will immediately fail in handle_mm_fault(). There may be a small performance reduction from this patch as a little unnecessary work will be done on each page fault. See later patches for the improvement. 
Link: https://lkml.kernel.org/r/20230724185410.1124082-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Cc: Arjun Roy Cc: Eric Dumazet Cc: Punit Agrawal Signed-off-by: Andrew Morton --- MAINTAINERS | 1 - include/linux/net_mm.h | 17 ----------------- include/net/tcp.h | 1 - mm/memory.c | 10 +++++----- net/ipv4/tcp.c | 11 ++++------- 5 files changed, 9 insertions(+), 31 deletions(-) delete mode 100644 include/linux/net_mm.h diff --git a/MAINTAINERS b/MAINTAINERS index 53b7ca8044659f..9e4cfcd7998a01 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14829,7 +14829,6 @@ NETWORKING [TCP] M: Eric Dumazet L: netdev@vger.kernel.org S: Maintained -F: include/linux/net_mm.h F: include/linux/tcp.h F: include/net/tcp.h F: include/trace/events/tcp.h diff --git a/include/linux/net_mm.h b/include/linux/net_mm.h deleted file mode 100644 index b298998bd5a071..00000000000000 --- a/include/linux/net_mm.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifdef CONFIG_MMU - -#ifdef CONFIG_INET -extern const struct vm_operations_struct tcp_vm_ops; -static inline bool vma_is_tcp(const struct vm_area_struct *vma) -{ - return vma->vm_ops == &tcp_vm_ops; -} -#else -static inline bool vma_is_tcp(const struct vm_area_struct *vma) -{ - return false; -} -#endif /* CONFIG_INET*/ - -#endif /* CONFIG_MMU */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 0ca972ebd3dd0f..3a818fe1a8a513 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -45,7 +45,6 @@ #include #include #include -#include extern struct inet_hashinfo tcp_hashinfo; diff --git a/mm/memory.c b/mm/memory.c index ed4807deec893b..b2b17c66f87a6b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -77,7 +77,6 @@ #include #include #include -#include #include @@ -5223,6 +5222,11 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, goto out; } + if ((flags & FAULT_FLAG_VMA_LOCK) && !vma_is_anonymous(vma)) { + vma_end_read(vma); + return 
VM_FAULT_RETRY; + } + /* * Enable the memcg OOM handling for faults triggered in user * space. Kernel faults are handled more gracefully. @@ -5394,10 +5398,6 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, if (!vma) goto inval; - /* Only anonymous and tcp vmas are supported for now */ - if (!vma_is_anonymous(vma) && !vma_is_tcp(vma)) - goto inval; - if (!vma_start_read(vma)) goto inval; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8ed52e1e3c99a3..b9d49803e77f03 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1739,7 +1739,7 @@ void tcp_update_recv_tstamps(struct sk_buff *skb, } #ifdef CONFIG_MMU -const struct vm_operations_struct tcp_vm_ops = { +static const struct vm_operations_struct tcp_vm_ops = { }; int tcp_mmap(struct file *file, struct socket *sock, @@ -2042,13 +2042,10 @@ static struct vm_area_struct *find_tcp_vma(struct mm_struct *mm, unsigned long address, bool *mmap_locked) { - struct vm_area_struct *vma = NULL; + struct vm_area_struct *vma = lock_vma_under_rcu(mm, address); -#ifdef CONFIG_PER_VMA_LOCK - vma = lock_vma_under_rcu(mm, address); -#endif if (vma) { - if (!vma_is_tcp(vma)) { + if (vma->vm_ops != &tcp_vm_ops) { vma_end_read(vma); return NULL; } @@ -2058,7 +2055,7 @@ static struct vm_area_struct *find_tcp_vma(struct mm_struct *mm, mmap_read_lock(mm); vma = vma_lookup(mm, address); - if (!vma || !vma_is_tcp(vma)) { + if (!vma || vma->vm_ops != &tcp_vm_ops) { mmap_read_unlock(mm); return NULL; } From 4ec31152a80d83d74d231d964703a721236244ef Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 24 Jul 2023 19:54:03 +0100 Subject: [PATCH 220/489] mm: move FAULT_FLAG_VMA_LOCK check from handle_mm_fault() Handle a little more of the page fault path outside the mmap sem. The hugetlb path doesn't need to check whether the VMA is anonymous; the VM_HUGETLB flag is only set on hugetlbfs VMAs. There should be no performance change from the previous commit; this is simply a step to ease bisection of any problems. 
Link: https://lkml.kernel.org/r/20230724185410.1124082-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Cc: Arjun Roy Cc: Eric Dumazet Cc: Punit Agrawal Signed-off-by: Andrew Morton --- mm/hugetlb.c | 6 ++++++ mm/memory.c | 18 +++++++++--------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5ef7bccda50ccf..26e87d6cc92f93 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6062,6 +6062,12 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, int need_wait_lock = 0; unsigned long haddr = address & huge_page_mask(h); + /* TODO: Handle faults under the VMA lock */ + if (flags & FAULT_FLAG_VMA_LOCK) { + vma_end_read(vma); + return VM_FAULT_RETRY; + } + /* * Serialize hugepage allocation and instantiation, so that we don't * get spurious allocation failures if two CPUs race to instantiate diff --git a/mm/memory.c b/mm/memory.c index b2b17c66f87a6b..2a5f4883d9a59a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4984,10 +4984,10 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) } /* - * By the time we get here, we already hold the mm semaphore - * - * The mmap_lock may have been released depending on flags and our - * return value. See filemap_fault() and __folio_lock_or_retry(). + * On entry, we hold either the VMA lock or the mmap_lock + * (FAULT_FLAG_VMA_LOCK tells you which). If VM_FAULT_RETRY is set in + * the result, the mmap_lock is not held on exit. See filemap_fault() + * and __folio_lock_or_retry(). 
*/ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags) @@ -5006,6 +5006,11 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, p4d_t *p4d; vm_fault_t ret; + if ((flags & FAULT_FLAG_VMA_LOCK) && !vma_is_anonymous(vma)) { + vma_end_read(vma); + return VM_FAULT_RETRY; + } + pgd = pgd_offset(mm, address); p4d = p4d_alloc(mm, pgd, address); if (!p4d) @@ -5222,11 +5227,6 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, goto out; } - if ((flags & FAULT_FLAG_VMA_LOCK) && !vma_is_anonymous(vma)) { - vma_end_read(vma); - return VM_FAULT_RETRY; - } - /* * Enable the memcg OOM handling for faults triggered in user * space. Kernel faults are handled more gracefully. From c4fd825e188471d4d2796e02729dd029b3b23210 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 24 Jul 2023 19:54:04 +0100 Subject: [PATCH 221/489] mm: handle PUD faults under the VMA lock Postpone checking the VMA_LOCK flag until we've attempted to handle faults on PUDs. There's a mild upside to this patch in that we'll allocate the page tables while under the VMA lock rather than the mmap lock, reducing the hold time on the mmap lock, since the retry will find the page tables already populated. The real purpose here is to make a commit that shows we don't call ->huge_fault under the VMA lock. We do now handle setting the accessed bit on a PUD fault under the VMA lock, but that doesn't seem likely to be a measurable difference. 
Link: https://lkml.kernel.org/r/20230724185410.1124082-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Arjun Roy Cc: Eric Dumazet Cc: Punit Agrawal Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/memory.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 2a5f4883d9a59a..29353d552a3f2b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4859,11 +4859,17 @@ static vm_fault_t create_huge_pud(struct vm_fault *vmf) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) + struct vm_area_struct *vma = vmf->vma; /* No support for anonymous transparent PUD pages yet */ - if (vma_is_anonymous(vmf->vma)) + if (vma_is_anonymous(vma)) return VM_FAULT_FALLBACK; - if (vmf->vma->vm_ops->huge_fault) - return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); + if (vma->vm_ops->huge_fault) { + if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + vma_end_read(vma); + return VM_FAULT_RETRY; + } + return vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); + } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ return VM_FAULT_FALLBACK; } @@ -4872,21 +4878,26 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) + struct vm_area_struct *vma = vmf->vma; vm_fault_t ret; /* No support for anonymous transparent PUD pages yet */ - if (vma_is_anonymous(vmf->vma)) + if (vma_is_anonymous(vma)) goto split; - if (vmf->vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { - if (vmf->vma->vm_ops->huge_fault) { - ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); + if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { + if (vma->vm_ops->huge_fault) { + if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + vma_end_read(vma); + return VM_FAULT_RETRY; + } + ret = vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); if (!(ret & VM_FAULT_FALLBACK)) return ret; } } split: /* COW or write-notify not 
handled on PUD level: split pud.*/ - __split_huge_pud(vmf->vma, vmf->pud, vmf->address); + __split_huge_pud(vma, vmf->pud, vmf->address); #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ return VM_FAULT_FALLBACK; } @@ -5006,11 +5017,6 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, p4d_t *p4d; vm_fault_t ret; - if ((flags & FAULT_FLAG_VMA_LOCK) && !vma_is_anonymous(vma)) { - vma_end_read(vma); - return VM_FAULT_RETRY; - } - pgd = pgd_offset(mm, address); p4d = p4d_alloc(mm, pgd, address); if (!p4d) @@ -5054,6 +5060,11 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, if (pud_trans_unstable(vmf.pud)) goto retry_pud; + if ((flags & FAULT_FLAG_VMA_LOCK) && !vma_is_anonymous(vma)) { + vma_end_read(vma); + return VM_FAULT_RETRY; + } + if (pmd_none(*vmf.pmd) && hugepage_vma_check(vma, vm_flags, false, true, true)) { ret = create_huge_pmd(&vmf); From 8f5fd0e1a02020062c52063f15d4e5c426ee3547 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 24 Jul 2023 19:54:05 +0100 Subject: [PATCH 222/489] mm: handle some PMD faults under the VMA lock Push the VMA_LOCK check down from __handle_mm_fault() to handle_pte_fault(). Once again, we refuse to call ->huge_fault() with the VMA lock held, but we will wait for a PMD migration entry with the VMA lock held, handle NUMA migration and set the accessed bit. We were already doing this for anonymous VMAs, so it should be safe. 
Link: https://lkml.kernel.org/r/20230724185410.1124082-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Arjun Roy Cc: Eric Dumazet Cc: Punit Agrawal Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/memory.c | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 29353d552a3f2b..932fc628653687 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4821,36 +4821,47 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) { - if (vma_is_anonymous(vmf->vma)) + struct vm_area_struct *vma = vmf->vma; + if (vma_is_anonymous(vma)) return do_huge_pmd_anonymous_page(vmf); - if (vmf->vma->vm_ops->huge_fault) - return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); + if (vma->vm_ops->huge_fault) { + if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + vma_end_read(vma); + return VM_FAULT_RETRY; + } + return vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); + } return VM_FAULT_FALLBACK; } /* `inline' is required to avoid gcc 4.1.2 build error */ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) { + struct vm_area_struct *vma = vmf->vma; const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; vm_fault_t ret; - if (vma_is_anonymous(vmf->vma)) { + if (vma_is_anonymous(vma)) { if (likely(!unshare) && - userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd)) + userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) return handle_userfault(vmf, VM_UFFD_WP); return do_huge_pmd_wp_page(vmf); } - if (vmf->vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { - if (vmf->vma->vm_ops->huge_fault) { - ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); + if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { + if (vma->vm_ops->huge_fault) { + if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + vma_end_read(vma); + return VM_FAULT_RETRY; + } + ret = vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); if (!(ret & VM_FAULT_FALLBACK)) return ret; } } /* COW or write-notify handled on pte 
level: split pmd. */ - __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL); + __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); return VM_FAULT_FALLBACK; } @@ -4921,6 +4932,11 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) { pte_t entry; + if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !vma_is_anonymous(vmf->vma)) { + vma_end_read(vmf->vma); + return VM_FAULT_RETRY; + } + if (unlikely(pmd_none(*vmf->pmd))) { /* * Leave __pte_alloc() until later: because vm_ops->fault may @@ -5060,11 +5076,6 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, if (pud_trans_unstable(vmf.pud)) goto retry_pud; - if ((flags & FAULT_FLAG_VMA_LOCK) && !vma_is_anonymous(vma)) { - vma_end_read(vma); - return VM_FAULT_RETRY; - } - if (pmd_none(*vmf.pmd) && hugepage_vma_check(vma, vm_flags, false, true, true)) { ret = create_huge_pmd(&vmf); From 0c2e394ab23017303f676e6206a54c54bb0e3681 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 24 Jul 2023 19:54:06 +0100 Subject: [PATCH 223/489] mm: move FAULT_FLAG_VMA_LOCK check down in handle_pte_fault() Call do_pte_missing() under the VMA lock ... then immediately retry in do_fault(). 
Link: https://lkml.kernel.org/r/20230724185410.1124082-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Cc: Arjun Roy Cc: Eric Dumazet Cc: Punit Agrawal Signed-off-by: Andrew Morton --- mm/memory.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 932fc628653687..d947d8d9e89113 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4647,6 +4647,11 @@ static vm_fault_t do_fault(struct vm_fault *vmf) struct mm_struct *vm_mm = vma->vm_mm; vm_fault_t ret; + if (vmf->flags & FAULT_FLAG_VMA_LOCK){ + vma_end_read(vma); + return VM_FAULT_RETRY; + } + /* * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ @@ -4932,11 +4937,6 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) { pte_t entry; - if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !vma_is_anonymous(vmf->vma)) { - vma_end_read(vmf->vma); - return VM_FAULT_RETRY; - } - if (unlikely(pmd_none(*vmf->pmd))) { /* * Leave __pte_alloc() until later: because vm_ops->fault may @@ -4969,6 +4969,12 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) if (!vmf->pte) return do_pte_missing(vmf); + if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !vma_is_anonymous(vmf->vma)) { + pte_unmap(vmf->pte); + vma_end_read(vmf->vma); + return VM_FAULT_RETRY; + } + if (!pte_present(vmf->orig_pte)) return do_swap_page(vmf); From 61a4b8d32025dcabcd78994f887a4b9dff912cf0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 24 Jul 2023 19:54:07 +0100 Subject: [PATCH 224/489] mm: move FAULT_FLAG_VMA_LOCK check down from do_fault() Perform the check at the start of do_read_fault(), do_cow_fault() and do_shared_fault() instead. Should be no performance change from the last commit. 
Link: https://lkml.kernel.org/r/20230724185410.1124082-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Cc: Arjun Roy Cc: Eric Dumazet Cc: Punit Agrawal Signed-off-by: Andrew Morton --- mm/memory.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index d947d8d9e89113..23a20b7a483cdb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4533,6 +4533,11 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf) vm_fault_t ret = 0; struct folio *folio; + if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + vma_end_read(vmf->vma); + return VM_FAULT_RETRY; + } + /* * Let's call ->map_pages() first and use ->fault() as fallback * if page by the offset is not ready to be mapped (cold cache or @@ -4561,6 +4566,11 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; vm_fault_t ret; + if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + vma_end_read(vma); + return VM_FAULT_RETRY; + } + if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; @@ -4601,6 +4611,11 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) vm_fault_t ret, tmp; struct folio *folio; + if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + vma_end_read(vma); + return VM_FAULT_RETRY; + } + ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; @@ -4647,11 +4662,6 @@ static vm_fault_t do_fault(struct vm_fault *vmf) struct mm_struct *vm_mm = vma->vm_mm; vm_fault_t ret; - if (vmf->flags & FAULT_FLAG_VMA_LOCK){ - vma_end_read(vma); - return VM_FAULT_RETRY; - } - /* * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ From f5617ffeb450f84c57f7eba1a3524a29955d42b7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 24 Jul 2023 19:54:08 +0100 Subject: [PATCH 225/489] mm: run the fault-around code under the VMA lock The map_pages fs method should be safe to run under the VMA lock instead of the mmap lock. 
This should have a measurable reduction in contention on the mmap lock. Link: https://lkml.kernel.org/r/20230724185410.1124082-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Suren Baghdasaryan Cc: Arjun Roy Cc: Eric Dumazet Cc: Punit Agrawal Signed-off-by: Andrew Morton --- mm/memory.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 23a20b7a483cdb..52235aa3d665a9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4533,11 +4533,6 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf) vm_fault_t ret = 0; struct folio *folio; - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vmf->vma); - return VM_FAULT_RETRY; - } - /* * Let's call ->map_pages() first and use ->fault() as fallback * if page by the offset is not ready to be mapped (cold cache or @@ -4549,6 +4544,11 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf) return ret; } + if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + vma_end_read(vmf->vma); + return VM_FAULT_RETRY; + } + ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; From 4c2f803abb1797e571579adcaf134a727b3ffc48 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 24 Jul 2023 19:54:09 +0100 Subject: [PATCH 226/489] mm: handle swap and NUMA PTE faults under the VMA lock Move the FAULT_FLAG_VMA_LOCK check down in handle_pte_fault(). This is probably not a huge win in its own right, but is a nicely separable bit from the next patch. 
Link: https://lkml.kernel.org/r/20230724185410.1124082-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Arjun Roy Cc: Eric Dumazet Cc: Punit Agrawal Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/memory.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 52235aa3d665a9..c122adce47b499 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4979,18 +4979,18 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) if (!vmf->pte) return do_pte_missing(vmf); - if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !vma_is_anonymous(vmf->vma)) { - pte_unmap(vmf->pte); - vma_end_read(vmf->vma); - return VM_FAULT_RETRY; - } - if (!pte_present(vmf->orig_pte)) return do_swap_page(vmf); if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) return do_numa_page(vmf); + if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !vma_is_anonymous(vmf->vma)) { + pte_unmap(vmf->pte); + vma_end_read(vmf->vma); + return VM_FAULT_RETRY; + } + spin_lock(vmf->ptl); entry = vmf->orig_pte; if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) { From 063e60d806151f3733acabccb62a463d55fac469 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 24 Jul 2023 19:54:10 +0100 Subject: [PATCH 227/489] mm: handle faults that merely update the accessed bit under the VMA lock Move the FAULT_FLAG_VMA_LOCK check out of handle_pte_fault(). This should yield a significant performance improvement for mmapped files. Write faults (on read-only shared pages) still take the mmap lock, as we do not want to audit all the implementations of ->pfn_mkwrite() and ->page_mkwrite(). However, write faults on private mappings are handled under the VMA lock.
[willy@infradead.org: address "suspicious RCU usage" warning] Link: https://lkml.kernel.org/r/ZMK7jwpI4uD6tKrF@casper.infradead.org Link: https://lkml.kernel.org/r/20230724185410.1124082-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Arjun Roy Cc: Eric Dumazet Cc: Punit Agrawal Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/memory.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index c122adce47b499..f06266464208df 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3268,6 +3268,11 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf) vm_fault_t ret; pte_unmap_unlock(vmf->pte, vmf->ptl); + if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + vma_end_read(vmf->vma); + return VM_FAULT_RETRY; + } + vmf->flags |= FAULT_FLAG_MKWRITE; ret = vma->vm_ops->pfn_mkwrite(vmf); if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)) @@ -3290,6 +3295,12 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio) vm_fault_t tmp; pte_unmap_unlock(vmf->pte, vmf->ptl); + if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + folio_put(folio); + vma_end_read(vmf->vma); + return VM_FAULT_RETRY; + } + tmp = do_page_mkwrite(vmf, folio); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { @@ -3431,6 +3442,12 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) return 0; } copy: + if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !vma->anon_vma) { + pte_unmap_unlock(vmf->pte, vmf->ptl); + vma_end_read(vmf->vma); + return VM_FAULT_RETRY; + } + /* * Ok, we need to copy. Oh, well.. 
*/ @@ -4985,12 +5002,6 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) return do_numa_page(vmf); - if ((vmf->flags & FAULT_FLAG_VMA_LOCK) && !vma_is_anonymous(vmf->vma)) { - pte_unmap(vmf->pte); - vma_end_read(vmf->vma); - return VM_FAULT_RETRY; - } - spin_lock(vmf->ptl); entry = vmf->orig_pte; if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) { From 348ad1606f4c09e3dc28092baac474e10a252471 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:47 +0530 Subject: [PATCH 228/489] mm/hugepage pud: allow arch-specific helper function to check huge page pud support Patch series "Add support for DAX vmemmap optimization for ppc64", v6. This patch series implements changes required to support DAX vmemmap optimization for ppc64. The vmemmap optimization is only enabled with radix MMU translation and 1GB PUD mapping with 64K page size. The patch series also splits the hugetlb vmemmap optimization as a separate Kconfig variable so that architectures can enable DAX vmemmap optimization without enabling hugetlb vmemmap optimization. This should enable architectures like arm64 to enable DAX vmemmap optimization while they can't enable hugetlb vmemmap optimization. More details of the same are in patch "mm/vmemmap optimization: Split hugetlb and devdax vmemmap optimization". With 64K page size for 16384 pages added (1G) we save 14 pages With 4K page size for 262144 pages added (1G) we save 4094 pages With 4K page size for 512 pages added (2M) we save 6 pages This patch (of 13): Architectures like powerpc would like to enable transparent huge page pud support only with radix translation. To support that add has_transparent_pud_hugepage() helper that architectures can override. 
[aneesh.kumar@linux.ibm.com: use the new has_transparent_pud_hugepage()] Link: https://lkml.kernel.org/r/87tttrvtaj.fsf@linux.ibm.com Link: https://lkml.kernel.org/r/20230724190759.483013-1-aneesh.kumar@linux.ibm.com Link: https://lkml.kernel.org/r/20230724190759.483013-2-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: Christophe Leroy Cc: Catalin Marinas Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- drivers/nvdimm/pfn_devs.c | 2 +- include/linux/pgtable.h | 3 +++ mm/debug_vm_pgtable.c | 16 +++++++--------- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index af7d9301520c5b..18ad315581ca17 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -100,7 +100,7 @@ static unsigned long *nd_pfn_supported_alignments(unsigned long *alignments) if (has_transparent_hugepage()) { alignments[1] = HPAGE_PMD_SIZE; - if (IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)) + if (has_transparent_pud_hugepage()) alignments[2] = HPAGE_PUD_SIZE; } diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 5f36c055794bed..5eb6bdf30c62d9 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1505,6 +1505,9 @@ typedef unsigned int pgtbl_mod_mask; #define has_transparent_hugepage() IS_BUILTIN(CONFIG_TRANSPARENT_HUGEPAGE) #endif +#ifndef has_transparent_pud_hugepage +#define has_transparent_pud_hugepage() IS_BUILTIN(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) +#endif /* * On some architectures it depends on the mm if the p4d/pud or pmd * layer of the page table hierarchy is folded or not. 
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index ee119e33fef133..844fdfd687b964 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -302,7 +302,7 @@ static void __init pud_basic_tests(struct pgtable_debug_args *args, int idx) unsigned long val = idx, *ptr = &val; pud_t pud; - if (!has_transparent_hugepage()) + if (!has_transparent_pud_hugepage()) return; pr_debug("Validating PUD basic (%pGv)\n", ptr); @@ -343,7 +343,7 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args) unsigned long vaddr = args->vaddr; pud_t pud; - if (!has_transparent_hugepage()) + if (!has_transparent_pud_hugepage()) return; page = (args->pud_pfn != ULONG_MAX) ? pfn_to_page(args->pud_pfn) : NULL; @@ -405,7 +405,7 @@ static void __init pud_leaf_tests(struct pgtable_debug_args *args) { pud_t pud; - if (!has_transparent_hugepage()) + if (!has_transparent_pud_hugepage()) return; pr_debug("Validating PUD leaf\n"); @@ -732,7 +732,7 @@ static void __init pud_devmap_tests(struct pgtable_debug_args *args) { pud_t pud; - if (!has_transparent_hugepage()) + if (!has_transparent_pud_hugepage()) return; pr_debug("Validating PUD devmap\n"); @@ -981,7 +981,7 @@ static void __init pud_thp_tests(struct pgtable_debug_args *args) { pud_t pud; - if (!has_transparent_hugepage()) + if (!has_transparent_pud_hugepage()) return; pr_debug("Validating PUD based THP\n"); @@ -1022,8 +1022,7 @@ static void __init destroy_args(struct pgtable_debug_args *args) /* Free (huge) page */ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && - IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) && - has_transparent_hugepage() && + has_transparent_pud_hugepage() && args->pud_pfn != ULONG_MAX) { if (args->is_contiguous_page) { free_contig_range(args->pud_pfn, @@ -1274,8 +1273,7 @@ static int __init init_args(struct pgtable_debug_args *args) * if we fail to allocate (huge) pages. 
*/ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && - IS_ENABLED(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) && - has_transparent_hugepage()) { + has_transparent_pud_hugepage()) { page = debug_vm_pgtable_alloc_huge_page(args, HPAGE_PUD_SHIFT - PAGE_SHIFT); if (page) { From f32928ab6fe5abac5a270b6c0bffc4ce77ee8c42 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:48 +0530 Subject: [PATCH 229/489] mm: change pudp_huge_get_and_clear_full take vm_area_struct as arg We will use this in a later patch to do tlb flush when clearing pud entries on powerpc. This is similar to commit 93a98695f2f9 ("mm: change pmdp_huge_get_and_clear_full take vm_area_struct as arg") Link: https://lkml.kernel.org/r/20230724190759.483013-3-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: Christophe Leroy Cc: Catalin Marinas Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 4 ++-- mm/debug_vm_pgtable.c | 2 +- mm/huge_memory.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 5eb6bdf30c62d9..124427ece5204a 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -456,11 +456,11 @@ static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, #endif #ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL -static inline pud_t pudp_huge_get_and_clear_full(struct mm_struct *mm, +static inline pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, int full) { - return pudp_huge_get_and_clear(mm, address, pudp); + return pudp_huge_get_and_clear(vma->vm_mm, address, pudp); } #endif #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 844fdfd687b964..d61eaa075c751d 100644 --- a/mm/debug_vm_pgtable.c +++ 
b/mm/debug_vm_pgtable.c @@ -385,7 +385,7 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args) WARN_ON(!(pud_write(pud) && pud_dirty(pud))); #ifndef __PAGETABLE_PMD_FOLDED - pudp_huge_get_and_clear_full(args->mm, vaddr, args->pudp, 1); + pudp_huge_get_and_clear_full(args->vma, vaddr, args->pudp, 1); pud = READ_ONCE(*args->pudp); WARN_ON(!pud_none(pud)); #endif /* __PAGETABLE_PMD_FOLDED */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e0420de0e2e093..e371503f77467f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1981,7 +1981,7 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, if (!ptl) return 0; - pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm); + pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm); tlb_remove_pud_tlb_entry(tlb, pud, addr); if (vma_is_special_huge(vma)) { spin_unlock(ptl); From c1a6c536fb088c01d6bdce77731d89ad5e1734c6 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:49 +0530 Subject: [PATCH 230/489] mm/vmemmap: improve vmemmap_can_optimize and allow architectures to override dax vmemmap optimization requires a minimum of 2 PAGE_SIZE area within vmemmap such that tail page mapping can point to the second PAGE_SIZE area. Enforce that in vmemmap_can_optimize() function. Architectures like powerpc also want to enable vmemmap optimization conditionally (only with radix MMU translation). Hence allow architecture override. 
Link: https://lkml.kernel.org/r/20230724190759.483013-4-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: Christophe Leroy Cc: Catalin Marinas Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 27 +++++++++++++++++++++++---- mm/mm_init.c | 2 +- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 21299a0cfbca8a..d4ce73c20dcc31 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3632,13 +3632,32 @@ void vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap); #endif +#define VMEMMAP_RESERVE_NR 2 #ifdef CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP -static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap, - struct dev_pagemap *pgmap) +static inline bool __vmemmap_can_optimize(struct vmem_altmap *altmap, + struct dev_pagemap *pgmap) { - return is_power_of_2(sizeof(struct page)) && - pgmap && (pgmap_vmemmap_nr(pgmap) > 1) && !altmap; + unsigned long nr_pages; + unsigned long nr_vmemmap_pages; + + if (!pgmap || !is_power_of_2(sizeof(struct page))) + return false; + + nr_pages = pgmap_vmemmap_nr(pgmap); + nr_vmemmap_pages = ((nr_pages * sizeof(struct page)) >> PAGE_SHIFT); + /* + * For vmemmap optimization with DAX we need minimum 2 vmemmap + * pages. 
See layout diagram in Documentation/mm/vmemmap_dedup.rst + */ + return !altmap && (nr_vmemmap_pages > VMEMMAP_RESERVE_NR); } +/* + * If we don't have an architecture override, use the generic rule + */ +#ifndef vmemmap_can_optimize +#define vmemmap_can_optimize __vmemmap_can_optimize +#endif + #else static inline bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap) diff --git a/mm/mm_init.c b/mm/mm_init.c index acb0ac19467255..641c56fd08a286 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1020,7 +1020,7 @@ static inline unsigned long compound_nr_pages(struct vmem_altmap *altmap, if (!vmemmap_can_optimize(altmap, pgmap)) return pgmap_vmemmap_nr(pgmap); - return 2 * (PAGE_SIZE / sizeof(struct page)); + return VMEMMAP_RESERVE_NR * (PAGE_SIZE / sizeof(struct page)); } static void __ref memmap_init_compound(struct page *head, From 40135fc7188cee4ae64777148fd462f4ea523181 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:50 +0530 Subject: [PATCH 231/489] mm/vmemmap: allow architectures to override how vmemmap optimization works Architectures like powerpc will like to use different page table allocators and mapping mechanisms to implement vmemmap optimization. 
Similar to vmemmap_populate, allow architectures to implement vmemmap_populate_compound_pages. Link: https://lkml.kernel.org/r/20230724190759.483013-5-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/sparse-vmemmap.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index a044a130405b29..a2cbe44c48e10f 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -358,6 +358,7 @@ int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end, return 0; } +#ifndef vmemmap_populate_compound_pages /* * For compound pages bigger than section size (e.g. x86 1G compound * pages with 2M subsection size) fill the rest of sections as tail @@ -446,6 +447,8 @@ static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn, return 0; } +#endif + struct page * __meminit __populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap, struct dev_pagemap *pgmap) From 973bf6800cf37802ca48292378d574f2a689de9b Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:51 +0530 Subject: [PATCH 232/489] mm: add pud_same similar to __HAVE_ARCH_P4D_SAME This helps architectures to override pmd_same and pud_same independently.
Link: https://lkml.kernel.org/r/20230724190759.483013-6-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 124427ece5204a..0af8bc4ce258cc 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -699,11 +699,14 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) { return pmd_val(pmd_a) == pmd_val(pmd_b); } +#endif +#ifndef pud_same static inline int pud_same(pud_t pud_a, pud_t pud_b) { return pud_val(pud_a) == pud_val(pud_b); } +#define pud_same pud_same #endif #ifndef __HAVE_ARCH_P4D_SAME From 54a948a1e97a9d19a0a4bed63a2d4caef30c5d17 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:52 +0530 Subject: [PATCH 233/489] mm/huge pud: use transparent huge pud helpers only with CONFIG_TRANSPARENT_HUGEPAGE pudp_set_wrprotect and move_huge_pud helpers are only used when CONFIG_TRANSPARENT_HUGEPAGE is enabled. 
Similar to the pmdp_set_wrprotect and move_huge_pmd helpers, use the architecture override only if CONFIG_TRANSPARENT_HUGEPAGE is set. Link: https://lkml.kernel.org/r/20230724190759.483013-7-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: Christophe Leroy Cc: Catalin Marinas Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 2 ++ mm/mremap.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 0af8bc4ce258cc..f34e0f2cb4d847 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -564,6 +564,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, #endif #ifndef __HAVE_ARCH_PUDP_SET_WRPROTECT #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +#ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void pudp_set_wrprotect(struct mm_struct *mm, unsigned long address, pud_t *pudp) { @@ -577,6 +578,7 @@ static inline void pudp_set_wrprotect(struct mm_struct *mm, { BUILD_BUG(); } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ #endif diff --git a/mm/mremap.c b/mm/mremap.c index 11e06e4ab33be2..056478c106eec6 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -349,7 +349,7 @@ static inline bool move_normal_pud(struct vm_area_struct *vma, } #endif -#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, pud_t *old_pud, pud_t *new_pud) { From 0b6f15824cc7e431a9706c78bfb9cb3011477ad3 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:53 +0530 Subject: [PATCH 234/489] mm/vmemmap optimization: split hugetlb and devdax vmemmap optimization Arm disabled hugetlb vmemmap
optimization [1] because hugetlb vmemmap optimization includes an update of both the permissions (writeable to read-only) and the output address (pfn) of the vmemmap ptes. That is not supported without unmapping of pte(marking it invalid) by some architectures. With DAX vmemmap optimization we don't require such pte updates and architectures can enable DAX vmemmap optimization while having hugetlb vmemmap optimization disabled. Hence split DAX optimization support into a different config. s390, loongarch and riscv don't have devdax support. So the DAX config is not enabled for them. With this change, arm64 should be able to select DAX optimization [1] commit 060a2c92d1b6 ("arm64: mm: hugetlb: Disable HUGETLB_PAGE_OPTIMIZE_VMEMMAP") Link: https://lkml.kernel.org/r/20230724190759.483013-8-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/loongarch/Kconfig | 2 +- arch/riscv/Kconfig | 2 +- arch/s390/Kconfig | 2 +- arch/x86/Kconfig | 3 ++- fs/Kconfig | 2 +- include/linux/mm.h | 2 +- mm/Kconfig | 5 ++++- 7 files changed, 11 insertions(+), 7 deletions(-) diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index e71d5bf2cee0fb..dc56ddf9ba0374 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -60,7 +60,7 @@ config LOONGARCH select ARCH_USE_QUEUED_SPINLOCKS select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT select ARCH_WANT_LD_ORPHAN_WARN - select ARCH_WANT_OPTIMIZE_VMEMMAP + select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP select ARCH_WANTS_NO_INSTR select BUILDTIME_TABLE_SORT select COMMON_CLK diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 4c07b9189c867b..6943d34c1ec1f6 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -53,7 +53,7 @@ config RISCV select ARCH_WANT_GENERAL_HUGETLB if !RISCV_ISA_SVNAPOT select 
ARCH_WANT_HUGE_PMD_SHARE if 64BIT select ARCH_WANT_LD_ORPHAN_WARN if !XIP_KERNEL - select ARCH_WANT_OPTIMIZE_VMEMMAP + select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP select ARCH_WANTS_THP_SWAP if HAVE_ARCH_TRANSPARENT_HUGEPAGE select BINFMT_FLAT_NO_DATA_START_OFFSET if !MMU select BUILDTIME_TABLE_SORT if MMU diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 290b6f93b81628..8ff6d1c21e381c 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -127,7 +127,7 @@ config S390 select ARCH_WANTS_NO_INSTR select ARCH_WANT_DEFAULT_BPF_JIT select ARCH_WANT_IPC_PARSE_VERSION - select ARCH_WANT_OPTIMIZE_VMEMMAP + select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP select BUILDTIME_TABLE_SORT select CLONE_BACKWARDS2 select DMA_OPS if PCI diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7422db4097701c..78224aa7640986 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -128,7 +128,8 @@ config X86 select ARCH_WANT_GENERAL_HUGETLB select ARCH_WANT_HUGE_PMD_SHARE select ARCH_WANT_LD_ORPHAN_WARN - select ARCH_WANT_OPTIMIZE_VMEMMAP if X86_64 + select ARCH_WANT_OPTIMIZE_DAX_VMEMMAP if X86_64 + select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP if X86_64 select ARCH_WANTS_THP_SWAP if X86_64 select ARCH_HAS_PARANOID_L1D_FLUSH select BUILDTIME_TABLE_SORT diff --git a/fs/Kconfig b/fs/Kconfig index 19975b104bc36d..f3be721bab6d31 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -254,7 +254,7 @@ config HUGETLB_PAGE config HUGETLB_PAGE_OPTIMIZE_VMEMMAP def_bool HUGETLB_PAGE - depends on ARCH_WANT_OPTIMIZE_VMEMMAP + depends on ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP depends on SPARSEMEM_VMEMMAP config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON diff --git a/include/linux/mm.h b/include/linux/mm.h index d4ce73c20dcc31..b2520dd555f911 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3633,7 +3633,7 @@ void vmemmap_free(unsigned long start, unsigned long end, #endif #define VMEMMAP_RESERVE_NR 2 -#ifdef CONFIG_ARCH_WANT_OPTIMIZE_VMEMMAP +#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP static inline bool 
__vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap) { diff --git a/mm/Kconfig b/mm/Kconfig index 22acffd9009dfd..1959d048bbf560 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -487,7 +487,10 @@ config SPARSEMEM_VMEMMAP # Select this config option from the architecture Kconfig, if it is preferred # to enable the feature of HugeTLB/dev_dax vmemmap optimization. # -config ARCH_WANT_OPTIMIZE_VMEMMAP +config ARCH_WANT_OPTIMIZE_DAX_VMEMMAP + bool + +config ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP bool config HAVE_MEMBLOCK_PHYS_MAP From 104c49d5b6dc8b38aea0cb7300068b378cf37ac1 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:54 +0530 Subject: [PATCH 235/489] powerpc/mm/trace: convert trace event to trace event class A follow-up patch will add a pud variant for this same event. Using event class makes that addition simpler. No functional change in this patch. Link: https://lkml.kernel.org/r/20230724190759.483013-9-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: Christophe Leroy Cc: Catalin Marinas Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/mm/book3s64/hash_pgtable.c | 2 +- arch/powerpc/mm/book3s64/radix_pgtable.c | 2 +- include/trace/events/thp.h | 23 ++++++++++++++++------- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c index 51f48984abca98..988948d69bc197 100644 --- a/arch/powerpc/mm/book3s64/hash_pgtable.c +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c @@ -214,7 +214,7 @@ unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr old = be64_to_cpu(old_be); - trace_hugepage_update(addr, old, clr, set); + trace_hugepage_update_pmd(addr, old, clr, set); if (old & H_PAGE_HASHPTE) hpte_do_hugepage_flush(mm, addr, pmdp, old); return old; diff 
--git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index e7ea492ac510a8..02e185d2e4d6eb 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -962,7 +962,7 @@ unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long add #endif old = radix__pte_update(mm, addr, pmdp_ptep(pmdp), clr, set, 1); - trace_hugepage_update(addr, old, clr, set); + trace_hugepage_update_pmd(addr, old, clr, set); return old; } diff --git a/include/trace/events/thp.h b/include/trace/events/thp.h index 202b3e3e67ff2a..a95c78b105617f 100644 --- a/include/trace/events/thp.h +++ b/include/trace/events/thp.h @@ -8,25 +8,29 @@ #include #include -TRACE_EVENT(hugepage_set_pmd, +DECLARE_EVENT_CLASS(hugepage_set, - TP_PROTO(unsigned long addr, unsigned long pmd), - TP_ARGS(addr, pmd), + TP_PROTO(unsigned long addr, unsigned long pte), + TP_ARGS(addr, pte), TP_STRUCT__entry( __field(unsigned long, addr) - __field(unsigned long, pmd) + __field(unsigned long, pte) ), TP_fast_assign( __entry->addr = addr; - __entry->pmd = pmd; + __entry->pte = pte; ), - TP_printk("Set pmd with 0x%lx with 0x%lx", __entry->addr, __entry->pmd) + TP_printk("Set page table entry with 0x%lx with 0x%lx", __entry->addr, __entry->pte) ); +DEFINE_EVENT(hugepage_set, hugepage_set_pmd, + TP_PROTO(unsigned long addr, unsigned long pmd), + TP_ARGS(addr, pmd) +); -TRACE_EVENT(hugepage_update, +DECLARE_EVENT_CLASS(hugepage_update, TP_PROTO(unsigned long addr, unsigned long pte, unsigned long clr, unsigned long set), TP_ARGS(addr, pte, clr, set), @@ -48,6 +52,11 @@ TRACE_EVENT(hugepage_update, TP_printk("hugepage update at addr 0x%lx and pte = 0x%lx clr = 0x%lx, set = 0x%lx", __entry->addr, __entry->pte, __entry->clr, __entry->set) ); +DEFINE_EVENT(hugepage_update, hugepage_update_pmd, + TP_PROTO(unsigned long addr, unsigned long pmd, unsigned long clr, unsigned long set), + TP_ARGS(addr, pmd, clr, set) +); + 
DECLARE_EVENT_CLASS(migration_pmd, TP_PROTO(unsigned long addr, unsigned long pmd), From 27af67f35631ac4b61b5e4455b44c9aee8d2cc4b Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:55 +0530 Subject: [PATCH 236/489] powerpc/book3s64/mm: enable transparent pud hugepage This is enabled only with radix translation and 1G hugepage size. This will be used with devdax device memory with a namespace alignment of 1G. Anon transparent hugepage is not supported even though we do have helpers checking pud_trans_huge(). We should never find that return true. The only expected pte bit combination is _PAGE_PTE | _PAGE_DEVMAP. Some of the helpers are never expected to get called on hash translation and hence is marked to call BUG() in such a case. Link: https://lkml.kernel.org/r/20230724190759.483013-10-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/book3s/64/hash.h | 9 + arch/powerpc/include/asm/book3s/64/pgtable.h | 155 ++++++++++++++++-- arch/powerpc/include/asm/book3s/64/radix.h | 36 ++++ .../include/asm/book3s/64/tlbflush-radix.h | 2 + arch/powerpc/include/asm/book3s/64/tlbflush.h | 8 + arch/powerpc/mm/book3s64/pgtable.c | 78 +++++++++ arch/powerpc/mm/book3s64/radix_pgtable.c | 28 ++++ arch/powerpc/mm/book3s64/radix_tlb.c | 7 + arch/powerpc/platforms/Kconfig.cputype | 1 + include/trace/events/thp.h | 10 ++ 10 files changed, 323 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index d4a19e6547acf6..6e70ae51163189 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -138,7 +138,16 @@ static inline int hash__pmd_same(pmd_t pmd_a, pmd_t pmd_b) } #define hash__pmd_bad(pmd) 
(pmd_val(pmd) & H_PMD_BAD_BITS) + +/* + * pud comparison that will work with both pte and page table pointer. + */ +static inline int hash__pud_same(pud_t pud_a, pud_t pud_b) +{ + return (((pud_raw(pud_a) ^ pud_raw(pud_b)) & ~cpu_to_be64(_PAGE_HPTEFLAGS)) == 0); +} #define hash__pud_bad(pud) (pud_val(pud) & H_PUD_BAD_BITS) + static inline int hash__p4d_bad(p4d_t p4d) { return (p4d_val(p4d) == 0); diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 4acc9690f59996..a8204566cfd0f3 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -921,8 +921,29 @@ static inline pud_t pte_pud(pte_t pte) { return __pud_raw(pte_raw(pte)); } + +static inline pte_t *pudp_ptep(pud_t *pud) +{ + return (pte_t *)pud; +} + +#define pud_pfn(pud) pte_pfn(pud_pte(pud)) +#define pud_dirty(pud) pte_dirty(pud_pte(pud)) +#define pud_young(pud) pte_young(pud_pte(pud)) +#define pud_mkold(pud) pte_pud(pte_mkold(pud_pte(pud))) +#define pud_wrprotect(pud) pte_pud(pte_wrprotect(pud_pte(pud))) +#define pud_mkdirty(pud) pte_pud(pte_mkdirty(pud_pte(pud))) +#define pud_mkclean(pud) pte_pud(pte_mkclean(pud_pte(pud))) +#define pud_mkyoung(pud) pte_pud(pte_mkyoung(pud_pte(pud))) +#define pud_mkwrite(pud) pte_pud(pte_mkwrite(pud_pte(pud))) #define pud_write(pud) pte_write(pud_pte(pud)) +#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY +#define pud_soft_dirty(pmd) pte_soft_dirty(pud_pte(pud)) +#define pud_mksoft_dirty(pmd) pte_pud(pte_mksoft_dirty(pud_pte(pud))) +#define pud_clear_soft_dirty(pmd) pte_pud(pte_clear_soft_dirty(pud_pte(pud))) +#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ + static inline int pud_bad(pud_t pud) { if (radix_enabled()) @@ -1115,15 +1136,24 @@ static inline bool pmd_access_permitted(pmd_t pmd, bool write) #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot); +extern pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot); extern pmd_t mk_pmd(struct page 
*page, pgprot_t pgprot); extern pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot); extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd); +extern void set_pud_at(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud); + static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd) { } +static inline void update_mmu_cache_pud(struct vm_area_struct *vma, + unsigned long addr, pud_t *pud) +{ +} + extern int hash__has_transparent_hugepage(void); static inline int has_transparent_hugepage(void) { @@ -1133,6 +1163,14 @@ static inline int has_transparent_hugepage(void) } #define has_transparent_hugepage has_transparent_hugepage +static inline int has_transparent_pud_hugepage(void) +{ + if (radix_enabled()) + return radix__has_transparent_pud_hugepage(); + return 0; +} +#define has_transparent_pud_hugepage has_transparent_pud_hugepage + static inline unsigned long pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, unsigned long clr, unsigned long set) @@ -1142,6 +1180,16 @@ pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, return hash__pmd_hugepage_update(mm, addr, pmdp, clr, set); } +static inline unsigned long +pud_hugepage_update(struct mm_struct *mm, unsigned long addr, pud_t *pudp, + unsigned long clr, unsigned long set) +{ + if (radix_enabled()) + return radix__pud_hugepage_update(mm, addr, pudp, clr, set); + BUG(); + return pud_val(*pudp); +} + /* * returns true for pmd migration entries, THP, devmap, hugetlb * But compile time dependent on THP config @@ -1151,6 +1199,11 @@ static inline int pmd_large(pmd_t pmd) return !!(pmd_raw(pmd) & cpu_to_be64(_PAGE_PTE)); } +static inline int pud_large(pud_t pud) +{ + return !!(pud_raw(pud) & cpu_to_be64(_PAGE_PTE)); +} + /* * For radix we should always find H_PAGE_HASHPTE zero. 
Hence * the below will work for radix too @@ -1166,6 +1219,17 @@ static inline int __pmdp_test_and_clear_young(struct mm_struct *mm, return ((old & _PAGE_ACCESSED) != 0); } +static inline int __pudp_test_and_clear_young(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + unsigned long old; + + if ((pud_raw(*pudp) & cpu_to_be64(_PAGE_ACCESSED | H_PAGE_HASHPTE)) == 0) + return 0; + old = pud_hugepage_update(mm, addr, pudp, _PAGE_ACCESSED, 0); + return ((old & _PAGE_ACCESSED) != 0); +} + #define __HAVE_ARCH_PMDP_SET_WRPROTECT static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) @@ -1174,6 +1238,14 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, pmd_hugepage_update(mm, addr, pmdp, _PAGE_WRITE, 0); } +#define __HAVE_ARCH_PUDP_SET_WRPROTECT +static inline void pudp_set_wrprotect(struct mm_struct *mm, unsigned long addr, + pud_t *pudp) +{ + if (pud_write(*pudp)) + pud_hugepage_update(mm, addr, pudp, _PAGE_WRITE, 0); +} + /* * Only returns true for a THP. False for pmd migration entry. 
* We also need to return true when we come across a pte that @@ -1195,6 +1267,17 @@ static inline int pmd_trans_huge(pmd_t pmd) return hash__pmd_trans_huge(pmd); } +static inline int pud_trans_huge(pud_t pud) +{ + if (!pud_present(pud)) + return false; + + if (radix_enabled()) + return radix__pud_trans_huge(pud); + return 0; +} + + #define __HAVE_ARCH_PMD_SAME static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) { @@ -1203,6 +1286,15 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) return hash__pmd_same(pmd_a, pmd_b); } +#define pud_same pud_same +static inline int pud_same(pud_t pud_a, pud_t pud_b) +{ + if (radix_enabled()) + return radix__pud_same(pud_a, pud_b); + return hash__pud_same(pud_a, pud_b); +} + + static inline pmd_t __pmd_mkhuge(pmd_t pmd) { if (radix_enabled()) @@ -1210,6 +1302,14 @@ static inline pmd_t __pmd_mkhuge(pmd_t pmd) return hash__pmd_mkhuge(pmd); } +static inline pud_t __pud_mkhuge(pud_t pud) +{ + if (radix_enabled()) + return radix__pud_mkhuge(pud); + BUG(); + return pud; +} + /* * pfn_pmd return a pmd_t that can be used as pmd pte entry. 
*/ @@ -1225,14 +1325,34 @@ static inline pmd_t pmd_mkhuge(pmd_t pmd) return pmd; } +static inline pud_t pud_mkhuge(pud_t pud) +{ +#ifdef CONFIG_DEBUG_VM + if (radix_enabled()) + WARN_ON((pud_raw(pud) & cpu_to_be64(_PAGE_PTE)) == 0); + else + WARN_ON(1); +#endif + return pud; +} + + #define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS extern int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty); +#define __HAVE_ARCH_PUDP_SET_ACCESS_FLAGS +extern int pudp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp, + pud_t entry, int dirty); #define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); +#define __HAVE_ARCH_PUDP_TEST_AND_CLEAR_YOUNG +extern int pudp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp); + #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, @@ -1243,6 +1363,16 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, return hash__pmdp_huge_get_and_clear(mm, addr, pmdp); } +#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR +static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + if (radix_enabled()) + return radix__pudp_huge_get_and_clear(mm, addr, pudp); + BUG(); + return *pudp; +} + static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { @@ -1257,6 +1387,11 @@ pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp, int full); +#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL +pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma, + unsigned long addr, + pud_t *pudp, int full); + #define __HAVE_ARCH_PGTABLE_DEPOSIT static inline void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable) @@ -1305,6 +1440,14 @@ static 
inline pmd_t pmd_mkdevmap(pmd_t pmd) return hash__pmd_mkdevmap(pmd); } +static inline pud_t pud_mkdevmap(pud_t pud) +{ + if (radix_enabled()) + return radix__pud_mkdevmap(pud); + BUG(); + return pud; +} + static inline int pmd_devmap(pmd_t pmd) { return pte_devmap(pmd_pte(pmd)); @@ -1312,7 +1455,7 @@ static inline int pmd_devmap(pmd_t pmd) static inline int pud_devmap(pud_t pud) { - return 0; + return pte_devmap(pud_pte(pud)); } static inline int pgd_devmap(pgd_t pgd) @@ -1321,16 +1464,6 @@ static inline int pgd_devmap(pgd_t pgd) } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -static inline int pud_pfn(pud_t pud) -{ - /* - * Currently all calls to pud_pfn() are gated around a pud_devmap() - * check so this should never be used. If it grows another user we - * want to know about it. - */ - BUILD_BUG(); - return 0; -} #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION pte_t ptep_modify_prot_start(struct vm_area_struct *, unsigned long, pte_t *); void ptep_modify_prot_commit(struct vm_area_struct *, unsigned long, diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index 686001eda93663..2ef92f36340fc0 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -250,6 +250,10 @@ static inline int radix__pud_bad(pud_t pud) return !!(pud_val(pud) & RADIX_PUD_BAD_BITS); } +static inline int radix__pud_same(pud_t pud_a, pud_t pud_b) +{ + return ((pud_raw(pud_a) ^ pud_raw(pud_b)) == 0); +} static inline int radix__p4d_bad(p4d_t p4d) { @@ -268,9 +272,22 @@ static inline pmd_t radix__pmd_mkhuge(pmd_t pmd) return __pmd(pmd_val(pmd) | _PAGE_PTE); } +static inline int radix__pud_trans_huge(pud_t pud) +{ + return (pud_val(pud) & (_PAGE_PTE | _PAGE_DEVMAP)) == _PAGE_PTE; +} + +static inline pud_t radix__pud_mkhuge(pud_t pud) +{ + return __pud(pud_val(pud) | _PAGE_PTE); +} + extern unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, unsigned long 
clr, unsigned long set); +extern unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, unsigned long clr, + unsigned long set); extern pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); extern void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, @@ -278,6 +295,9 @@ extern void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, extern pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); extern pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp); +pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pud_t *pudp); + static inline int radix__has_transparent_hugepage(void) { /* For radix 2M at PMD level means thp */ @@ -285,6 +305,14 @@ static inline int radix__has_transparent_hugepage(void) return 1; return 0; } + +static inline int radix__has_transparent_pud_hugepage(void) +{ + /* For radix 1G at PUD level means pud hugepage support */ + if (mmu_psize_defs[MMU_PAGE_1G].shift == PUD_SHIFT) + return 1; + return 0; +} #endif static inline pmd_t radix__pmd_mkdevmap(pmd_t pmd) @@ -292,9 +320,17 @@ static inline pmd_t radix__pmd_mkdevmap(pmd_t pmd) return __pmd(pmd_val(pmd) | (_PAGE_PTE | _PAGE_DEVMAP)); } +static inline pud_t radix__pud_mkdevmap(pud_t pud) +{ + return __pud(pud_val(pud) | (_PAGE_PTE | _PAGE_DEVMAP)); +} + +struct vmem_altmap; extern int __meminit radix__vmemmap_create_mapping(unsigned long start, unsigned long page_size, unsigned long phys); +int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, + int node, struct vmem_altmap *altmap); extern void radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size); diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h index 77797a2a82eb2a..a38542259fab10 100644 --- 
a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h @@ -68,6 +68,8 @@ void radix__flush_tlb_pwc_range_psize(struct mm_struct *mm, unsigned long start, unsigned long end, int psize); extern void radix__flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); +extern void radix__flush_pud_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end); extern void radix__flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end); diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush.h b/arch/powerpc/include/asm/book3s/64/tlbflush.h index dca0477b07093a..1950c1b825b404 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush.h @@ -51,6 +51,14 @@ static inline void flush_pmd_tlb_range(struct vm_area_struct *vma, radix__flush_pmd_tlb_range(vma, start, end); } +#define __HAVE_ARCH_FLUSH_PUD_TLB_RANGE +static inline void flush_pud_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + if (radix_enabled()) + radix__flush_pud_tlb_range(vma, start, end); +} + #define __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE static inline void flush_hugetlb_tlb_range(struct vm_area_struct *vma, unsigned long start, diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 85c84e89e3eafc..75b938268b0409 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -64,11 +64,39 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, return changed; } +int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, + pud_t *pudp, pud_t entry, int dirty) +{ + int changed; +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pud_devmap(*pudp)); + assert_spin_locked(pud_lockptr(vma->vm_mm, pudp)); +#endif + changed = 
!pud_same(*(pudp), entry); + if (changed) { + /* + * We can use MMU_PAGE_1G here, because only radix + * path look at the psize. + */ + __ptep_set_access_flags(vma, pudp_ptep(pudp), + pud_pte(entry), address, MMU_PAGE_1G); + } + return changed; +} + + int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { return __pmdp_test_and_clear_young(vma->vm_mm, address, pmdp); } + +int pudp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, pud_t *pudp) +{ + return __pudp_test_and_clear_young(vma->vm_mm, address, pudp); +} + /* * set a new huge pmd. We should not be called for updating * an existing pmd entry. That should go via pmd_hugepage_update. @@ -90,6 +118,23 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd)); } +void set_pud_at(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud) +{ +#ifdef CONFIG_DEBUG_VM + /* + * Make sure hardware valid bit is not set. We don't do + * tlb flush for this update. + */ + + WARN_ON(pte_hw_valid(pud_pte(*pudp))); + assert_spin_locked(pud_lockptr(mm, pudp)); + WARN_ON(!(pud_large(pud))); +#endif + trace_hugepage_set_pud(addr, pud_val(pud)); + return set_pte_at(mm, addr, pudp_ptep(pudp), pud_pte(pud)); +} + static void do_serialize(void *arg) { /* We've taken the IPI, so try to trim the mask while here */ @@ -147,11 +192,35 @@ pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, return pmd; } +pud_t pudp_huge_get_and_clear_full(struct vm_area_struct *vma, + unsigned long addr, pud_t *pudp, int full) +{ + pud_t pud; + + VM_BUG_ON(addr & ~HPAGE_PMD_MASK); + VM_BUG_ON((pud_present(*pudp) && !pud_devmap(*pudp)) || + !pud_present(*pudp)); + pud = pudp_huge_get_and_clear(vma->vm_mm, addr, pudp); + /* + * if it not a fullmm flush, then we can possibly end up converting + * this PMD pte entry to a regular level 0 PTE by a parallel page fault. + * Make sure we flush the tlb in this case. 
+ */ + if (!full) + flush_pud_tlb_range(vma, addr, addr + HPAGE_PUD_SIZE); + return pud; +} + static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) { return __pmd(pmd_val(pmd) | pgprot_val(pgprot)); } +static pud_t pud_set_protbits(pud_t pud, pgprot_t pgprot) +{ + return __pud(pud_val(pud) | pgprot_val(pgprot)); +} + /* * At some point we should be able to get rid of * pmd_mkhuge() and mk_huge_pmd() when we update all the @@ -166,6 +235,15 @@ pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot) return __pmd_mkhuge(pmd_set_protbits(__pmd(pmdv), pgprot)); } +pud_t pfn_pud(unsigned long pfn, pgprot_t pgprot) +{ + unsigned long pudv; + + pudv = (pfn << PAGE_SHIFT) & PTE_RPN_MASK; + + return __pud_mkhuge(pud_set_protbits(__pud(pudv), pgprot)); +} + pmd_t mk_pmd(struct page *page, pgprot_t pgprot) { return pfn_pmd(page_to_pfn(page), pgprot); diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 02e185d2e4d6eb..227fea53c2175a 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -967,6 +967,23 @@ unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long add return old; } +unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, unsigned long clr, + unsigned long set) +{ + unsigned long old; + +#ifdef CONFIG_DEBUG_VM + WARN_ON(!pud_devmap(*pudp)); + assert_spin_locked(pud_lockptr(mm, pudp)); +#endif + + old = radix__pte_update(mm, addr, pudp_ptep(pudp), clr, set, 1); + trace_hugepage_update_pud(addr, old, clr, set); + + return old; +} + pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) @@ -1043,6 +1060,17 @@ pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm, return old_pmd; } +pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm, + unsigned long addr, pud_t *pudp) +{ + pud_t old_pud; + unsigned long old; + + old = radix__pud_hugepage_update(mm, addr, pudp, 
~0UL, 0); + old_pud = __pud(old); + return old_pud; +} + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, diff --git a/arch/powerpc/mm/book3s64/radix_tlb.c b/arch/powerpc/mm/book3s64/radix_tlb.c index 06e647ef19d1d8..3020a8b38572bc 100644 --- a/arch/powerpc/mm/book3s64/radix_tlb.c +++ b/arch/powerpc/mm/book3s64/radix_tlb.c @@ -1465,6 +1465,13 @@ void radix__flush_pmd_tlb_range(struct vm_area_struct *vma, } EXPORT_SYMBOL(radix__flush_pmd_tlb_range); +void radix__flush_pud_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_1G); +} +EXPORT_SYMBOL(radix__flush_pud_tlb_range); + void radix__flush_tlb_all(void) { unsigned long rb,prs,r,rs; diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 45fd975ef5212a..340b86ef72846e 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -94,6 +94,7 @@ config PPC_BOOK3S_64 select PPC_FPU select PPC_HAVE_PMU_SUPPORT select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD select ARCH_ENABLE_HUGEPAGE_MIGRATION if HUGETLB_PAGE && MIGRATION select ARCH_ENABLE_SPLIT_PMD_PTLOCK select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE diff --git a/include/trace/events/thp.h b/include/trace/events/thp.h index a95c78b105617f..f50048af5fcc28 100644 --- a/include/trace/events/thp.h +++ b/include/trace/events/thp.h @@ -30,6 +30,11 @@ DEFINE_EVENT(hugepage_set, hugepage_set_pmd, TP_ARGS(addr, pmd) ); +DEFINE_EVENT(hugepage_set, hugepage_set_pud, + TP_PROTO(unsigned long addr, unsigned long pud), + TP_ARGS(addr, pud) +); + DECLARE_EVENT_CLASS(hugepage_update, TP_PROTO(unsigned long addr, unsigned long pte, unsigned long clr, unsigned long set), @@ -57,6 +62,11 @@ DEFINE_EVENT(hugepage_update, hugepage_update_pmd, TP_ARGS(addr, pmd, clr, set) ); 
+DEFINE_EVENT(hugepage_update, hugepage_update_pud, + TP_PROTO(unsigned long addr, unsigned long pud, unsigned long clr, unsigned long set), + TP_ARGS(addr, pud, clr, set) +); + DECLARE_EVENT_CLASS(migration_pmd, TP_PROTO(unsigned long addr, unsigned long pmd), From 368a0590d954a659b16ab945328ada0cc10f93a0 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:56 +0530 Subject: [PATCH 237/489] powerpc/book3s64/vmemmap: switch radix to use a different vmemmap handling function This is in preparation to update radix to implement vmemmap optimization for devdax. Below are the rules w.r.t radix vmemmap mapping 1. First try to map things using PMD (2M) 2. With altmap if altmap cross-boundary check returns true, fall back to PAGE_SIZE 3. If we can't allocate PMD_SIZE backing memory for vmemmap, fall back to PAGE_SIZE On removing vmemmap mapping, check if every subsection that is using the vmemmap area is invalid. If found to be invalid, that implies we can safely free the vmemmap area. We don't use the PAGE_UNUSED pattern used by x86 because with 64K page size, we need to do the above check even at the PAGE_SIZE granularity.
[aneesh.kumar@linux.ibm.com: fix section mismatch warning] Link: https://lkml.kernel.org/r/87h6pqvu5g.fsf@linux.ibm.com [aneesh.kumar@linux.ibm.com: fix kernel build error] Link: https://lkml.kernel.org/r/877cqkwd20.fsf@linux.ibm.com Link: https://lkml.kernel.org/r/20230724190759.483013-11-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/book3s/64/radix.h | 2 + arch/powerpc/include/asm/pgtable.h | 6 + arch/powerpc/mm/book3s64/radix_pgtable.c | 327 +++++++++++++++++++-- arch/powerpc/mm/init_64.c | 26 +- 4 files changed, 329 insertions(+), 32 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index 2ef92f36340fc0..f1461289643a90 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -331,6 +331,8 @@ extern int __meminit radix__vmemmap_create_mapping(unsigned long start, unsigned long phys); int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap); +void __ref radix__vmemmap_free(unsigned long start, unsigned long end, + struct vmem_altmap *altmap); extern void radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size); diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 445a22987aa3a3..a4893b17705a26 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -157,6 +157,12 @@ static inline pgtable_t pmd_pgtable(pmd_t pmd) return (pgtable_t)pmd_page_vaddr(pmd); } +#ifdef CONFIG_PPC64 +int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size); +bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start, + unsigned long 
page_size); +#endif /* CONFIG_PPC64 */ + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_PGTABLE_H */ diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 227fea53c2175a..6d04dd579d039f 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -744,8 +744,58 @@ static void free_pud_table(pud_t *pud_start, p4d_t *p4d) p4d_clear(p4d); } -static void remove_pte_table(pte_t *pte_start, unsigned long addr, - unsigned long end, bool direct) +#ifdef CONFIG_SPARSEMEM_VMEMMAP +static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end) +{ + unsigned long start = ALIGN_DOWN(addr, PMD_SIZE); + + return !vmemmap_populated(start, PMD_SIZE); +} + +static bool __meminit vmemmap_page_is_unused(unsigned long addr, unsigned long end) +{ + unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE); + + return !vmemmap_populated(start, PAGE_SIZE); + +} +#endif + +static void __meminit free_vmemmap_pages(struct page *page, + struct vmem_altmap *altmap, + int order) +{ + unsigned int nr_pages = 1 << order; + + if (altmap) { + unsigned long alt_start, alt_end; + unsigned long base_pfn = page_to_pfn(page); + + /* + * with 2M vmemmap mapping we can have things setup + * such that even though altmap is specified we never + * used altmap.
+ */ + alt_start = altmap->base_pfn; + alt_end = altmap->base_pfn + altmap->reserve + altmap->free; + + if (base_pfn >= alt_start && base_pfn < alt_end) { + vmem_altmap_free(altmap, nr_pages); + return; + } + } + + if (PageReserved(page)) { + /* allocated from memblock */ + while (nr_pages--) + free_reserved_page(page++); + } else + free_pages((unsigned long)page_address(page), order); +} + +static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr, + unsigned long end, bool direct, + struct vmem_altmap *altmap) { unsigned long next, pages = 0; pte_t *pte; @@ -759,24 +809,26 @@ static void remove_pte_table(pte_t *pte_start, unsigned long addr, if (!pte_present(*pte)) continue; - if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) { - /* - * The vmemmap_free() and remove_section_mapping() - * codepaths call us with aligned addresses. - */ - WARN_ONCE(1, "%s: unaligned range\n", __func__); - continue; + if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) { + if (!direct) + free_vmemmap_pages(pte_page(*pte), altmap, 0); + pte_clear(&init_mm, addr, pte); + pages++; } - - pte_clear(&init_mm, addr, pte); - pages++; +#ifdef CONFIG_SPARSEMEM_VMEMMAP + else if (!direct && vmemmap_page_is_unused(addr, next)) { + free_vmemmap_pages(pte_page(*pte), altmap, 0); + pte_clear(&init_mm, addr, pte); + } +#endif } if (direct) update_page_count(mmu_virtual_psize, -pages); } static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr, - unsigned long end, bool direct) + unsigned long end, bool direct, + struct vmem_altmap *altmap) { unsigned long next, pages = 0; pte_t *pte_base; @@ -790,18 +842,24 @@ static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr, continue; if (pmd_is_leaf(*pmd)) { - if (!IS_ALIGNED(addr, PMD_SIZE) || - !IS_ALIGNED(next, PMD_SIZE)) { - WARN_ONCE(1, "%s: unaligned range\n", __func__); - continue; + if (IS_ALIGNED(addr, PMD_SIZE) && + IS_ALIGNED(next, PMD_SIZE)) { + if (!direct) + free_vmemmap_pages(pmd_page(*pmd), 
altmap, get_order(PMD_SIZE)); + pte_clear(&init_mm, addr, (pte_t *)pmd); + pages++; } - pte_clear(&init_mm, addr, (pte_t *)pmd); - pages++; +#ifdef CONFIG_SPARSEMEM_VMEMMAP + else if (!direct && vmemmap_pmd_is_unused(addr, next)) { + free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE)); + pte_clear(&init_mm, addr, (pte_t *)pmd); + } +#endif continue; } pte_base = (pte_t *)pmd_page_vaddr(*pmd); - remove_pte_table(pte_base, addr, next, direct); + remove_pte_table(pte_base, addr, next, direct, altmap); free_pte_table(pte_base, pmd); } if (direct) @@ -809,7 +867,8 @@ static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr, } static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr, - unsigned long end, bool direct) + unsigned long end, bool direct, + struct vmem_altmap *altmap) { unsigned long next, pages = 0; pmd_t *pmd_base; @@ -834,15 +893,16 @@ static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr, } pmd_base = pud_pgtable(*pud); - remove_pmd_table(pmd_base, addr, next, direct); + remove_pmd_table(pmd_base, addr, next, direct, altmap); free_pmd_table(pmd_base, pud); } if (direct) update_page_count(MMU_PAGE_1G, -pages); } -static void __meminit remove_pagetable(unsigned long start, unsigned long end, - bool direct) +static void __meminit +remove_pagetable(unsigned long start, unsigned long end, bool direct, + struct vmem_altmap *altmap) { unsigned long addr, next; pud_t *pud_base; @@ -871,7 +931,7 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end, } pud_base = p4d_pgtable(*p4d); - remove_pud_table(pud_base, addr, next, direct); + remove_pud_table(pud_base, addr, next, direct, altmap); free_pud_table(pud_base, p4d); } @@ -894,7 +954,7 @@ int __meminit radix__create_section_mapping(unsigned long start, int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end) { - remove_pagetable(start, end, true); + remove_pagetable(start, end, true, NULL); 
return 0; } #endif /* CONFIG_MEMORY_HOTPLUG */ @@ -926,10 +986,223 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start, return 0; } +int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node, + unsigned long addr, unsigned long next) +{ + int large = pmd_large(*pmdp); + + if (large) + vmemmap_verify(pmdp_ptep(pmdp), node, addr, next); + + return large; +} + +void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node, + unsigned long addr, unsigned long next) +{ + pte_t entry; + pte_t *ptep = pmdp_ptep(pmdp); + + VM_BUG_ON(!IS_ALIGNED(addr, PMD_SIZE)); + entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); + set_pte_at(&init_mm, addr, ptep, entry); + asm volatile("ptesync": : :"memory"); + + vmemmap_verify(ptep, node, addr, next); +} + +static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long addr, + int node, + struct vmem_altmap *altmap, + struct page *reuse) +{ + pte_t *pte = pte_offset_kernel(pmdp, addr); + + if (pte_none(*pte)) { + pte_t entry; + void *p; + + if (!reuse) { + /* + * make sure we don't create altmap mappings + * covering things outside the device. + */ + if (altmap && altmap_cross_boundary(altmap, addr, PAGE_SIZE)) + altmap = NULL; + + p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap); + if (!p && altmap) + p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL); + if (!p) + return NULL; + } else { + /* + * When a PTE/PMD entry is freed from the init_mm + * there's a free_pages() call to this page allocated + * above. Thus this get_page() is paired with the + * put_page_testzero() on the freeing path. + * This can only called by certain ZONE_DEVICE path, + * and through vmemmap_populate_compound_pages() when + * slab is available. 
+ */ + get_page(reuse); + p = page_to_virt(reuse); + } + + VM_BUG_ON(!PAGE_ALIGNED(addr)); + entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); + set_pte_at(&init_mm, addr, pte, entry); + asm volatile("ptesync": : :"memory"); + } + return pte; +} + +static inline pud_t *vmemmap_pud_alloc(p4d_t *p4dp, int node, + unsigned long address) +{ + pud_t *pud; + + /* All early vmemmap mapping to keep simple do it at PAGE_SIZE */ + if (unlikely(p4d_none(*p4dp))) { + if (unlikely(!slab_is_available())) { + pud = early_alloc_pgtable(PAGE_SIZE, node, 0, 0); + p4d_populate(&init_mm, p4dp, pud); + /* go to the pud_offset */ + } else + return pud_alloc(&init_mm, p4dp, address); + } + return pud_offset(p4dp, address); +} + +static inline pmd_t *vmemmap_pmd_alloc(pud_t *pudp, int node, + unsigned long address) +{ + pmd_t *pmd; + + /* All early vmemmap mapping to keep simple do it at PAGE_SIZE */ + if (unlikely(pud_none(*pudp))) { + if (unlikely(!slab_is_available())) { + pmd = early_alloc_pgtable(PAGE_SIZE, node, 0, 0); + pud_populate(&init_mm, pudp, pmd); + } else + return pmd_alloc(&init_mm, pudp, address); + } + return pmd_offset(pudp, address); +} + +static inline pte_t *vmemmap_pte_alloc(pmd_t *pmdp, int node, + unsigned long address) +{ + pte_t *pte; + + /* All early vmemmap mapping to keep simple do it at PAGE_SIZE */ + if (unlikely(pmd_none(*pmdp))) { + if (unlikely(!slab_is_available())) { + pte = early_alloc_pgtable(PAGE_SIZE, node, 0, 0); + pmd_populate(&init_mm, pmdp, pte); + } else + return pte_alloc_kernel(pmdp, address); + } + return pte_offset_kernel(pmdp, address); +} + + + +int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, int node, + struct vmem_altmap *altmap) +{ + unsigned long addr; + unsigned long next; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + for (addr = start; addr < end; addr = next) { + next = pmd_addr_end(addr, end); + + pgd = pgd_offset_k(addr); + p4d = p4d_offset(pgd, addr); + pud = 
vmemmap_pud_alloc(p4d, node, addr); + if (!pud) + return -ENOMEM; + pmd = vmemmap_pmd_alloc(pud, node, addr); + if (!pmd) + return -ENOMEM; + + if (pmd_none(READ_ONCE(*pmd))) { + void *p; + + /* + * keep it simple by checking addr PMD_SIZE alignment + * and verifying the device boundary condition. + * For us to use a pmd mapping, both addr and pfn should + * be aligned. We skip if addr is not aligned and for + * pfn we hope we have extra area in the altmap that + * can help to find an aligned block. This can result + * in altmap block allocation failures, in which case + * we fallback to RAM for vmemmap allocation. + */ + if (altmap && (!IS_ALIGNED(addr, PMD_SIZE) || + altmap_cross_boundary(altmap, addr, PMD_SIZE))) { + /* + * make sure we don't create altmap mappings + * covering things outside the device. + */ + goto base_mapping; + } + + p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap); + if (p) { + vmemmap_set_pmd(pmd, p, node, addr, next); + continue; + } else if (altmap) { + /* + * A vmemmap block allocation can fail due to + * alignment requirements and we trying to align + * things aggressively there by running out of + * space. Try base mapping on failure. + */ + goto base_mapping; + } + } else if (vmemmap_check_pmd(pmd, node, addr, next)) { + /* + * If a huge mapping exist due to early call to + * vmemmap_populate, let's try to use that. + */ + continue; + } +base_mapping: + /* + * Not able allocate higher order memory to back memmap + * or we found a pointer to pte page. 
Allocate base page + * size vmemmap + */ + pte = vmemmap_pte_alloc(pmd, node, addr); + if (!pte) + return -ENOMEM; + + pte = radix__vmemmap_pte_populate(pmd, addr, node, altmap, NULL); + if (!pte) + return -ENOMEM; + + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + next = addr + PAGE_SIZE; + } + return 0; +} + #ifdef CONFIG_MEMORY_HOTPLUG void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size) { - remove_pagetable(start, start + page_size, false); + remove_pagetable(start, start + page_size, true, NULL); +} + +void __ref radix__vmemmap_free(unsigned long start, unsigned long end, + struct vmem_altmap *altmap) +{ + remove_pagetable(start, end, false, altmap); } #endif #endif diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index fe1b83020e0df0..af0c9891c38f45 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -92,7 +92,7 @@ static struct page * __meminit vmemmap_subsection_start(unsigned long vmemmap_ad * a page table lookup here because with the hash translation we don't keep * vmemmap details in linux page table. 
*/ -static int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size) +int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size) { struct page *start; unsigned long vmemmap_end = vmemmap_addr + vmemmap_map_size; @@ -183,8 +183,8 @@ static __meminit int vmemmap_list_populate(unsigned long phys, return 0; } -static bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start, - unsigned long page_size) +bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start, + unsigned long page_size) { unsigned long nr_pfn = page_size / sizeof(struct page); unsigned long start_pfn = page_to_pfn((struct page *)start); @@ -204,6 +204,11 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, bool altmap_alloc; unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; +#ifdef CONFIG_PPC_BOOK3S_64 + if (radix_enabled()) + return radix__vmemmap_populate(start, end, node, altmap); +#endif + /* Align to the page size of the linear mapping. 
*/ start = ALIGN_DOWN(start, page_size); @@ -303,8 +308,8 @@ static unsigned long vmemmap_list_free(unsigned long start) return vmem_back->phys; } -void __ref vmemmap_free(unsigned long start, unsigned long end, - struct vmem_altmap *altmap) +static void __ref __vmemmap_free(unsigned long start, unsigned long end, + struct vmem_altmap *altmap) { unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; unsigned long page_order = get_order(page_size); @@ -362,6 +367,17 @@ void __ref vmemmap_free(unsigned long start, unsigned long end, vmemmap_remove_mapping(start, page_size); } } + +void __ref vmemmap_free(unsigned long start, unsigned long end, + struct vmem_altmap *altmap) +{ +#ifdef CONFIG_PPC_BOOK3S_64 + if (radix_enabled()) + return radix__vmemmap_free(start, end, altmap); +#endif + return __vmemmap_free(start, end, altmap); +} + #endif void register_page_bootmem_memmap(unsigned long section_nr, struct page *start_page, unsigned long size) From f2b79c0d79683d552369bb9a7282c1f0226fc566 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:57 +0530 Subject: [PATCH 238/489] powerpc/book3s64/radix: add support for vmemmap optimization for radix With 2M PMD-level mapping, we require 32 struct pages and a single vmemmap page can contain 1024 struct pages (PAGE_SIZE/sizeof(struct page)). Hence with 64K page size, we don't use vmemmap deduplication for PMD-level mapping. 
[aneesh.kumar@linux.ibm.com: ppc64: don't include radix headers if CONFIG_PPC_RADIX_MMU=n] Link: https://lkml.kernel.org/r/87zg3jw8km.fsf@linux.ibm.com Link: https://lkml.kernel.org/r/20230724190759.483013-12-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- Documentation/mm/vmemmap_dedup.rst | 1 + Documentation/powerpc/index.rst | 1 + Documentation/powerpc/vmemmap_dedup.rst | 101 ++++++++++ arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/book3s/64/radix.h | 11 ++ arch/powerpc/mm/book3s64/radix_pgtable.c | 203 +++++++++++++++++++++ 6 files changed, 318 insertions(+) create mode 100644 Documentation/powerpc/vmemmap_dedup.rst diff --git a/Documentation/mm/vmemmap_dedup.rst b/Documentation/mm/vmemmap_dedup.rst index a4b12ff906c4df..c573e08b504334 100644 --- a/Documentation/mm/vmemmap_dedup.rst +++ b/Documentation/mm/vmemmap_dedup.rst @@ -210,6 +210,7 @@ the device (altmap). The following page sizes are supported in DAX: PAGE_SIZE (4K on x86_64), PMD_SIZE (2M on x86_64) and PUD_SIZE (1G on x86_64). +For powerpc equivalent details see Documentation/powerpc/vmemmap_dedup.rst The differences with HugeTLB are relatively minor. diff --git a/Documentation/powerpc/index.rst b/Documentation/powerpc/index.rst index d33b554ca7ba0d..a5083479845432 100644 --- a/Documentation/powerpc/index.rst +++ b/Documentation/powerpc/index.rst @@ -36,6 +36,7 @@ powerpc ultravisor vas-api vcpudispatch_stats + vmemmap_dedup features diff --git a/Documentation/powerpc/vmemmap_dedup.rst b/Documentation/powerpc/vmemmap_dedup.rst new file mode 100644 index 00000000000000..dc4db59fdf87b4 --- /dev/null +++ b/Documentation/powerpc/vmemmap_dedup.rst @@ -0,0 +1,101 @@ +.. 
SPDX-License-Identifier: GPL-2.0 + +========== +Device DAX +========== + +The device-dax interface uses the tail deduplication technique explained in +Documentation/mm/vmemmap_dedup.rst + +On powerpc, vmemmap deduplication is only used with radix MMU translation. Also +with a 64K page size, only the devdax namespace with 1G alignment uses vmemmap +deduplication. + +With 2M PMD level mapping, we require 32 struct pages and a single 64K vmemmap +page can contain 1024 struct pages (64K/sizeof(struct page)). Hence there is no +vmemmap deduplication possible. + +With 1G PUD level mapping, we require 16384 struct pages and a single 64K +vmemmap page can contain 1024 struct pages (64K/sizeof(struct page)). Hence we +require 16 64K pages in vmemmap to map the struct page for 1G PUD level mapping. + +Here's how things look like on device-dax after the sections are populated:: + +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ + | | | 0 | -------------> | 0 | + | | +-----------+ +-----------+ + | | | 1 | -------------> | 1 | + | | +-----------+ +-----------+ + | | | 2 | ----------------^ ^ ^ ^ ^ ^ + | | +-----------+ | | | | | + | | | 3 | ------------------+ | | | | + | | +-----------+ | | | | + | | | 4 | --------------------+ | | | + | PUD | +-----------+ | | | + | level | | . | ----------------------+ | | + | mapping | +-----------+ | | + | | | . | ------------------------+ | + | | +-----------+ | + | | | 15 | --------------------------+ + | | +-----------+ + | | + | | + | | + +-----------+ + + +With 4K page size, 2M PMD level mapping requires 512 struct pages and a single +4K vmemmap page contains 64 struct pages(4K/sizeof(struct page)). Hence we +require 8 4K pages in vmemmap to map the struct page for 2M pmd level mapping. 
+ +Here's how things look like on device-dax after the sections are populated:: + + +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ + | | | 0 | -------------> | 0 | + | | +-----------+ +-----------+ + | | | 1 | -------------> | 1 | + | | +-----------+ +-----------+ + | | | 2 | ----------------^ ^ ^ ^ ^ ^ + | | +-----------+ | | | | | + | | | 3 | ------------------+ | | | | + | | +-----------+ | | | | + | | | 4 | --------------------+ | | | + | PMD | +-----------+ | | | + | level | | 5 | ----------------------+ | | + | mapping | +-----------+ | | + | | | 6 | ------------------------+ | + | | +-----------+ | + | | | 7 | --------------------------+ + | | +-----------+ + | | + | | + | | + +-----------+ + +With 1G PUD level mapping, we require 262144 struct pages and a single 4K +vmemmap page can contain 64 struct pages (4K/sizeof(struct page)). Hence we +require 4096 4K pages in vmemmap to map the struct pages for 1G PUD level +mapping. + +Here's how things look like on device-dax after the sections are populated:: + + +-----------+ ---virt_to_page---> +-----------+ mapping to +-----------+ + | | | 0 | -------------> | 0 | + | | +-----------+ +-----------+ + | | | 1 | -------------> | 1 | + | | +-----------+ +-----------+ + | | | 2 | ----------------^ ^ ^ ^ ^ ^ + | | +-----------+ | | | | | + | | | 3 | ------------------+ | | | | + | | +-----------+ | | | | + | | | 4 | --------------------+ | | | + | PUD | +-----------+ | | | + | level | | . | ----------------------+ | | + | mapping | +-----------+ | | + | | | . 
| ------------------------+ | + | | +-----------+ | + | | | 4095 | --------------------------+ + | | +-----------+ + | | + | | + | | + +-----------+ diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 9222c138c45702..d0497d13f5b494 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -174,6 +174,7 @@ config PPC select ARCH_WANT_IPC_PARSE_VERSION select ARCH_WANT_IRQS_OFF_ACTIVATE_MM select ARCH_WANT_LD_ORPHAN_WARN + select ARCH_WANT_OPTIMIZE_DAX_VMEMMAP if PPC_RADIX_MMU select ARCH_WANTS_MODULES_DATA_IN_VMALLOC if PPC_BOOK3S_32 || PPC_8xx select ARCH_WEAK_RELEASE_ACQUIRE select BINFMT_ELF diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index f1461289643a90..357e23a403d341 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -326,6 +326,7 @@ static inline pud_t radix__pud_mkdevmap(pud_t pud) } struct vmem_altmap; +struct dev_pagemap; extern int __meminit radix__vmemmap_create_mapping(unsigned long start, unsigned long page_size, unsigned long phys); @@ -363,5 +364,15 @@ int radix__remove_section_mapping(unsigned long start, unsigned long end); void radix__kernel_map_pages(struct page *page, int numpages, int enable); +#ifdef CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP +#define vmemmap_can_optimize vmemmap_can_optimize +bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap); +#endif + +#define vmemmap_populate_compound_pages vmemmap_populate_compound_pages +int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn, + unsigned long start, + unsigned long end, int node, + struct dev_pagemap *pgmap); #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 6d04dd579d039f..c264e6a3620e84 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -986,6 +986,15 @@ int __meminit 
radix__vmemmap_create_mapping(unsigned long start, return 0; } + +bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap) +{ + if (radix_enabled()) + return __vmemmap_can_optimize(altmap, pgmap); + + return false; +} + int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node, unsigned long addr, unsigned long next) { @@ -1193,6 +1202,200 @@ int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, in return 0; } +static pte_t * __meminit radix__vmemmap_populate_address(unsigned long addr, int node, + struct vmem_altmap *altmap, + struct page *reuse) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset_k(addr); + p4d = p4d_offset(pgd, addr); + pud = vmemmap_pud_alloc(p4d, node, addr); + if (!pud) + return NULL; + pmd = vmemmap_pmd_alloc(pud, node, addr); + if (!pmd) + return NULL; + if (pmd_leaf(*pmd)) + /* + * The second page is mapped as a hugepage due to a nearby request. + * Force our mapping to page size without deduplication + */ + return NULL; + pte = vmemmap_pte_alloc(pmd, node, addr); + if (!pte) + return NULL; + radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL); + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + + return pte; +} + +static pte_t * __meminit vmemmap_compound_tail_page(unsigned long addr, + unsigned long pfn_offset, int node) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + unsigned long map_addr; + + /* the second vmemmap page which we use for duplication */ + map_addr = addr - pfn_offset * sizeof(struct page) + PAGE_SIZE; + pgd = pgd_offset_k(map_addr); + p4d = p4d_offset(pgd, map_addr); + pud = vmemmap_pud_alloc(p4d, node, map_addr); + if (!pud) + return NULL; + pmd = vmemmap_pmd_alloc(pud, node, map_addr); + if (!pmd) + return NULL; + if (pmd_leaf(*pmd)) + /* + * The second page is mapped as a hugepage due to a nearby request. 
+ * Force our mapping to page size without deduplication + */ + return NULL; + pte = vmemmap_pte_alloc(pmd, node, map_addr); + if (!pte) + return NULL; + /* + * Check if there exist a mapping to the left + */ + if (pte_none(*pte)) { + /* + * Populate the head page vmemmap page. + * It can fall in different pmd, hence + * vmemmap_populate_address() + */ + pte = radix__vmemmap_populate_address(map_addr - PAGE_SIZE, node, NULL, NULL); + if (!pte) + return NULL; + /* + * Populate the tail pages vmemmap page + */ + pte = radix__vmemmap_pte_populate(pmd, map_addr, node, NULL, NULL); + if (!pte) + return NULL; + vmemmap_verify(pte, node, map_addr, map_addr + PAGE_SIZE); + return pte; + } + return pte; +} + +int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn, + unsigned long start, + unsigned long end, int node, + struct dev_pagemap *pgmap) +{ + /* + * we want to map things as base page size mapping so that + * we can save space in vmemmap. We could have huge mapping + * covering out both edges. + */ + unsigned long addr; + unsigned long addr_pfn = start_pfn; + unsigned long next; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + for (addr = start; addr < end; addr = next) { + + pgd = pgd_offset_k(addr); + p4d = p4d_offset(pgd, addr); + pud = vmemmap_pud_alloc(p4d, node, addr); + if (!pud) + return -ENOMEM; + pmd = vmemmap_pmd_alloc(pud, node, addr); + if (!pmd) + return -ENOMEM; + + if (pmd_leaf(READ_ONCE(*pmd))) { + /* existing huge mapping. Skip the range */ + addr_pfn += (PMD_SIZE >> PAGE_SHIFT); + next = pmd_addr_end(addr, end); + continue; + } + pte = vmemmap_pte_alloc(pmd, node, addr); + if (!pte) + return -ENOMEM; + if (!pte_none(*pte)) { + /* + * This could be because we already have a compound + * page whose VMEMMAP_RESERVE_NR pages were mapped and + * this request fall in those pages. 
+ */ + addr_pfn += 1; + next = addr + PAGE_SIZE; + continue; + } else { + unsigned long nr_pages = pgmap_vmemmap_nr(pgmap); + unsigned long pfn_offset = addr_pfn - ALIGN_DOWN(addr_pfn, nr_pages); + pte_t *tail_page_pte; + + /* + * if the address is aligned to huge page size it is the + * head mapping. + */ + if (pfn_offset == 0) { + /* Populate the head page vmemmap page */ + pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL); + if (!pte) + return -ENOMEM; + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + + /* + * Populate the tail pages vmemmap page + * It can fall in different pmd, hence + * vmemmap_populate_address() + */ + pte = radix__vmemmap_populate_address(addr + PAGE_SIZE, node, NULL, NULL); + if (!pte) + return -ENOMEM; + + addr_pfn += 2; + next = addr + 2 * PAGE_SIZE; + continue; + } + /* + * get the 2nd mapping details + * Also create it if that doesn't exist + */ + tail_page_pte = vmemmap_compound_tail_page(addr, pfn_offset, node); + if (!tail_page_pte) { + + pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL); + if (!pte) + return -ENOMEM; + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + + addr_pfn += 1; + next = addr + PAGE_SIZE; + continue; + } + + pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, pte_page(*tail_page_pte)); + if (!pte) + return -ENOMEM; + vmemmap_verify(pte, node, addr, addr + PAGE_SIZE); + + addr_pfn += 1; + next = addr + PAGE_SIZE; + continue; + } + } + return 0; +} + + #ifdef CONFIG_MEMORY_HOTPLUG void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size) { From 601f006fddc66e369fdac7c572f981eafd159dac Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:58 +0530 Subject: [PATCH 239/489] powerpc/book3s64/radix: remove mmu_vmemmap_psize This is not used by radix anymore. 
[aneesh.kumar@linux.ibm.com: fix kernel build error] Link: https://lkml.kernel.org/r/874jlowd0c.fsf@linux.ibm.com Link: https://lkml.kernel.org/r/20230724190759.483013-13-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/mm/book3s64/radix_pgtable.c | 11 ----------- arch/powerpc/mm/init_64.c | 21 ++++++++++++++------- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index c264e6a3620e84..955cf6b68a3a62 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -601,17 +601,6 @@ void __init radix__early_init_mmu(void) #else mmu_virtual_psize = MMU_PAGE_4K; #endif - -#ifdef CONFIG_SPARSEMEM_VMEMMAP - /* vmemmap mapping */ - if (mmu_psize_defs[MMU_PAGE_2M].shift) { - /* - * map vmemmap using 2M if available - */ - mmu_vmemmap_psize = MMU_PAGE_2M; - } else - mmu_vmemmap_psize = mmu_virtual_psize; -#endif #endif /* * initialize page table size diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index af0c9891c38f45..f7930360406e1a 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -198,17 +198,12 @@ bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start, return false; } -int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, - struct vmem_altmap *altmap) +static int __meminit __vmemmap_populate(unsigned long start, unsigned long end, int node, + struct vmem_altmap *altmap) { bool altmap_alloc; unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; -#ifdef CONFIG_PPC_BOOK3S_64 - if (radix_enabled()) - return radix__vmemmap_populate(start, end, node, altmap); -#endif - /* Align to the page size of 
the linear mapping. */ start = ALIGN_DOWN(start, page_size); @@ -277,6 +272,18 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, return 0; } +int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, + struct vmem_altmap *altmap) +{ + +#ifdef CONFIG_PPC_BOOK3S_64 + if (radix_enabled()) + return radix__vmemmap_populate(start, end, node, altmap); +#endif + + return __vmemmap_populate(start, end, node, altmap); +} + #ifdef CONFIG_MEMORY_HOTPLUG static unsigned long vmemmap_list_free(unsigned long start) { From 6be3601517d90b728095d70c14f3a04b9adcb166 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 25 Jul 2023 00:37:59 +0530 Subject: [PATCH 240/489] powerpc/book3s64/radix: add debug message to give more details of vmemmap allocation Add extra vmemmap pr_debug messages that indicate the type of vmemmap allocation. For example, with DAX vmemmap optimization we can find the below details: [ 187.166580] radix-mmu: PAGE_SIZE vmemmap mapping [ 187.166587] radix-mmu: PAGE_SIZE vmemmap mapping [ 187.166591] radix-mmu: Tail page reuse vmemmap mapping [ 187.166594] radix-mmu: Tail page reuse vmemmap mapping [ 187.166598] radix-mmu: Tail page reuse vmemmap mapping [ 187.166601] radix-mmu: Tail page reuse vmemmap mapping [ 187.166604] radix-mmu: Tail page reuse vmemmap mapping [ 187.166608] radix-mmu: Tail page reuse vmemmap mapping [ 187.166611] radix-mmu: Tail page reuse vmemmap mapping [ 187.166614] radix-mmu: Tail page reuse vmemmap mapping [ 187.166617] radix-mmu: Tail page reuse vmemmap mapping [ 187.166620] radix-mmu: Tail page reuse vmemmap mapping [ 187.166623] radix-mmu: Tail page reuse vmemmap mapping [ 187.166626] radix-mmu: Tail page reuse vmemmap mapping [ 187.166629] radix-mmu: Tail page reuse vmemmap mapping [ 187.166632] radix-mmu: Tail page reuse vmemmap mapping And without vmemmap optimization [ 293.549931] radix-mmu: PMD_SIZE vmemmap mapping [ 293.549984] radix-mmu: PMD_SIZE vmemmap
mapping [ 293.550032] radix-mmu: PMD_SIZE vmemmap mapping [ 293.550076] radix-mmu: PMD_SIZE vmemmap mapping [ 293.550117] radix-mmu: PMD_SIZE vmemmap mapping Link: https://lkml.kernel.org/r/20230724190759.483013-14-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dan Williams Cc: Joao Martins Cc: Michael Ellerman Cc: Mike Kravetz Cc: Muchun Song Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/mm/book3s64/radix_pgtable.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 955cf6b68a3a62..96679018e7fb49 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -1033,6 +1033,7 @@ static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL); if (!p) return NULL; + pr_debug("PAGE_SIZE vmemmap mapping\n"); } else { /* * When a PTE/PMD entry is freed from the init_mm @@ -1045,6 +1046,7 @@ static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long */ get_page(reuse); p = page_to_virt(reuse); + pr_debug("Tail page reuse vmemmap mapping\n"); } VM_BUG_ON(!PAGE_ALIGNED(addr)); @@ -1154,6 +1156,7 @@ int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, in p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap); if (p) { vmemmap_set_pmd(pmd, p, node, addr, next); + pr_debug("PMD_SIZE vmemmap mapping\n"); continue; } else if (altmap) { /* From b8cf32dc6e8c75b712cbf638e0fd210101c22f17 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Tue, 20 Jun 2023 19:46:44 +0000 Subject: [PATCH 241/489] mm: zswap: multiple zpools support Support using multiple zpools of the same type in zswap, for concurrency purposes. A fixed number of 32 zpools is suggested by this commit, which was determined empirically. 
It can be later changed or made into a config option if needed. On a setup with zswap and zsmalloc, comparing a single zpool to 32 zpools shows improvements in the zsmalloc lock contention, especially on the swap out path. The following shows the perf analysis of the swapout path when 10 workloads are simultaneously reclaiming and refaulting tmpfs pages. There are some improvements on the swap in path as well, but less significant. 1 zpool: |--28.99%--zswap_frontswap_store | | |--8.98%--zpool_map_handle | | | --8.98%--zs_zpool_map | | | --8.95%--zs_map_object | | | --8.38%--_raw_spin_lock | | | --7.39%--queued_spin_lock_slowpath | |--8.82%--zpool_malloc | | | --8.82%--zs_zpool_malloc | | | --8.80%--zs_malloc | | | |--7.21%--_raw_spin_lock | | | | | --6.81%--queued_spin_lock_slowpath 32 zpools: |--16.73%--zswap_frontswap_store | | |--1.81%--zpool_malloc | | | --1.81%--zs_zpool_malloc | | | --1.79%--zs_malloc | | | --0.73%--obj_malloc | |--1.06%--zswap_update_total_size | |--0.59%--zpool_map_handle | | | --0.59%--zs_zpool_map | | | --0.57%--zs_map_object | | | --0.51%--_raw_spin_lock Link: https://lkml.kernel.org/r/20230620194644.3142384-1-yosryahmed@google.com Signed-off-by: Yosry Ahmed Suggested-by: Yu Zhao Acked-by: Chris Li (Google) Reviewed-by: Nhat Pham Tested-by: Nhat Pham Cc: Dan Streetman Cc: Domenico Cerasuolo Cc: Johannes Weiner Cc: Konrad Rzeszutek Wilk Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton --- mm/zswap.c | 81 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 54 insertions(+), 27 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 62195f72bf5614..258e4e17799a02 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -142,6 +142,9 @@ static bool zswap_exclusive_loads_enabled = IS_ENABLED( CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON); module_param_named(exclusive_loads, zswap_exclusive_loads_enabled, bool, 0644); +/* Number of zpools in zswap_pool (empirically determined for scalability) */ +#define ZSWAP_NR_ZPOOLS 32 + 
/********************************* * data structures **********************************/ @@ -161,7 +164,7 @@ struct crypto_acomp_ctx { * needs to be verified that it's still valid in the tree. */ struct zswap_pool { - struct zpool *zpool; + struct zpool *zpools[ZSWAP_NR_ZPOOLS]; struct crypto_acomp_ctx __percpu *acomp_ctx; struct kref kref; struct list_head list; @@ -248,7 +251,7 @@ static bool zswap_has_pool; #define zswap_pool_debug(msg, p) \ pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \ - zpool_get_type((p)->zpool)) + zpool_get_type((p)->zpools[0])) static int zswap_writeback_entry(struct zswap_entry *entry, struct zswap_tree *tree); @@ -272,11 +275,13 @@ static void zswap_update_total_size(void) { struct zswap_pool *pool; u64 total = 0; + int i; rcu_read_lock(); list_for_each_entry_rcu(pool, &zswap_pools, list) - total += zpool_get_total_size(pool->zpool); + for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) + total += zpool_get_total_size(pool->zpools[i]); rcu_read_unlock(); @@ -365,6 +370,16 @@ static bool zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) return false; } +static struct zpool *zswap_find_zpool(struct zswap_entry *entry) +{ + int i = 0; + + if (ZSWAP_NR_ZPOOLS > 1) + i = hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS)); + + return entry->pool->zpools[i]; +} + /* * Carries out the common pattern of freeing and entry's zpool allocation, * freeing the entry itself, and decrementing the number of stored pages. 
@@ -381,7 +396,7 @@ static void zswap_free_entry(struct zswap_entry *entry) spin_lock(&entry->pool->lru_lock); list_del(&entry->lru); spin_unlock(&entry->pool->lru_lock); - zpool_free(entry->pool->zpool, entry->handle); + zpool_free(zswap_find_zpool(entry), entry->handle); zswap_pool_put(entry->pool); } zswap_entry_cache_free(entry); @@ -590,7 +605,8 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) list_for_each_entry_rcu(pool, &zswap_pools, list) { if (strcmp(pool->tfm_name, compressor)) continue; - if (strcmp(zpool_get_type(pool->zpool), type)) + /* all zpools share the same type */ + if (strcmp(zpool_get_type(pool->zpools[0]), type)) continue; /* if we can't get it, it's about to be destroyed */ if (!zswap_pool_get(pool)) @@ -695,6 +711,7 @@ static void shrink_worker(struct work_struct *w) static struct zswap_pool *zswap_pool_create(char *type, char *compressor) { + int i; struct zswap_pool *pool; char name[38]; /* 'zswap' + 32 char (max) num + \0 */ gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; @@ -715,15 +732,18 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor) if (!pool) return NULL; - /* unique name for each pool specifically required by zsmalloc */ - snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count)); + for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) { + /* unique name for each pool specifically required by zsmalloc */ + snprintf(name, 38, "zswap%x", + atomic_inc_return(&zswap_pools_count)); - pool->zpool = zpool_create_pool(type, name, gfp); - if (!pool->zpool) { - pr_err("%s zpool not available\n", type); - goto error; + pool->zpools[i] = zpool_create_pool(type, name, gfp); + if (!pool->zpools[i]) { + pr_err("%s zpool not available\n", type); + goto error; + } } - pr_debug("using %s zpool\n", zpool_get_type(pool->zpool)); + pr_debug("using %s zpool\n", zpool_get_type(pool->zpools[0])); strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name)); @@ -755,8 +775,8 @@ static 
struct zswap_pool *zswap_pool_create(char *type, char *compressor) error: if (pool->acomp_ctx) free_percpu(pool->acomp_ctx); - if (pool->zpool) - zpool_destroy_pool(pool->zpool); + while (i--) + zpool_destroy_pool(pool->zpools[i]); kfree(pool); return NULL; } @@ -805,11 +825,14 @@ static struct zswap_pool *__zswap_pool_create_fallback(void) static void zswap_pool_destroy(struct zswap_pool *pool) { + int i; + zswap_pool_debug("destroying", pool); cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node); free_percpu(pool->acomp_ctx); - zpool_destroy_pool(pool->zpool); + for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) + zpool_destroy_pool(pool->zpools[i]); kfree(pool); } @@ -1073,7 +1096,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, struct page *page; struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; - struct zpool *pool = entry->pool->zpool; + struct zpool *pool = zswap_find_zpool(entry); u8 *src, *tmp = NULL; unsigned int dlen; @@ -1214,6 +1237,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, struct crypto_acomp_ctx *acomp_ctx; struct obj_cgroup *objcg = NULL; struct zswap_pool *pool; + struct zpool *zpool; int ret; unsigned int dlen = PAGE_SIZE; unsigned long handle, value; @@ -1324,10 +1348,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, } /* store */ + zpool = zswap_find_zpool(entry); gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; - if (zpool_malloc_support_movable(entry->pool->zpool)) + if (zpool_malloc_support_movable(zpool)) gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; - ret = zpool_malloc(entry->pool->zpool, dlen, gfp, &handle); + ret = zpool_malloc(zpool, dlen, gfp, &handle); if (ret == -ENOSPC) { zswap_reject_compress_poor++; goto put_dstmem; @@ -1336,9 +1361,9 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, zswap_reject_alloc_fail++; goto put_dstmem; } - buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_WO); + buf = zpool_map_handle(zpool, 
handle, ZPOOL_MM_WO); memcpy(buf, dst, dlen); - zpool_unmap_handle(entry->pool->zpool, handle); + zpool_unmap_handle(zpool, handle); mutex_unlock(acomp_ctx->mutex); /* populate entry */ @@ -1409,6 +1434,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; u8 *src, *dst, *tmp; + struct zpool *zpool; unsigned int dlen; int ret; @@ -1430,7 +1456,8 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, goto stats; } - if (!zpool_can_sleep_mapped(entry->pool->zpool)) { + zpool = zswap_find_zpool(entry); + if (!zpool_can_sleep_mapped(zpool)) { tmp = kmalloc(entry->length, GFP_KERNEL); if (!tmp) { ret = -ENOMEM; @@ -1440,12 +1467,12 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, /* decompress */ dlen = PAGE_SIZE; - src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO); + src = zpool_map_handle(zpool, entry->handle, ZPOOL_MM_RO); - if (!zpool_can_sleep_mapped(entry->pool->zpool)) { + if (!zpool_can_sleep_mapped(zpool)) { memcpy(tmp, src, entry->length); src = tmp; - zpool_unmap_handle(entry->pool->zpool, entry->handle); + zpool_unmap_handle(zpool, entry->handle); } acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); @@ -1457,8 +1484,8 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait); mutex_unlock(acomp_ctx->mutex); - if (zpool_can_sleep_mapped(entry->pool->zpool)) - zpool_unmap_handle(entry->pool->zpool, entry->handle); + if (zpool_can_sleep_mapped(zpool)) + zpool_unmap_handle(zpool, entry->handle); else kfree(tmp); @@ -1619,7 +1646,7 @@ static int zswap_setup(void) pool = __zswap_pool_create_fallback(); if (pool) { pr_info("loaded using pool %s/%s\n", pool->tfm_name, - zpool_get_type(pool->zpool)); + zpool_get_type(pool->zpools[0])); list_add(&pool->list, &zswap_pools); zswap_has_pool = true; } else { From 42c06a0e8ebe95b81e5fb41c6556ff22d9255b0c 
Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 17 Jul 2023 12:02:27 -0400 Subject: [PATCH 242/489] mm: kill frontswap The only user of frontswap is zswap, and has been for a long time. Have swap call into zswap directly and remove the indirection. [hannes@cmpxchg.org: remove obsolete comment, per Yosry] Link: https://lkml.kernel.org/r/20230719142832.GA932528@cmpxchg.org [fengwei.yin@intel.com: don't warn if none swapcache folio is passed to zswap_load] Link: https://lkml.kernel.org/r/20230810095652.3905184-1-fengwei.yin@intel.com Link: https://lkml.kernel.org/r/20230717160227.GA867137@cmpxchg.org Signed-off-by: Johannes Weiner Signed-off-by: Yin Fengwei Acked-by: Konrad Rzeszutek Wilk Acked-by: Nhat Pham Acked-by: Yosry Ahmed Acked-by: Christoph Hellwig Cc: Domenico Cerasuolo Cc: Matthew Wilcox (Oracle) Cc: Vitaly Wool Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/zswap.rst | 14 +- Documentation/mm/frontswap.rst | 264 ---------------- Documentation/mm/index.rst | 1 - .../translations/zh_CN/mm/frontswap.rst | 196 ------------ Documentation/translations/zh_CN/mm/index.rst | 1 - MAINTAINERS | 7 - fs/proc/meminfo.c | 1 + include/linux/frontswap.h | 91 ------ include/linux/swap.h | 9 - include/linux/swapfile.h | 5 - include/linux/zswap.h | 37 +++ mm/Kconfig | 4 - mm/Makefile | 1 - mm/frontswap.c | 283 ------------------ mm/page_io.c | 6 +- mm/swapfile.c | 33 +- mm/zswap.c | 159 ++++------ 17 files changed, 121 insertions(+), 991 deletions(-) delete mode 100644 Documentation/mm/frontswap.rst delete mode 100644 Documentation/translations/zh_CN/mm/frontswap.rst delete mode 100644 include/linux/frontswap.h create mode 100644 include/linux/zswap.h delete mode 100644 mm/frontswap.c diff --git a/Documentation/admin-guide/mm/zswap.rst b/Documentation/admin-guide/mm/zswap.rst index c5c2c7dbb15568..45b98390e938d0 100644 --- a/Documentation/admin-guide/mm/zswap.rst +++ b/Documentation/admin-guide/mm/zswap.rst @@ -49,7 +49,7 @@ 
compressed pool. Design ====== -Zswap receives pages for compression through the Frontswap API and is able to +Zswap receives pages for compression from the swap subsystem and is able to evict pages from its own compressed pool on an LRU basis and write them back to the backing swap device in the case that the compressed pool is full. @@ -70,19 +70,19 @@ means the compression ratio will always be 2:1 or worse (because of half-full zbud pages). The zsmalloc type zpool has a more complex compressed page storage method, and it can achieve greater storage densities. -When a swap page is passed from frontswap to zswap, zswap maintains a mapping +When a swap page is passed from swapout to zswap, zswap maintains a mapping of the swap entry, a combination of the swap type and swap offset, to the zpool handle that references that compressed swap page. This mapping is achieved with a red-black tree per swap type. The swap offset is the search key for the tree nodes. -During a page fault on a PTE that is a swap entry, frontswap calls the zswap -load function to decompress the page into the page allocated by the page fault -handler. +During a page fault on a PTE that is a swap entry, the swapin code calls the +zswap load function to decompress the page into the page allocated by the page +fault handler. Once there are no PTEs referencing a swap page stored in zswap (i.e. the count -in the swap_map goes to 0) the swap code calls the zswap invalidate function, -via frontswap, to free the compressed entry. +in the swap_map goes to 0) the swap code calls the zswap invalidate function +to free the compressed entry. Zswap seeks to be simple in its policies. 
Sysfs attributes allow for one user controlled policy: diff --git a/Documentation/mm/frontswap.rst b/Documentation/mm/frontswap.rst deleted file mode 100644 index c892412988af26..00000000000000 --- a/Documentation/mm/frontswap.rst +++ /dev/null @@ -1,264 +0,0 @@ -========= -Frontswap -========= - -Frontswap provides a "transcendent memory" interface for swap pages. -In some environments, dramatic performance savings may be obtained because -swapped pages are saved in RAM (or a RAM-like device) instead of a swap disk. - -.. _Transcendent memory in a nutshell: https://lwn.net/Articles/454795/ - -Frontswap is so named because it can be thought of as the opposite of -a "backing" store for a swap device. The storage is assumed to be -a synchronous concurrency-safe page-oriented "pseudo-RAM device" conforming -to the requirements of transcendent memory (such as Xen's "tmem", or -in-kernel compressed memory, aka "zcache", or future RAM-like devices); -this pseudo-RAM device is not directly accessible or addressable by the -kernel and is of unknown and possibly time-varying size. The driver -links itself to frontswap by calling frontswap_register_ops to set the -frontswap_ops funcs appropriately and the functions it provides must -conform to certain policies as follows: - -An "init" prepares the device to receive frontswap pages associated -with the specified swap device number (aka "type"). A "store" will -copy the page to transcendent memory and associate it with the type and -offset associated with the page. A "load" will copy the page, if found, -from transcendent memory into kernel memory, but will NOT remove the page -from transcendent memory. An "invalidate_page" will remove the page -from transcendent memory and an "invalidate_area" will remove ALL pages -associated with the swap type (e.g., like swapoff) and notify the "device" -to refuse further stores with that swap type. - -Once a page is successfully stored, a matching load on the page will normally -succeed. 
So when the kernel finds itself in a situation where it needs -to swap out a page, it first attempts to use frontswap. If the store returns -success, the data has been successfully saved to transcendent memory and -a disk write and, if the data is later read back, a disk read are avoided. -If a store returns failure, transcendent memory has rejected the data, and the -page can be written to swap as usual. - -Note that if a page is stored and the page already exists in transcendent memory -(a "duplicate" store), either the store succeeds and the data is overwritten, -or the store fails AND the page is invalidated. This ensures stale data may -never be obtained from frontswap. - -If properly configured, monitoring of frontswap is done via debugfs in -the `/sys/kernel/debug/frontswap` directory. The effectiveness of -frontswap can be measured (across all swap devices) with: - -``failed_stores`` - how many store attempts have failed - -``loads`` - how many loads were attempted (all should succeed) - -``succ_stores`` - how many store attempts have succeeded - -``invalidates`` - how many invalidates were attempted - -A backend implementation may provide additional metrics. - -FAQ -=== - -* Where's the value? - -When a workload starts swapping, performance falls through the floor. -Frontswap significantly increases performance in many such workloads by -providing a clean, dynamic interface to read and write swap pages to -"transcendent memory" that is otherwise not directly addressable to the kernel. -This interface is ideal when data is transformed to a different form -and size (such as with compression) or secretly moved (as might be -useful for write-balancing for some RAM-like devices). Swap pages (and -evicted page-cache pages) are a great use for this kind of slower-than-RAM- -but-much-faster-than-disk "pseudo-RAM device". 
- -Frontswap with a fairly small impact on the kernel, -provides a huge amount of flexibility for more dynamic, flexible RAM -utilization in various system configurations: - -In the single kernel case, aka "zcache", pages are compressed and -stored in local memory, thus increasing the total anonymous pages -that can be safely kept in RAM. Zcache essentially trades off CPU -cycles used in compression/decompression for better memory utilization. -Benchmarks have shown little or no impact when memory pressure is -low while providing a significant performance improvement (25%+) -on some workloads under high memory pressure. - -"RAMster" builds on zcache by adding "peer-to-peer" transcendent memory -support for clustered systems. Frontswap pages are locally compressed -as in zcache, but then "remotified" to another system's RAM. This -allows RAM to be dynamically load-balanced back-and-forth as needed, -i.e. when system A is overcommitted, it can swap to system B, and -vice versa. RAMster can also be configured as a memory server so -many servers in a cluster can swap, dynamically as needed, to a single -server configured with a large amount of RAM... without pre-configuring -how much of the RAM is available for each of the clients! - -In the virtual case, the whole point of virtualization is to statistically -multiplex physical resources across the varying demands of multiple -virtual machines. This is really hard to do with RAM and efforts to do -it well with no kernel changes have essentially failed (except in some -well-publicized special-case workloads). -Specifically, the Xen Transcendent Memory backend allows otherwise -"fallow" hypervisor-owned RAM to not only be "time-shared" between multiple -virtual machines, but the pages can be compressed and deduplicated to -optimize RAM utilization. And when guest OS's are induced to surrender -underutilized RAM (e.g. 
with "selfballooning"), sudden unexpected -memory pressure may result in swapping; frontswap allows those pages -to be swapped to and from hypervisor RAM (if overall host system memory -conditions allow), thus mitigating the potentially awful performance impact -of unplanned swapping. - -A KVM implementation is underway and has been RFC'ed to lkml. And, -using frontswap, investigation is also underway on the use of NVM as -a memory extension technology. - -* Sure there may be performance advantages in some situations, but - what's the space/time overhead of frontswap? - -If CONFIG_FRONTSWAP is disabled, every frontswap hook compiles into -nothingness and the only overhead is a few extra bytes per swapon'ed -swap device. If CONFIG_FRONTSWAP is enabled but no frontswap "backend" -registers, there is one extra global variable compared to zero for -every swap page read or written. If CONFIG_FRONTSWAP is enabled -AND a frontswap backend registers AND the backend fails every "store" -request (i.e. provides no memory despite claiming it might), -CPU overhead is still negligible -- and since every frontswap fail -precedes a swap page write-to-disk, the system is highly likely -to be I/O bound and using a small fraction of a percent of a CPU -will be irrelevant anyway. - -As for space, if CONFIG_FRONTSWAP is enabled AND a frontswap backend -registers, one bit is allocated for every swap page for every swap -device that is swapon'd. This is added to the EIGHT bits (which -was sixteen until about 2.6.34) that the kernel already allocates -for every swap page for every swap device that is swapon'd. (Hugh -Dickins has observed that frontswap could probably steal one of -the existing eight bits, but let's worry about that minor optimization -later.) For very large swap disks (which are rare) on a standard -4K pagesize, this is 1MB per 32GB swap. 
- -When swap pages are stored in transcendent memory instead of written -out to disk, there is a side effect that this may create more memory -pressure that can potentially outweigh the other advantages. A -backend, such as zcache, must implement policies to carefully (but -dynamically) manage memory limits to ensure this doesn't happen. - -* OK, how about a quick overview of what this frontswap patch does - in terms that a kernel hacker can grok? - -Let's assume that a frontswap "backend" has registered during -kernel initialization; this registration indicates that this -frontswap backend has access to some "memory" that is not directly -accessible by the kernel. Exactly how much memory it provides is -entirely dynamic and random. - -Whenever a swap-device is swapon'd frontswap_init() is called, -passing the swap device number (aka "type") as a parameter. -This notifies frontswap to expect attempts to "store" swap pages -associated with that number. - -Whenever the swap subsystem is readying a page to write to a swap -device (c.f swap_writepage()), frontswap_store is called. Frontswap -consults with the frontswap backend and if the backend says it does NOT -have room, frontswap_store returns -1 and the kernel swaps the page -to the swap device as normal. Note that the response from the frontswap -backend is unpredictable to the kernel; it may choose to never accept a -page, it could accept every ninth page, or it might accept every -page. But if the backend does accept a page, the data from the page -has already been copied and associated with the type and offset, -and the backend guarantees the persistence of the data. In this case, -frontswap sets a bit in the "frontswap_map" for the swap device -corresponding to the page offset on the swap device to which it would -otherwise have written the data. 
- -When the swap subsystem needs to swap-in a page (swap_readpage()), -it first calls frontswap_load() which checks the frontswap_map to -see if the page was earlier accepted by the frontswap backend. If -it was, the page of data is filled from the frontswap backend and -the swap-in is complete. If not, the normal swap-in code is -executed to obtain the page of data from the real swap device. - -So every time the frontswap backend accepts a page, a swap device read -and (potentially) a swap device write are replaced by a "frontswap backend -store" and (possibly) a "frontswap backend loads", which are presumably much -faster. - -* Can't frontswap be configured as a "special" swap device that is - just higher priority than any real swap device (e.g. like zswap, - or maybe swap-over-nbd/NFS)? - -No. First, the existing swap subsystem doesn't allow for any kind of -swap hierarchy. Perhaps it could be rewritten to accommodate a hierarchy, -but this would require fairly drastic changes. Even if it were -rewritten, the existing swap subsystem uses the block I/O layer which -assumes a swap device is fixed size and any page in it is linearly -addressable. Frontswap barely touches the existing swap subsystem, -and works around the constraints of the block I/O subsystem to provide -a great deal of flexibility and dynamicity. - -For example, the acceptance of any swap page by the frontswap backend is -entirely unpredictable. This is critical to the definition of frontswap -backends because it grants completely dynamic discretion to the -backend. In zcache, one cannot know a priori how compressible a page is. -"Poorly" compressible pages can be rejected, and "poorly" can itself be -defined dynamically depending on current memory constraints. - -Further, frontswap is entirely synchronous whereas a real swap -device is, by definition, asynchronous and uses block I/O. 
The -block I/O layer is not only unnecessary, but may perform "optimizations" -that are inappropriate for a RAM-oriented device including delaying -the write of some pages for a significant amount of time. Synchrony is -required to ensure the dynamicity of the backend and to avoid thorny race -conditions that would unnecessarily and greatly complicate frontswap -and/or the block I/O subsystem. That said, only the initial "store" -and "load" operations need be synchronous. A separate asynchronous thread -is free to manipulate the pages stored by frontswap. For example, -the "remotification" thread in RAMster uses standard asynchronous -kernel sockets to move compressed frontswap pages to a remote machine. -Similarly, a KVM guest-side implementation could do in-guest compression -and use "batched" hypercalls. - -In a virtualized environment, the dynamicity allows the hypervisor -(or host OS) to do "intelligent overcommit". For example, it can -choose to accept pages only until host-swapping might be imminent, -then force guests to do their own swapping. - -There is a downside to the transcendent memory specifications for -frontswap: Since any "store" might fail, there must always be a real -slot on a real swap device to swap the page. Thus frontswap must be -implemented as a "shadow" to every swapon'd device with the potential -capability of holding every page that the swap device might have held -and the possibility that it might hold no pages at all. This means -that frontswap cannot contain more pages than the total of swapon'd -swap devices. For example, if NO swap device is configured on some -installation, frontswap is useless. Swapless portable devices -can still use frontswap but a backend for such devices must configure -some kind of "ghost" swap device and ensure that it is never used. - -* Why this weird definition about "duplicate stores"? If a page - has been previously successfully stored, can't it always be - successfully overwritten? 
- -Nearly always it can, but no, sometimes it cannot. Consider an example -where data is compressed and the original 4K page has been compressed -to 1K. Now an attempt is made to overwrite the page with data that -is non-compressible and so would take the entire 4K. But the backend -has no more space. In this case, the store must be rejected. Whenever -frontswap rejects a store that would overwrite, it also must invalidate -the old data and ensure that it is no longer accessible. Since the -swap subsystem then writes the new data to the read swap device, -this is the correct course of action to ensure coherency. - -* Why does the frontswap patch create the new include file swapfile.h? - -The frontswap code depends on some swap-subsystem-internal data -structures that have, over the years, moved back and forth between -static and global. This seemed a reasonable compromise: Define -them as global but declare them in a new include file that isn't -included by the large number of source files that include swap.h. - -Dan Magenheimer, last updated April 9, 2012 diff --git a/Documentation/mm/index.rst b/Documentation/mm/index.rst index 5a94a921ea4048..31d2ac3064387b 100644 --- a/Documentation/mm/index.rst +++ b/Documentation/mm/index.rst @@ -44,7 +44,6 @@ above structured documentation, or deleted if it has served its purpose. balance damon/index free_page_reporting - frontswap hmm hwpoison hugetlbfs_reserv diff --git a/Documentation/translations/zh_CN/mm/frontswap.rst b/Documentation/translations/zh_CN/mm/frontswap.rst deleted file mode 100644 index 434975390b480c..00000000000000 --- a/Documentation/translations/zh_CN/mm/frontswap.rst +++ /dev/null @@ -1,196 +0,0 @@ -:Original: Documentation/mm/frontswap.rst - -:翻译: - - 司延腾 Yanteng Si - -:校译: - -========= -Frontswap -========= - -Frontswap为交换页提供了一个 “transcendent memory” 的接口。在一些环境中,由 -于交换页被保存在RAM(或类似RAM的设备)中,而不是交换磁盘,因此可以获得巨大的性能 -节省(提高)。 - -.. 
_Transcendent memory in a nutshell: https://lwn.net/Articles/454795/ - -Frontswap之所以这么命名,是因为它可以被认为是与swap设备的“back”存储相反。存 -储器被认为是一个同步并发安全的面向页面的“伪RAM设备”,符合transcendent memory -(如Xen的“tmem”,或内核内压缩内存,又称“zcache”,或未来的类似RAM的设备)的要 -求;这个伪RAM设备不能被内核直接访问或寻址,其大小未知且可能随时间变化。驱动程序通过 -调用frontswap_register_ops将自己与frontswap链接起来,以适当地设置frontswap_ops -的功能,它提供的功能必须符合某些策略,如下所示: - -一个 “init” 将设备准备好接收与指定的交换设备编号(又称“类型”)相关的frontswap -交换页。一个 “store” 将把该页复制到transcendent memory,并与该页的类型和偏移 -量相关联。一个 “load” 将把该页,如果找到的话,从transcendent memory复制到内核 -内存,但不会从transcendent memory中删除该页。一个 “invalidate_page” 将从 -transcendent memory中删除该页,一个 “invalidate_area” 将删除所有与交换类型 -相关的页(例如,像swapoff)并通知 “device” 拒绝进一步存储该交换类型。 - -一旦一个页面被成功存储,在该页面上的匹配加载通常会成功。因此,当内核发现自己处于需 -要交换页面的情况时,它首先尝试使用frontswap。如果存储的结果是成功的,那么数据就已 -经成功的保存到了transcendent memory中,并且避免了磁盘写入,如果后来再读回数据, -也避免了磁盘读取。如果存储返回失败,transcendent memory已经拒绝了该数据,且该页 -可以像往常一样被写入交换空间。 - -请注意,如果一个页面被存储,而该页面已经存在于transcendent memory中(一个 “重复” -的存储),要么存储成功,数据被覆盖,要么存储失败,该页面被废止。这确保了旧的数据永远 -不会从frontswap中获得。 - -如果配置正确,对frontswap的监控是通过 `/sys/kernel/debug/frontswap` 目录下的 -debugfs完成的。frontswap的有效性可以通过以下方式测量(在所有交换设备中): - -``failed_stores`` - 有多少次存储的尝试是失败的 - -``loads`` - 尝试了多少次加载(应该全部成功) - -``succ_stores`` - 有多少次存储的尝试是成功的 - -``invalidates`` - 尝试了多少次作废 - -后台实现可以提供额外的指标。 - -经常问到的问题 -============== - -* 价值在哪里? 
- -当一个工作负载开始交换时,性能就会下降。Frontswap通过提供一个干净的、动态的接口来 -读取和写入交换页到 “transcendent memory”,从而大大增加了许多这样的工作负载的性 -能,否则内核是无法直接寻址的。当数据被转换为不同的形式和大小(比如压缩)或者被秘密 -移动(对于一些类似RAM的设备来说,这可能对写平衡很有用)时,这个接口是理想的。交换 -页(和被驱逐的页面缓存页)是这种比RAM慢但比磁盘快得多的“伪RAM设备”的一大用途。 - -Frontswap对内核的影响相当小,为各种系统配置中更动态、更灵活的RAM利用提供了巨大的 -灵活性: - -在单一内核的情况下,又称“zcache”,页面被压缩并存储在本地内存中,从而增加了可以安 -全保存在RAM中的匿名页面总数。Zcache本质上是用压缩/解压缩的CPU周期换取更好的内存利 -用率。Benchmarks测试显示,当内存压力较低时,几乎没有影响,而在高内存压力下的一些 -工作负载上,则有明显的性能改善(25%以上)。 - -“RAMster” 在zcache的基础上增加了对集群系统的 “peer-to-peer” transcendent memory -的支持。Frontswap页面像zcache一样被本地压缩,但随后被“remotified” 到另一个系 -统的RAM。这使得RAM可以根据需要动态地来回负载平衡,也就是说,当系统A超载时,它可以 -交换到系统B,反之亦然。RAMster也可以被配置成一个内存服务器,因此集群中的许多服务器 -可以根据需要动态地交换到配置有大量内存的单一服务器上......而不需要预先配置每个客户 -有多少内存可用 - -在虚拟情况下,虚拟化的全部意义在于统计地将物理资源在多个虚拟机的不同需求之间进行复 -用。对于RAM来说,这真的很难做到,而且在不改变内核的情况下,要做好这一点的努力基本上 -是失败的(除了一些广为人知的特殊情况下的工作负载)。具体来说,Xen Transcendent Memory -后端允许管理器拥有的RAM “fallow”,不仅可以在多个虚拟机之间进行“time-shared”, -而且页面可以被压缩和重复利用,以优化RAM的利用率。当客户操作系统被诱导交出未充分利用 -的RAM时(如 “selfballooning”),突然出现的意外内存压力可能会导致交换;frontswap -允许这些页面被交换到管理器RAM中或从管理器RAM中交换(如果整体主机系统内存条件允许), -从而减轻计划外交换可能带来的可怕的性能影响。 - -一个KVM的实现正在进行中,并且已经被RFC'ed到lkml。而且,利用frontswap,对NVM作为 -内存扩展技术的调查也在进行中。 - -* 当然,在某些情况下可能有性能上的优势,但frontswap的空间/时间开销是多少? - -如果 CONFIG_FRONTSWAP 被禁用,每个 frontswap 钩子都会编译成空,唯一的开销是每 -个 swapon'ed swap 设备的几个额外字节。如果 CONFIG_FRONTSWAP 被启用,但没有 -frontswap的 “backend” 寄存器,每读或写一个交换页就会有一个额外的全局变量,而不 -是零。如果 CONFIG_FRONTSWAP 被启用,并且有一个frontswap的backend寄存器,并且 -后端每次 “store” 请求都失败(即尽管声称可能,但没有提供内存),CPU 的开销仍然可以 -忽略不计 - 因为每次frontswap失败都是在交换页写到磁盘之前,系统很可能是 I/O 绑定 -的,无论如何使用一小部分的 CPU 都是不相关的。 - -至于空间,如果CONFIG_FRONTSWAP被启用,并且有一个frontswap的backend注册,那么 -每个交换设备的每个交换页都会被分配一个比特。这是在内核已经为每个交换设备的每个交换 -页分配的8位(在2.6.34之前是16位)上增加的。(Hugh Dickins观察到,frontswap可能 -会偷取现有的8个比特,但是我们以后再来担心这个小的优化问题)。对于标准的4K页面大小的 -非常大的交换盘(这很罕见),这是每32GB交换盘1MB开销。 - -当交换页存储在transcendent memory中而不是写到磁盘上时,有一个副作用,即这可能会 -产生更多的内存压力,有可能超过其他的优点。一个backend,比如zcache,必须实现策略 -来仔细(但动态地)管理内存限制,以确保这种情况不会发生。 - -* 好吧,那就用内核骇客能理解的术语来快速概述一下这个frontswap补丁的作用如何? 
- -我们假设在内核初始化过程中,一个frontswap 的 “backend” 已经注册了;这个注册表 -明这个frontswap 的 “backend” 可以访问一些不被内核直接访问的“内存”。它到底提 -供了多少内存是完全动态和随机的。 - -每当一个交换设备被交换时,就会调用frontswap_init(),把交换设备的编号(又称“类 -型”)作为一个参数传给它。这就通知了frontswap,以期待 “store” 与该号码相关的交 -换页的尝试。 - -每当交换子系统准备将一个页面写入交换设备时(参见swap_writepage()),就会调用 -frontswap_store。Frontswap与frontswap backend协商,如果backend说它没有空 -间,frontswap_store返回-1,内核就会照常把页换到交换设备上。注意,来自frontswap -backend的响应对内核来说是不可预测的;它可能选择从不接受一个页面,可能接受每九个 -页面,也可能接受每一个页面。但是如果backend确实接受了一个页面,那么这个页面的数 -据已经被复制并与类型和偏移量相关联了,而且backend保证了数据的持久性。在这种情况 -下,frontswap在交换设备的“frontswap_map” 中设置了一个位,对应于交换设备上的 -页面偏移量,否则它就会将数据写入该设备。 - -当交换子系统需要交换一个页面时(swap_readpage()),它首先调用frontswap_load(), -检查frontswap_map,看这个页面是否早先被frontswap backend接受。如果是,该页 -的数据就会从frontswap后端填充,换入就完成了。如果不是,正常的交换代码将被执行, -以便从真正的交换设备上获得这一页的数据。 - -所以每次frontswap backend接受一个页面时,交换设备的读取和(可能)交换设备的写 -入都被 “frontswap backend store” 和(可能)“frontswap backend loads” -所取代,这可能会快得多。 - -* frontswap不能被配置为一个 “特殊的” 交换设备,它的优先级要高于任何真正的交换 - 设备(例如像zswap,或者可能是swap-over-nbd/NFS)? 
- -首先,现有的交换子系统不允许有任何种类的交换层次结构。也许它可以被重写以适应层次 -结构,但这将需要相当大的改变。即使它被重写,现有的交换子系统也使用了块I/O层,它 -假定交换设备是固定大小的,其中的任何页面都是可线性寻址的。Frontswap几乎没有触 -及现有的交换子系统,而是围绕着块I/O子系统的限制,提供了大量的灵活性和动态性。 - -例如,frontswap backend对任何交换页的接受是完全不可预测的。这对frontswap backend -的定义至关重要,因为它赋予了backend完全动态的决定权。在zcache中,人们无法预 -先知道一个页面的可压缩性如何。可压缩性 “差” 的页面会被拒绝,而 “差” 本身也可 -以根据当前的内存限制动态地定义。 - -此外,frontswap是完全同步的,而真正的交换设备,根据定义,是异步的,并且使用 -块I/O。块I/O层不仅是不必要的,而且可能进行 “优化”,这对面向RAM的设备来说是 -不合适的,包括将一些页面的写入延迟相当长的时间。同步是必须的,以确保后端的动 -态性,并避免棘手的竞争条件,这将不必要地大大增加frontswap和/或块I/O子系统的 -复杂性。也就是说,只有最初的 “store” 和 “load” 操作是需要同步的。一个独立 -的异步线程可以自由地操作由frontswap存储的页面。例如,RAMster中的 “remotification” -线程使用标准的异步内核套接字,将压缩的frontswap页面移动到远程机器。同样, -KVM的客户方实现可以进行客户内压缩,并使用 “batched” hypercalls。 - -在虚拟化环境中,动态性允许管理程序(或主机操作系统)做“intelligent overcommit”。 -例如,它可以选择只接受页面,直到主机交换可能即将发生,然后强迫客户机做他们 -自己的交换。 - -transcendent memory规格的frontswap有一个坏处。因为任何 “store” 都可 -能失败,所以必须在一个真正的交换设备上有一个真正的插槽来交换页面。因此, -frontswap必须作为每个交换设备的 “影子” 来实现,它有可能容纳交换设备可能 -容纳的每一个页面,也有可能根本不容纳任何页面。这意味着frontswap不能包含比 -swap设备总数更多的页面。例如,如果在某些安装上没有配置交换设备,frontswap -就没有用。无交换设备的便携式设备仍然可以使用frontswap,但是这种设备的 -backend必须配置某种 “ghost” 交换设备,并确保它永远不会被使用。 - - -* 为什么会有这种关于 “重复存储” 的奇怪定义?如果一个页面以前被成功地存储过, - 难道它不能总是被成功地覆盖吗? - -几乎总是可以的,不,有时不能。考虑一个例子,数据被压缩了,原来的4K页面被压 -缩到了1K。现在,有人试图用不可压缩的数据覆盖该页,因此会占用整个4K。但是 -backend没有更多的空间了。在这种情况下,这个存储必须被拒绝。每当frontswap -拒绝一个会覆盖的存储时,它也必须使旧的数据作废,并确保它不再被访问。因为交 -换子系统会把新的数据写到读交换设备上,这是确保一致性的正确做法。 - -* 为什么frontswap补丁会创建新的头文件swapfile.h? 
- -frontswap代码依赖于一些swap子系统内部的数据结构,这些数据结构多年来一直 -在静态和全局之间来回移动。这似乎是一个合理的妥协:将它们定义为全局,但在一 -个新的包含文件中声明它们,该文件不被包含swap.h的大量源文件所包含。 - -Dan Magenheimer,最后更新于2012年4月9日 diff --git a/Documentation/translations/zh_CN/mm/index.rst b/Documentation/translations/zh_CN/mm/index.rst index 2f53e37b80497f..b950dd118be73e 100644 --- a/Documentation/translations/zh_CN/mm/index.rst +++ b/Documentation/translations/zh_CN/mm/index.rst @@ -42,7 +42,6 @@ Linux内存管理文档 damon/index free_page_reporting ksm - frontswap hmm hwpoison hugetlbfs_reserv diff --git a/MAINTAINERS b/MAINTAINERS index 9e4cfcd7998a01..9f0179682d91ce 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8404,13 +8404,6 @@ F: Documentation/power/freezing-of-tasks.rst F: include/linux/freezer.h F: kernel/freezer.c -FRONTSWAP API -M: Konrad Rzeszutek Wilk -L: linux-kernel@vger.kernel.org -S: Maintained -F: include/linux/frontswap.h -F: mm/frontswap.c - FS-CACHE: LOCAL CACHING FOR NETWORK FILESYSTEMS M: David Howells L: linux-cachefs@redhat.com (moderated for non-subscribers) diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 8dca4d6d96c7c7..74e3c3815696a6 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -17,6 +17,7 @@ #ifdef CONFIG_CMA #include #endif +#include #include #include "internal.h" diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h deleted file mode 100644 index eaa0ac5f900303..00000000000000 --- a/include/linux/frontswap.h +++ /dev/null @@ -1,91 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_FRONTSWAP_H -#define _LINUX_FRONTSWAP_H - -#include -#include -#include -#include - -struct frontswap_ops { - void (*init)(unsigned); /* this swap type was just swapon'ed */ - int (*store)(unsigned, pgoff_t, struct page *); /* store a page */ - int (*load)(unsigned, pgoff_t, struct page *, bool *); /* load a page */ - void (*invalidate_page)(unsigned, pgoff_t); /* page no longer needed */ - void (*invalidate_area)(unsigned); /* swap type just swapoff'ed */ -}; - -int 
frontswap_register_ops(const struct frontswap_ops *ops); - -extern void frontswap_init(unsigned type, unsigned long *map); -extern int __frontswap_store(struct page *page); -extern int __frontswap_load(struct page *page); -extern void __frontswap_invalidate_page(unsigned, pgoff_t); -extern void __frontswap_invalidate_area(unsigned); - -#ifdef CONFIG_FRONTSWAP -extern struct static_key_false frontswap_enabled_key; - -static inline bool frontswap_enabled(void) -{ - return static_branch_unlikely(&frontswap_enabled_key); -} - -static inline void frontswap_map_set(struct swap_info_struct *p, - unsigned long *map) -{ - p->frontswap_map = map; -} - -static inline unsigned long *frontswap_map_get(struct swap_info_struct *p) -{ - return p->frontswap_map; -} -#else -/* all inline routines become no-ops and all externs are ignored */ - -static inline bool frontswap_enabled(void) -{ - return false; -} - -static inline void frontswap_map_set(struct swap_info_struct *p, - unsigned long *map) -{ -} - -static inline unsigned long *frontswap_map_get(struct swap_info_struct *p) -{ - return NULL; -} -#endif - -static inline int frontswap_store(struct page *page) -{ - if (frontswap_enabled()) - return __frontswap_store(page); - - return -1; -} - -static inline int frontswap_load(struct page *page) -{ - if (frontswap_enabled()) - return __frontswap_load(page); - - return -1; -} - -static inline void frontswap_invalidate_page(unsigned type, pgoff_t offset) -{ - if (frontswap_enabled()) - __frontswap_invalidate_page(type, offset); -} - -static inline void frontswap_invalidate_area(unsigned type) -{ - if (frontswap_enabled()) - __frontswap_invalidate_area(type); -} - -#endif /* _LINUX_FRONTSWAP_H */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 456546443f1f30..bb5adc6041448a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -302,10 +302,6 @@ struct swap_info_struct { struct file *swap_file; /* seldom referenced */ unsigned int old_block_size; /* seldom 
referenced */ struct completion comp; /* seldom referenced */ -#ifdef CONFIG_FRONTSWAP - unsigned long *frontswap_map; /* frontswap in-use, one bit per page */ - atomic_t frontswap_pages; /* frontswap pages in-use counter */ -#endif spinlock_t lock; /* * protect map scan related fields like * swap_map, lowest_bit, highest_bit, @@ -630,11 +626,6 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) } #endif -#ifdef CONFIG_ZSWAP -extern u64 zswap_pool_total_size; -extern atomic_t zswap_stored_pages; -#endif - #if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP) void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp); static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp) diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h index 7ed529a77c5b36..99e3ed469e8877 100644 --- a/include/linux/swapfile.h +++ b/include/linux/swapfile.h @@ -2,11 +2,6 @@ #ifndef _LINUX_SWAPFILE_H #define _LINUX_SWAPFILE_H -/* - * these were static in swapfile.c but frontswap.c needs them and we don't - * want to expose them to the dozens of source files that include swap.h - */ -extern struct swap_info_struct *swap_info[]; extern unsigned long generic_max_swapfile_size(void); unsigned long arch_max_swapfile_size(void); diff --git a/include/linux/zswap.h b/include/linux/zswap.h new file mode 100644 index 00000000000000..850c377d9b6df8 --- /dev/null +++ b/include/linux/zswap.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_ZSWAP_H +#define _LINUX_ZSWAP_H + +#include +#include + +extern u64 zswap_pool_total_size; +extern atomic_t zswap_stored_pages; + +#ifdef CONFIG_ZSWAP + +bool zswap_store(struct page *page); +bool zswap_load(struct page *page); +void zswap_invalidate(int type, pgoff_t offset); +void zswap_swapon(int type); +void zswap_swapoff(int type); + +#else + +static inline bool zswap_store(struct page *page) +{ + return false; +} + +static inline bool zswap_load(struct page *page) +{ + 
return false; +} + +static inline void zswap_invalidate(int type, pgoff_t offset) {} +static inline void zswap_swapon(int type) {} +static inline void zswap_swapoff(int type) {} + +#endif + +#endif /* _LINUX_ZSWAP_H */ diff --git a/mm/Kconfig b/mm/Kconfig index 1959d048bbf560..5fe49c030961ec 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -25,7 +25,6 @@ menuconfig SWAP config ZSWAP bool "Compressed cache for swap pages" depends on SWAP - select FRONTSWAP select CRYPTO select ZPOOL help @@ -873,9 +872,6 @@ config USE_PERCPU_NUMA_NODE_ID config HAVE_SETUP_PER_CPU_AREA bool -config FRONTSWAP - bool - config CMA bool "Contiguous Memory Allocator" depends on MMU diff --git a/mm/Makefile b/mm/Makefile index 678530a073261f..e6d9a1d5e84df1 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -72,7 +72,6 @@ ifdef CONFIG_MMU endif obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o swap_slots.o -obj-$(CONFIG_FRONTSWAP) += frontswap.o obj-$(CONFIG_ZSWAP) += zswap.o obj-$(CONFIG_HAS_DMA) += dmapool.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o diff --git a/mm/frontswap.c b/mm/frontswap.c deleted file mode 100644 index 2fb5df3384b8eb..00000000000000 --- a/mm/frontswap.c +++ /dev/null @@ -1,283 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Frontswap frontend - * - * This code provides the generic "frontend" layer to call a matching - * "backend" driver implementation of frontswap. See - * Documentation/mm/frontswap.rst for more information. - * - * Copyright (C) 2009-2012 Oracle Corp. All rights reserved. - * Author: Dan Magenheimer - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -DEFINE_STATIC_KEY_FALSE(frontswap_enabled_key); - -/* - * frontswap_ops are added by frontswap_register_ops, and provide the - * frontswap "backend" implementation functions. Multiple implementations - * may be registered, but implementations can never deregister. This - * is a simple singly-linked list of all registered implementations. 
- */ -static const struct frontswap_ops *frontswap_ops __read_mostly; - -#ifdef CONFIG_DEBUG_FS -/* - * Counters available via /sys/kernel/debug/frontswap (if debugfs is - * properly configured). These are for information only so are not protected - * against increment races. - */ -static u64 frontswap_loads; -static u64 frontswap_succ_stores; -static u64 frontswap_failed_stores; -static u64 frontswap_invalidates; - -static inline void inc_frontswap_loads(void) -{ - data_race(frontswap_loads++); -} -static inline void inc_frontswap_succ_stores(void) -{ - data_race(frontswap_succ_stores++); -} -static inline void inc_frontswap_failed_stores(void) -{ - data_race(frontswap_failed_stores++); -} -static inline void inc_frontswap_invalidates(void) -{ - data_race(frontswap_invalidates++); -} -#else -static inline void inc_frontswap_loads(void) { } -static inline void inc_frontswap_succ_stores(void) { } -static inline void inc_frontswap_failed_stores(void) { } -static inline void inc_frontswap_invalidates(void) { } -#endif - -/* - * Due to the asynchronous nature of the backends loading potentially - * _after_ the swap system has been activated, we have chokepoints - * on all frontswap functions to not call the backend until the backend - * has registered. - * - * This would not guards us against the user deciding to call swapoff right as - * we are calling the backend to initialize (so swapon is in action). - * Fortunately for us, the swapon_mutex has been taken by the callee so we are - * OK. The other scenario where calls to frontswap_store (called via - * swap_writepage) is racing with frontswap_invalidate_area (called via - * swapoff) is again guarded by the swap subsystem. - * - * While no backend is registered all calls to frontswap_[store|load| - * invalidate_area|invalidate_page] are ignored or fail. 
- * - * The time between the backend being registered and the swap file system - * calling the backend (via the frontswap_* functions) is indeterminate as - * frontswap_ops is not atomic_t (or a value guarded by a spinlock). - * That is OK as we are comfortable missing some of these calls to the newly - * registered backend. - * - * Obviously the opposite (unloading the backend) must be done after all - * the frontswap_[store|load|invalidate_area|invalidate_page] start - * ignoring or failing the requests. However, there is currently no way - * to unload a backend once it is registered. - */ - -/* - * Register operations for frontswap - */ -int frontswap_register_ops(const struct frontswap_ops *ops) -{ - if (frontswap_ops) - return -EINVAL; - - frontswap_ops = ops; - static_branch_inc(&frontswap_enabled_key); - return 0; -} - -/* - * Called when a swap device is swapon'd. - */ -void frontswap_init(unsigned type, unsigned long *map) -{ - struct swap_info_struct *sis = swap_info[type]; - - VM_BUG_ON(sis == NULL); - - /* - * p->frontswap is a bitmap that we MUST have to figure out which page - * has gone in frontswap. Without it there is no point of continuing. - */ - if (WARN_ON(!map)) - return; - /* - * Irregardless of whether the frontswap backend has been loaded - * before this function or it will be later, we _MUST_ have the - * p->frontswap set to something valid to work properly. 
- */ - frontswap_map_set(sis, map); - - if (!frontswap_enabled()) - return; - frontswap_ops->init(type); -} - -static bool __frontswap_test(struct swap_info_struct *sis, - pgoff_t offset) -{ - if (sis->frontswap_map) - return test_bit(offset, sis->frontswap_map); - return false; -} - -static inline void __frontswap_set(struct swap_info_struct *sis, - pgoff_t offset) -{ - set_bit(offset, sis->frontswap_map); - atomic_inc(&sis->frontswap_pages); -} - -static inline void __frontswap_clear(struct swap_info_struct *sis, - pgoff_t offset) -{ - clear_bit(offset, sis->frontswap_map); - atomic_dec(&sis->frontswap_pages); -} - -/* - * "Store" data from a page to frontswap and associate it with the page's - * swaptype and offset. Page must be locked and in the swap cache. - * If frontswap already contains a page with matching swaptype and - * offset, the frontswap implementation may either overwrite the data and - * return success or invalidate the page from frontswap and return failure. - */ -int __frontswap_store(struct page *page) -{ - int ret = -1; - swp_entry_t entry = { .val = page_private(page), }; - int type = swp_type(entry); - struct swap_info_struct *sis = swap_info[type]; - pgoff_t offset = swp_offset(entry); - - VM_BUG_ON(!frontswap_ops); - VM_BUG_ON(!PageLocked(page)); - VM_BUG_ON(sis == NULL); - - /* - * If a dup, we must remove the old page first; we can't leave the - * old page no matter if the store of the new page succeeds or fails, - * and we can't rely on the new page replacing the old page as we may - * not store to the same implementation that contains the old page. 
- */ - if (__frontswap_test(sis, offset)) { - __frontswap_clear(sis, offset); - frontswap_ops->invalidate_page(type, offset); - } - - ret = frontswap_ops->store(type, offset, page); - if (ret == 0) { - __frontswap_set(sis, offset); - inc_frontswap_succ_stores(); - } else { - inc_frontswap_failed_stores(); - } - - return ret; -} - -/* - * "Get" data from frontswap associated with swaptype and offset that were - * specified when the data was put to frontswap and use it to fill the - * specified page with data. Page must be locked and in the swap cache. - */ -int __frontswap_load(struct page *page) -{ - int ret = -1; - swp_entry_t entry = { .val = page_private(page), }; - int type = swp_type(entry); - struct swap_info_struct *sis = swap_info[type]; - pgoff_t offset = swp_offset(entry); - bool exclusive = false; - - VM_BUG_ON(!frontswap_ops); - VM_BUG_ON(!PageLocked(page)); - VM_BUG_ON(sis == NULL); - - if (!__frontswap_test(sis, offset)) - return -1; - - /* Try loading from each implementation, until one succeeds. */ - ret = frontswap_ops->load(type, offset, page, &exclusive); - if (ret == 0) { - inc_frontswap_loads(); - if (exclusive) { - SetPageDirty(page); - __frontswap_clear(sis, offset); - } - } - return ret; -} - -/* - * Invalidate any data from frontswap associated with the specified swaptype - * and offset so that a subsequent "get" will fail. - */ -void __frontswap_invalidate_page(unsigned type, pgoff_t offset) -{ - struct swap_info_struct *sis = swap_info[type]; - - VM_BUG_ON(!frontswap_ops); - VM_BUG_ON(sis == NULL); - - if (!__frontswap_test(sis, offset)) - return; - - frontswap_ops->invalidate_page(type, offset); - __frontswap_clear(sis, offset); - inc_frontswap_invalidates(); -} - -/* - * Invalidate all data from frontswap associated with all offsets for the - * specified swaptype. 
- */ -void __frontswap_invalidate_area(unsigned type) -{ - struct swap_info_struct *sis = swap_info[type]; - - VM_BUG_ON(!frontswap_ops); - VM_BUG_ON(sis == NULL); - - if (sis->frontswap_map == NULL) - return; - - frontswap_ops->invalidate_area(type); - atomic_set(&sis->frontswap_pages, 0); - bitmap_zero(sis->frontswap_map, sis->max); -} - -static int __init init_frontswap(void) -{ -#ifdef CONFIG_DEBUG_FS - struct dentry *root = debugfs_create_dir("frontswap", NULL); - if (root == NULL) - return -ENXIO; - debugfs_create_u64("loads", 0444, root, &frontswap_loads); - debugfs_create_u64("succ_stores", 0444, root, &frontswap_succ_stores); - debugfs_create_u64("failed_stores", 0444, root, - &frontswap_failed_stores); - debugfs_create_u64("invalidates", 0444, root, &frontswap_invalidates); -#endif - return 0; -} - -module_init(init_frontswap); diff --git a/mm/page_io.c b/mm/page_io.c index ff4156a44d5d72..5d0baba3578b2e 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -19,12 +19,12 @@ #include #include #include -#include #include #include #include #include #include +#include #include "swap.h" static void __end_swap_bio_write(struct bio *bio) @@ -195,7 +195,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) folio_unlock(folio); return ret; } - if (frontswap_store(&folio->page) == 0) { + if (zswap_store(&folio->page)) { folio_start_writeback(folio); folio_unlock(folio); folio_end_writeback(folio); @@ -512,7 +512,7 @@ void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) } delayacct_swapin_start(); - if (frontswap_load(page) == 0) { + if (zswap_load(page)) { SetPageUptodate(page); unlock_page(page); } else if (data_race(sis->flags & SWP_FS_OPS)) { diff --git a/mm/swapfile.c b/mm/swapfile.c index 346e22b8ae970c..e04eb9c0482db2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -35,13 +35,13 @@ #include #include #include -#include #include #include #include #include #include #include +#include #include #include @@ -95,7 +95,7 @@ 
static PLIST_HEAD(swap_active_head); static struct plist_head *swap_avail_heads; static DEFINE_SPINLOCK(swap_avail_lock); -struct swap_info_struct *swap_info[MAX_SWAPFILES]; +static struct swap_info_struct *swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); @@ -744,7 +744,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, swap_slot_free_notify = NULL; while (offset <= end) { arch_swap_invalidate_page(si->type, offset); - frontswap_invalidate_page(si->type, offset); + zswap_invalidate(si->type, offset); if (swap_slot_free_notify) swap_slot_free_notify(si->bdev, offset); offset++; @@ -2343,11 +2343,10 @@ static void _enable_swap_info(struct swap_info_struct *p) static void enable_swap_info(struct swap_info_struct *p, int prio, unsigned char *swap_map, - struct swap_cluster_info *cluster_info, - unsigned long *frontswap_map) + struct swap_cluster_info *cluster_info) { - if (IS_ENABLED(CONFIG_FRONTSWAP)) - frontswap_init(p->type, frontswap_map); + zswap_swapon(p->type); + spin_lock(&swap_lock); spin_lock(&p->lock); setup_swap_info(p, prio, swap_map, cluster_info); @@ -2390,7 +2389,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) struct swap_info_struct *p = NULL; unsigned char *swap_map; struct swap_cluster_info *cluster_info; - unsigned long *frontswap_map; struct file *swap_file, *victim; struct address_space *mapping; struct inode *inode; @@ -2515,12 +2513,10 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->swap_map = NULL; cluster_info = p->cluster_info; p->cluster_info = NULL; - frontswap_map = frontswap_map_get(p); spin_unlock(&p->lock); spin_unlock(&swap_lock); arch_swap_invalidate_area(p->type); - frontswap_invalidate_area(p->type); - frontswap_map_set(p, NULL); + zswap_swapoff(p->type); mutex_unlock(&swapon_mutex); free_percpu(p->percpu_cluster); p->percpu_cluster = NULL; @@ -2528,7 +2524,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->cluster_next_cpu = NULL; 
vfree(swap_map); kvfree(cluster_info); - kvfree(frontswap_map); /* Destroy swap account information */ swap_cgroup_swapoff(p->type); exit_swap_address_space(p->type); @@ -2995,7 +2990,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) unsigned long maxpages; unsigned char *swap_map = NULL; struct swap_cluster_info *cluster_info = NULL; - unsigned long *frontswap_map = NULL; struct page *page = NULL; struct inode *inode = NULL; bool inced_nr_rotate_swap = false; @@ -3135,11 +3129,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = nr_extents; goto bad_swap_unlock_inode; } - /* frontswap enabled? set up bit-per-page map for frontswap */ - if (IS_ENABLED(CONFIG_FRONTSWAP)) - frontswap_map = kvcalloc(BITS_TO_LONGS(maxpages), - sizeof(long), - GFP_KERNEL); if ((swap_flags & SWAP_FLAG_DISCARD) && p->bdev && bdev_max_discard_sectors(p->bdev)) { @@ -3192,16 +3181,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (swap_flags & SWAP_FLAG_PREFER) prio = (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; - enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map); + enable_swap_info(p, prio, swap_map, cluster_info); - pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", + pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s\n", p->pages<<(PAGE_SHIFT-10), name->name, p->prio, nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), (p->flags & SWP_SOLIDSTATE) ? "SS" : "", (p->flags & SWP_DISCARDABLE) ? "D" : "", (p->flags & SWP_AREA_DISCARD) ? "s" : "", - (p->flags & SWP_PAGE_DISCARD) ? "c" : "", - (frontswap_map) ? "FS" : ""); + (p->flags & SWP_PAGE_DISCARD) ? 
"c" : ""); mutex_unlock(&swapon_mutex); atomic_inc(&proc_poll_event); @@ -3231,7 +3219,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) spin_unlock(&swap_lock); vfree(swap_map); kvfree(cluster_info); - kvfree(frontswap_map); if (inced_nr_rotate_swap) atomic_dec(&nr_rotate_swap); if (swap_file) diff --git a/mm/zswap.c b/mm/zswap.c index 258e4e17799a02..be1b6417ef5c37 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -2,7 +2,7 @@ /* * zswap.c - zswap driver file * - * zswap is a backend for frontswap that takes pages that are in the process + * zswap is a cache that takes pages that are in the process * of being swapped out and attempts to compress and store them in a * RAM-based memory pool. This can result in a significant I/O reduction on * the swap device and, in the case where decompressing from RAM is faster @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -28,7 +27,7 @@ #include #include #include - +#include #include #include #include @@ -1084,7 +1083,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry, * * This can be thought of as a "resumed writeback" of the page * to the swap device. We are basically resuming the same swap - * writeback path that was intercepted with the frontswap_store() + * writeback path that was intercepted with the zswap_store() * in the first place. After the page has been decompressed into * the swap cache, the compressed version stored by zswap can be * freed. 
@@ -1224,13 +1223,11 @@ static void zswap_fill_page(void *ptr, unsigned long value) memset_l(page, value, PAGE_SIZE / sizeof(unsigned long)); } -/********************************* -* frontswap hooks -**********************************/ -/* attempts to compress and store an single page */ -static int zswap_frontswap_store(unsigned type, pgoff_t offset, - struct page *page) +bool zswap_store(struct page *page) { + swp_entry_t swp = { .val = page_private(page), }; + int type = swp_type(swp); + pgoff_t offset = swp_offset(swp); struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry, *dupentry; struct scatterlist input, output; @@ -1238,23 +1235,22 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, struct obj_cgroup *objcg = NULL; struct zswap_pool *pool; struct zpool *zpool; - int ret; unsigned int dlen = PAGE_SIZE; unsigned long handle, value; char *buf; u8 *src, *dst; gfp_t gfp; + int ret; + + VM_WARN_ON_ONCE(!PageLocked(page)); + VM_WARN_ON_ONCE(!PageSwapCache(page)); /* THP isn't supported */ - if (PageTransHuge(page)) { - ret = -EINVAL; - goto reject; - } + if (PageTransHuge(page)) + return false; - if (!zswap_enabled || !tree) { - ret = -ENODEV; - goto reject; - } + if (!zswap_enabled || !tree) + return false; /* * XXX: zswap reclaim does not work with cgroups yet. Without a @@ -1262,10 +1258,8 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, * local cgroup limits. 
*/ objcg = get_obj_cgroup_from_page(page); - if (objcg && !obj_cgroup_may_zswap(objcg)) { - ret = -ENOMEM; + if (objcg && !obj_cgroup_may_zswap(objcg)) goto reject; - } /* reclaim space if needed */ if (zswap_is_full()) { @@ -1275,10 +1269,9 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, } if (zswap_pool_reached_full) { - if (!zswap_can_accept()) { - ret = -ENOMEM; + if (!zswap_can_accept()) goto shrink; - } else + else zswap_pool_reached_full = false; } @@ -1286,7 +1279,6 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, entry = zswap_entry_cache_alloc(GFP_KERNEL); if (!entry) { zswap_reject_kmemcache_fail++; - ret = -ENOMEM; goto reject; } @@ -1303,17 +1295,13 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, kunmap_atomic(src); } - if (!zswap_non_same_filled_pages_enabled) { - ret = -EINVAL; + if (!zswap_non_same_filled_pages_enabled) goto freepage; - } /* if entry is successfully added, it keeps the reference */ entry->pool = zswap_pool_current_get(); - if (!entry->pool) { - ret = -EINVAL; + if (!entry->pool) goto freepage; - } /* compress */ acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); @@ -1333,19 +1321,17 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, * synchronous in fact. * Theoretically, acomp supports users send multiple acomp requests in one * acomp instance, then get those requests done simultaneously. but in this - * case, frontswap actually does store and load page by page, there is no + * case, zswap actually does store and load page by page, there is no * existing method to send the second page before the first page is done - * in one thread doing frontswap. + * in one thread doing zwap. * but in different threads running on different cpu, we have different * acomp instance, so multiple threads can do (de)compression in parallel. 
*/ ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait); dlen = acomp_ctx->req->dlen; - if (ret) { - ret = -EINVAL; + if (ret) goto put_dstmem; - } /* store */ zpool = zswap_find_zpool(entry); @@ -1381,15 +1367,12 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* map */ spin_lock(&tree->lock); - do { - ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry); - if (ret == -EEXIST) { - zswap_duplicate_entry++; - /* remove from rbtree */ - zswap_rb_erase(&tree->rbroot, dupentry); - zswap_entry_put(tree, dupentry); - } - } while (ret == -EEXIST); + while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) { + zswap_duplicate_entry++; + /* remove from rbtree */ + zswap_rb_erase(&tree->rbroot, dupentry); + zswap_entry_put(tree, dupentry); + } if (entry->length) { spin_lock(&entry->pool->lru_lock); list_add(&entry->lru, &entry->pool->lru); @@ -1402,7 +1385,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, zswap_update_total_size(); count_vm_event(ZSWPOUT); - return 0; + return true; put_dstmem: mutex_unlock(acomp_ctx->mutex); @@ -1412,23 +1395,20 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, reject: if (objcg) obj_cgroup_put(objcg); - return ret; + return false; shrink: pool = zswap_pool_last_get(); if (pool) queue_work(shrink_wq, &pool->shrink_work); - ret = -ENOMEM; goto reject; } -/* - * returns 0 if the page was successfully decompressed - * return -1 on entry not found or error -*/ -static int zswap_frontswap_load(unsigned type, pgoff_t offset, - struct page *page, bool *exclusive) +bool zswap_load(struct page *page) { + swp_entry_t swp = { .val = page_private(page), }; + int type = swp_type(swp); + pgoff_t offset = swp_offset(swp); struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry; struct scatterlist input, output; @@ -1436,15 +1416,16 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, u8 *src, *dst, *tmp; struct zpool *zpool; unsigned 
int dlen; - int ret; + bool ret; + + VM_WARN_ON_ONCE(!PageLocked(page)); /* find */ spin_lock(&tree->lock); entry = zswap_entry_find_get(&tree->rbroot, offset); if (!entry) { - /* entry was written back */ spin_unlock(&tree->lock); - return -1; + return false; } spin_unlock(&tree->lock); @@ -1452,7 +1433,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, dst = kmap_atomic(page); zswap_fill_page(dst, entry->value); kunmap_atomic(dst); - ret = 0; + ret = true; goto stats; } @@ -1460,7 +1441,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, if (!zpool_can_sleep_mapped(zpool)) { tmp = kmalloc(entry->length, GFP_KERNEL); if (!tmp) { - ret = -ENOMEM; + ret = false; goto freeentry; } } @@ -1481,7 +1462,8 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, sg_init_table(&output, 1); sg_set_page(&output, page, PAGE_SIZE, 0); acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen); - ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait); + if (crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait)) + WARN_ON(1); mutex_unlock(acomp_ctx->mutex); if (zpool_can_sleep_mapped(zpool)) @@ -1489,16 +1471,16 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, else kfree(tmp); - BUG_ON(ret); + ret = true; stats: count_vm_event(ZSWPIN); if (entry->objcg) count_objcg_event(entry->objcg, ZSWPIN); freeentry: spin_lock(&tree->lock); - if (!ret && zswap_exclusive_loads_enabled) { + if (ret && zswap_exclusive_loads_enabled) { zswap_invalidate_entry(tree, entry); - *exclusive = true; + SetPageDirty(page); } else if (entry->length) { spin_lock(&entry->pool->lru_lock); list_move(&entry->lru, &entry->pool->lru); @@ -1510,8 +1492,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, return ret; } -/* frees an entry in zswap */ -static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset) +void zswap_invalidate(int type, pgoff_t offset) { 
struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry; @@ -1528,8 +1509,22 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset) spin_unlock(&tree->lock); } -/* frees all zswap entries for the given swap type */ -static void zswap_frontswap_invalidate_area(unsigned type) +void zswap_swapon(int type) +{ + struct zswap_tree *tree; + + tree = kzalloc(sizeof(*tree), GFP_KERNEL); + if (!tree) { + pr_err("alloc failed, zswap disabled for swap type %d\n", type); + return; + } + + tree->rbroot = RB_ROOT; + spin_lock_init(&tree->lock); + zswap_trees[type] = tree; +} + +void zswap_swapoff(int type) { struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry, *n; @@ -1547,29 +1542,6 @@ static void zswap_frontswap_invalidate_area(unsigned type) zswap_trees[type] = NULL; } -static void zswap_frontswap_init(unsigned type) -{ - struct zswap_tree *tree; - - tree = kzalloc(sizeof(*tree), GFP_KERNEL); - if (!tree) { - pr_err("alloc failed, zswap disabled for swap type %d\n", type); - return; - } - - tree->rbroot = RB_ROOT; - spin_lock_init(&tree->lock); - zswap_trees[type] = tree; -} - -static const struct frontswap_ops zswap_frontswap_ops = { - .store = zswap_frontswap_store, - .load = zswap_frontswap_load, - .invalidate_page = zswap_frontswap_invalidate_page, - .invalidate_area = zswap_frontswap_invalidate_area, - .init = zswap_frontswap_init -}; - /********************************* * debugfs functions **********************************/ @@ -1658,16 +1630,11 @@ static int zswap_setup(void) if (!shrink_wq) goto fallback_fail; - ret = frontswap_register_ops(&zswap_frontswap_ops); - if (ret) - goto destroy_wq; if (zswap_debugfs_init()) pr_warn("debugfs initialization failed\n"); zswap_init_state = ZSWAP_INIT_SUCCEED; return 0; -destroy_wq: - destroy_workqueue(shrink_wq); fallback_fail: if (pool) zswap_pool_destroy(pool); From 34f4c198bfbe86612c368eb122002787acecaa93 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: 
Sat, 15 Jul 2023 05:23:40 +0100 Subject: [PATCH 243/489] zswap: make zswap_store() take a folio Patch series "Followup folio conversions for zswap". With frontswap killed, it's worth converting the zswap_load() and zswap_store() functions to take a folio instead of a page pointer. They aren't converted to support large folios, but there are a lot of unnecessary calls to compound_head() that are removed by these patches. This patch (of 4): Only convert a few easy parts of this function to use the folio passed in; convert back to struct page for the majority of it. This does remove a few hidden calls to compound_head(). Link: https://lkml.kernel.org/r/20230715042343.434588-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230715042343.434588-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Christoph Hellwig Cc: Domenico Cerasuolo Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Nhat Pham Cc: Vitaly Wool Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/zswap.h | 4 ++-- mm/page_io.c | 2 +- mm/zswap.c | 13 +++++++------ 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 850c377d9b6df8..9f318c8bc367fa 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -10,7 +10,7 @@ extern atomic_t zswap_stored_pages; #ifdef CONFIG_ZSWAP -bool zswap_store(struct page *page); +bool zswap_store(struct folio *folio); bool zswap_load(struct page *page); void zswap_invalidate(int type, pgoff_t offset); void zswap_swapon(int type); @@ -18,7 +18,7 @@ void zswap_swapoff(int type); #else -static inline bool zswap_store(struct page *page) +static inline bool zswap_store(struct folio *folio) { return false; } diff --git a/mm/page_io.c b/mm/page_io.c index 5d0baba3578b2e..ac89685b562bb2 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -195,7 +195,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) folio_unlock(folio); return ret; } - if 
(zswap_store(&folio->page)) { + if (zswap_store(folio)) { folio_start_writeback(folio); folio_unlock(folio); folio_end_writeback(folio); diff --git a/mm/zswap.c b/mm/zswap.c index be1b6417ef5c37..df3054e6a3a99e 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1223,11 +1223,12 @@ static void zswap_fill_page(void *ptr, unsigned long value) memset_l(page, value, PAGE_SIZE / sizeof(unsigned long)); } -bool zswap_store(struct page *page) +bool zswap_store(struct folio *folio) { - swp_entry_t swp = { .val = page_private(page), }; + swp_entry_t swp = folio_swap_entry(folio); int type = swp_type(swp); pgoff_t offset = swp_offset(swp); + struct page *page = &folio->page; struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry, *dupentry; struct scatterlist input, output; @@ -1242,11 +1243,11 @@ bool zswap_store(struct page *page) gfp_t gfp; int ret; - VM_WARN_ON_ONCE(!PageLocked(page)); - VM_WARN_ON_ONCE(!PageSwapCache(page)); + VM_WARN_ON_ONCE(!folio_test_locked(folio)); + VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); - /* THP isn't supported */ - if (PageTransHuge(page)) + /* Large folios aren't supported */ + if (folio_test_large(folio)) return false; if (!zswap_enabled || !tree) From 074e3e262adb02a7a42e32e0aadfb6b6cf416854 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 15 Jul 2023 05:23:41 +0100 Subject: [PATCH 244/489] memcg: convert get_obj_cgroup_from_page to get_obj_cgroup_from_folio As the one caller now has a folio, pass it in and use it. Removes three calls to compound_head(). 
Link: https://lkml.kernel.org/r/20230715042343.434588-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Christoph Hellwig Cc: Domenico Cerasuolo Cc: Johannes Weiner Cc: Nhat Pham Cc: Vitaly Wool Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 4 ++-- mm/memcontrol.c | 8 ++++---- mm/zswap.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 58eb7ca6569952..058fb748e12844 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1759,7 +1759,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order); void __memcg_kmem_uncharge_page(struct page *page, int order); struct obj_cgroup *get_obj_cgroup_from_current(void); -struct obj_cgroup *get_obj_cgroup_from_page(struct page *page); +struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio); int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size); void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size); @@ -1843,7 +1843,7 @@ static inline void __memcg_kmem_uncharge_page(struct page *page, int order) { } -static inline struct obj_cgroup *get_obj_cgroup_from_page(struct page *page) +static inline struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio) { return NULL; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 51772df1abc522..062d925336cbdb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3036,21 +3036,21 @@ __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) return objcg; } -struct obj_cgroup *get_obj_cgroup_from_page(struct page *page) +struct obj_cgroup *get_obj_cgroup_from_folio(struct folio *folio) { struct obj_cgroup *objcg; if (!memcg_kmem_online()) return NULL; - if (PageMemcgKmem(page)) { - objcg = __folio_objcg(page_folio(page)); + if (folio_memcg_kmem(folio)) { + objcg = __folio_objcg(folio); obj_cgroup_get(objcg); } else { struct mem_cgroup *memcg; rcu_read_lock(); - memcg = 
__folio_memcg(page_folio(page)); + memcg = __folio_memcg(folio); if (memcg) objcg = __get_obj_cgroup_from_memcg(memcg); else diff --git a/mm/zswap.c b/mm/zswap.c index df3054e6a3a99e..9df33298f2dca9 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1258,7 +1258,7 @@ bool zswap_store(struct folio *folio) * cgroup-aware entry LRU, we will push out entries system-wide based on * local cgroup limits. */ - objcg = get_obj_cgroup_from_page(page); + objcg = get_obj_cgroup_from_folio(folio); if (objcg && !obj_cgroup_may_zswap(objcg)) goto reject; From fbcec6a3a09b309900f1ecef8954721d93555abd Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 15 Jul 2023 05:23:42 +0100 Subject: [PATCH 245/489] swap: remove some calls to compound_head() in swap_readpage() Replace six implicit calls to compound_head() with one call to page_folio(). Link: https://lkml.kernel.org/r/20230715042343.434588-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Christoph Hellwig Cc: Domenico Cerasuolo Cc: Johannes Weiner Cc: Nhat Pham Cc: Vitaly Wool Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/page_io.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index ac89685b562bb2..e3d62c1a083471 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -492,14 +492,15 @@ static void swap_readpage_bdev_async(struct page *page, void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) { + struct folio *folio = page_folio(page); struct swap_info_struct *sis = page_swap_info(page); - bool workingset = PageWorkingset(page); + bool workingset = folio_test_workingset(folio); unsigned long pflags; bool in_thrashing; - VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page); - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(PageUptodate(page), page); + VM_BUG_ON_FOLIO(!folio_test_swapcache(folio) && !synchronous, folio); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + 
VM_BUG_ON_FOLIO(folio_test_uptodate(folio), folio); /* * Count submission time as memory stall and delay. When the device @@ -513,8 +514,8 @@ void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) delayacct_swapin_start(); if (zswap_load(page)) { - SetPageUptodate(page); - unlock_page(page); + folio_mark_uptodate(folio); + folio_unlock(folio); } else if (data_race(sis->flags & SWP_FS_OPS)) { swap_readpage_fs(page, plug); } else if (synchronous || (sis->flags & SWP_SYNCHRONOUS_IO)) { From ca54f6d89d60abf3e7dea68c95dfd442eeece212 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 15 Jul 2023 05:23:43 +0100 Subject: [PATCH 246/489] zswap: make zswap_load() take a folio Only convert a few easy parts of this function to use the folio passed in; convert back to struct page for the majority of it. Removes three hidden calls to compound_head(). Link: https://lkml.kernel.org/r/20230715042343.434588-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Christoph Hellwig Cc: Domenico Cerasuolo Cc: Johannes Weiner Cc: Nhat Pham Cc: Vitaly Wool Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/zswap.h | 4 ++-- mm/page_io.c | 2 +- mm/zswap.c | 9 +++++---- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 9f318c8bc367fa..2a60ce39cfde19 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -11,7 +11,7 @@ extern atomic_t zswap_stored_pages; #ifdef CONFIG_ZSWAP bool zswap_store(struct folio *folio); -bool zswap_load(struct page *page); +bool zswap_load(struct folio *folio); void zswap_invalidate(int type, pgoff_t offset); void zswap_swapon(int type); void zswap_swapoff(int type); @@ -23,7 +23,7 @@ static inline bool zswap_store(struct folio *folio) return false; } -static inline bool zswap_load(struct page *page) +static inline bool zswap_load(struct folio *folio) { return false; } diff --git a/mm/page_io.c b/mm/page_io.c index 
e3d62c1a083471..fe4c21af23f269 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -513,7 +513,7 @@ void swap_readpage(struct page *page, bool synchronous, struct swap_iocb **plug) } delayacct_swapin_start(); - if (zswap_load(page)) { + if (zswap_load(folio)) { folio_mark_uptodate(folio); folio_unlock(folio); } else if (data_race(sis->flags & SWP_FS_OPS)) { diff --git a/mm/zswap.c b/mm/zswap.c index 9df33298f2dca9..7cc4a2baa71322 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1405,11 +1405,12 @@ bool zswap_store(struct folio *folio) goto reject; } -bool zswap_load(struct page *page) +bool zswap_load(struct folio *folio) { - swp_entry_t swp = { .val = page_private(page), }; + swp_entry_t swp = folio_swap_entry(folio); int type = swp_type(swp); pgoff_t offset = swp_offset(swp); + struct page *page = &folio->page; struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry; struct scatterlist input, output; @@ -1419,7 +1420,7 @@ bool zswap_load(struct page *page) unsigned int dlen; bool ret; - VM_WARN_ON_ONCE(!PageLocked(page)); + VM_WARN_ON_ONCE(!folio_test_locked(folio)); /* find */ spin_lock(&tree->lock); @@ -1481,7 +1482,7 @@ bool zswap_load(struct page *page) spin_lock(&tree->lock); if (ret && zswap_exclusive_loads_enabled) { zswap_invalidate_entry(tree, entry); - SetPageDirty(page); + folio_mark_dirty(folio); } else if (entry->length) { spin_lock(&entry->pool->lru_lock); list_move(&entry->lru, &entry->pool->lru); From c0a5d93a885b42c22085ad0f39233795f4a578e0 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 18 Jul 2023 22:58:10 +0800 Subject: [PATCH 247/489] mm/page_ext: add common function to get client data from page_ext Patch series "add page_ext_data to get client data in page_ext". Current clients get data from page_ext by adding offset which is auto generated in page_ext core and exposes the data layout design inside page_ext core. This series adds a page_ext_data() to hide this from clients. Benefits include: 1. 
Future clients can call page_ext_data directly instead of defining a new function like get_page_owner to get the data. 2. There is no change to clients if the layout of page_ext data changes. This patch (of 3): Add common page_ext_data function to get client data. This could hide offset which is auto generated in page_ext core and expose the design of page_ext data layout. Link: https://lkml.kernel.org/r/20230718145812.1991717-1-shikemeng@huaweicloud.com Link: https://lkml.kernel.org/r/20230718145812.1991717-2-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Andrew Morton Acked-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- include/linux/page_ext.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index 67314f648aeba5..658d8d39a10e33 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -82,6 +82,12 @@ static inline void page_ext_init(void) extern struct page_ext *page_ext_get(struct page *page); extern void page_ext_put(struct page_ext *page_ext); +static inline void *page_ext_data(struct page_ext *page_ext, + struct page_ext_operations *ops) +{ + return (void *)(page_ext) + ops->offset; +} + static inline struct page_ext *page_ext_next(struct page_ext *curr) { void *next = curr; From d981e2804c92b505e76f44e66909f3ae805d3aa2 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 18 Jul 2023 22:58:11 +0800 Subject: [PATCH 248/489] mm/page_ext: use page_ext_data helper in page_table_check Use page_ext_data helper in page_table_check to avoid access offset directly.
Link: https://lkml.kernel.org/r/20230718145812.1991717-3-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Andrew Morton Acked-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- mm/page_table_check.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 84c8163984e5ce..46e77c12c81ecf 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -51,7 +51,7 @@ struct page_ext_operations page_table_check_ops = { static struct page_table_check *get_page_table_check(struct page_ext *page_ext) { BUG_ON(!page_ext); - return (void *)(page_ext) + page_table_check_ops.offset; + return page_ext_data(page_ext, &page_table_check_ops); } /* From 1cac4c0760ecd0c33b11b7b5c609264ea6bed5ed Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 18 Jul 2023 22:58:12 +0800 Subject: [PATCH 249/489] mm/page_ext: use page_ext_data helper in page_owner Use page_ext_data helper in page_owner to avoid access offset directly. Link: https://lkml.kernel.org/r/20230718145812.1991717-4-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Andrew Morton Acked-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- mm/page_owner.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index c93baef0148f18..4e2723e1b300d8 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -104,7 +104,7 @@ struct page_ext_operations page_owner_ops = { static inline struct page_owner *get_page_owner(struct page_ext *page_ext) { - return (void *)page_ext + page_owner_ops.offset; + return page_ext_data(page_ext, &page_owner_ops); } static noinline depot_stack_handle_t save_stack(gfp_t flags) From 68af05143fd4b49d4b12eab8d63c91ffbc7c4e5e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 26 Jul 2023 16:54:09 +0200 Subject: [PATCH 250/489] kernel/iomem.c: remove __weak ioremap_cache helper No portable code calls into this function any more, and on architectures that don't 
use or define their own, it causes a warning: kernel/iomem.c:10:22: warning: no previous prototype for 'ioremap_cache' [-Wmissing-prototypes] 10 | __weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size) Fold it into the only caller that uses it on architectures without the #define. Note that the fallback to ioremap is probably still wrong on those architectures, but this is what it's always done there. Link: https://lkml.kernel.org/r/20230726145432.1617809-1-arnd@kernel.org Signed-off-by: Arnd Bergmann Reviewed-by: Baoquan He Reviewed-by: Christoph Hellwig Signed-off-by: Andrew Morton --- kernel/iomem.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/kernel/iomem.c b/kernel/iomem.c index 9682471e647171..dc2120776e1c3d 100644 --- a/kernel/iomem.c +++ b/kernel/iomem.c @@ -5,18 +5,14 @@ #include #include -#ifndef ioremap_cache -/* temporary while we convert existing ioremap_cache users to memremap */ -__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size) -{ - return ioremap(offset, size); -} -#endif - #ifndef arch_memremap_wb static void *arch_memremap_wb(resource_size_t offset, unsigned long size) { +#ifdef ioremap_cache return (__force void *)ioremap_cache(offset, size); +#else + return (__force void *)ioremap(offset, size); +#endif } #endif From 56c67049c0ee135868e83ce36ac8d3e037e03f82 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 27 Jul 2023 12:22:23 -0400 Subject: [PATCH 251/489] mm: zswap: use zswap_invalidate_entry() for duplicates Patch series "mm: zswap: three cleanups". Three small cleanups to zswap, the first one suggested by Yosry during the frontswap removal. This patch (of 3): Minor cleanup. Instead of open-coding the tree deletion and the put, use the zswap_invalidate_entry() convenience helper. 
Link: https://lkml.kernel.org/r/20230727162343.1415598-1-hannes@cmpxchg.org Link: https://lkml.kernel.org/r/20230727162343.1415598-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Suggested-by: Yosry Ahmed Reviewed-by: Yosry Ahmed Cc: Domenico Cerasuolo Cc: Nhat Pham Cc: Barry Song Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton --- mm/zswap.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 7cc4a2baa71322..93707a1799b8a5 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1370,9 +1370,7 @@ bool zswap_store(struct folio *folio) spin_lock(&tree->lock); while (zswap_rb_insert(&tree->rbroot, entry, &dupentry) == -EEXIST) { zswap_duplicate_entry++; - /* remove from rbtree */ - zswap_rb_erase(&tree->rbroot, dupentry); - zswap_entry_put(tree, dupentry); + zswap_invalidate_entry(tree, dupentry); } if (entry->length) { spin_lock(&entry->pool->lru_lock); From 7310895779624a11153d74a2132f01be6e360b7c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 27 Jul 2023 12:22:24 -0400 Subject: [PATCH 252/489] mm: zswap: tighten up entry invalidation Removing a zswap entry from the tree is tied to an explicit operation that's supposed to drop the base reference: swap invalidation, exclusive load, duplicate store. Don't silently remove the entry on final put, but instead warn if an entry is in tree without reference. While in that diff context, convert a BUG_ON to a WARN_ON_ONCE. No need to crash on a refcount underflow. 
Link: https://lkml.kernel.org/r/20230727162343.1415598-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Yosry Ahmed Cc: Barry Song Cc: Domenico Cerasuolo Cc: Nhat Pham Cc: Seth Jennings Cc: Vitaly Wool Signed-off-by: Andrew Morton --- mm/zswap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 93707a1799b8a5..ea921b25c245b5 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -417,9 +417,9 @@ static void zswap_entry_put(struct zswap_tree *tree, { int refcount = --entry->refcount; - BUG_ON(refcount < 0); + WARN_ON_ONCE(refcount < 0); if (refcount == 0) { - zswap_rb_erase(&tree->rbroot, entry); + WARN_ON_ONCE(!RB_EMPTY_NODE(&entry->rbnode)); zswap_free_entry(entry); } } From 98804a944a63237814257dd149a5b04d6b93489c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 27 Jul 2023 12:22:25 -0400 Subject: [PATCH 253/489] mm: zswap: kill zswap_get_swap_cache_page() The __read_swap_cache_async() interface isn't more difficult to understand than what the helper abstracts. Save the indirection and a level of indentation for the primary work of the writeback func. 
Link: https://lkml.kernel.org/r/20230727162343.1415598-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Yosry Ahmed Cc: Vitaly Wool Cc: Barry Song Cc: Seth Jennings Cc: Domenico Cerasuolo Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/zswap.c | 142 ++++++++++++++++++++--------------------------------- 1 file changed, 53 insertions(+), 89 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index ea921b25c245b5..8b6b1bc8a5f2f2 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1039,43 +1039,6 @@ static int zswap_enabled_param_set(const char *val, /********************************* * writeback code **********************************/ -/* return enum for zswap_get_swap_cache_page */ -enum zswap_get_swap_ret { - ZSWAP_SWAPCACHE_NEW, - ZSWAP_SWAPCACHE_EXIST, - ZSWAP_SWAPCACHE_FAIL, -}; - -/* - * zswap_get_swap_cache_page - * - * This is an adaption of read_swap_cache_async() - * - * This function tries to find a page with the given swap entry - * in the swapper_space address space (the swap cache). If the page - * is found, it is returned in retpage. Otherwise, a page is allocated, - * added to the swap cache, and returned in retpage. 
- * - * If success, the swap cache page is returned in retpage - * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache - * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated, - * the new page is added to swapcache and locked - * Returns ZSWAP_SWAPCACHE_FAIL on error - */ -static int zswap_get_swap_cache_page(swp_entry_t entry, - struct page **retpage) -{ - bool page_was_allocated; - - *retpage = __read_swap_cache_async(entry, GFP_KERNEL, - NULL, 0, &page_was_allocated); - if (page_was_allocated) - return ZSWAP_SWAPCACHE_NEW; - if (!*retpage) - return ZSWAP_SWAPCACHE_FAIL; - return ZSWAP_SWAPCACHE_EXIST; -} - /* * Attempts to free an entry by adding a page to the swap cache, * decompressing the entry data into the page, and issuing a @@ -1096,7 +1059,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; struct zpool *pool = zswap_find_zpool(entry); - + bool page_was_allocated; u8 *src, *tmp = NULL; unsigned int dlen; int ret; @@ -1111,65 +1074,66 @@ static int zswap_writeback_entry(struct zswap_entry *entry, } /* try to allocate swap cache page */ - switch (zswap_get_swap_cache_page(swpentry, &page)) { - case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */ + page = __read_swap_cache_async(swpentry, GFP_KERNEL, NULL, 0, + &page_was_allocated); + if (!page) { ret = -ENOMEM; goto fail; + } - case ZSWAP_SWAPCACHE_EXIST: - /* page is already in the swap cache, ignore for now */ + /* Found an existing page, we raced with load/swapin */ + if (!page_was_allocated) { put_page(page); ret = -EEXIST; goto fail; + } - case ZSWAP_SWAPCACHE_NEW: /* page is locked */ - /* - * Having a local reference to the zswap entry doesn't exclude - * swapping from invalidating and recycling the swap slot. Once - * the swapcache is secured against concurrent swapping to and - * from the slot, recheck that the entry is still current before - * writing. 
- */ - spin_lock(&tree->lock); - if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) { - spin_unlock(&tree->lock); - delete_from_swap_cache(page_folio(page)); - ret = -ENOMEM; - goto fail; - } + /* + * Page is locked, and the swapcache is now secured against + * concurrent swapping to and from the slot. Verify that the + * swap entry hasn't been invalidated and recycled behind our + * backs (our zswap_entry reference doesn't prevent that), to + * avoid overwriting a new swap page with old compressed data. + */ + spin_lock(&tree->lock); + if (zswap_rb_search(&tree->rbroot, swp_offset(entry->swpentry)) != entry) { spin_unlock(&tree->lock); + delete_from_swap_cache(page_folio(page)); + ret = -ENOMEM; + goto fail; + } + spin_unlock(&tree->lock); - /* decompress */ - acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); - dlen = PAGE_SIZE; + /* decompress */ + acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); + dlen = PAGE_SIZE; - src = zpool_map_handle(pool, entry->handle, ZPOOL_MM_RO); - if (!zpool_can_sleep_mapped(pool)) { - memcpy(tmp, src, entry->length); - src = tmp; - zpool_unmap_handle(pool, entry->handle); - } + src = zpool_map_handle(pool, entry->handle, ZPOOL_MM_RO); + if (!zpool_can_sleep_mapped(pool)) { + memcpy(tmp, src, entry->length); + src = tmp; + zpool_unmap_handle(pool, entry->handle); + } - mutex_lock(acomp_ctx->mutex); - sg_init_one(&input, src, entry->length); - sg_init_table(&output, 1); - sg_set_page(&output, page, PAGE_SIZE, 0); - acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen); - ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait); - dlen = acomp_ctx->req->dlen; - mutex_unlock(acomp_ctx->mutex); - - if (!zpool_can_sleep_mapped(pool)) - kfree(tmp); - else - zpool_unmap_handle(pool, entry->handle); + mutex_lock(acomp_ctx->mutex); + sg_init_one(&input, src, entry->length); + sg_init_table(&output, 1); + sg_set_page(&output, page, PAGE_SIZE, 0); + 
acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen); + ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait); + dlen = acomp_ctx->req->dlen; + mutex_unlock(acomp_ctx->mutex); + + if (!zpool_can_sleep_mapped(pool)) + kfree(tmp); + else + zpool_unmap_handle(pool, entry->handle); - BUG_ON(ret); - BUG_ON(dlen != PAGE_SIZE); + BUG_ON(ret); + BUG_ON(dlen != PAGE_SIZE); - /* page is up to date */ - SetPageUptodate(page); - } + /* page is up to date */ + SetPageUptodate(page); /* move it to the tail of the inactive list after end_writeback */ SetPageReclaim(page); @@ -1180,16 +1144,16 @@ static int zswap_writeback_entry(struct zswap_entry *entry, zswap_written_back_pages++; return ret; + fail: if (!zpool_can_sleep_mapped(pool)) kfree(tmp); /* - * if we get here due to ZSWAP_SWAPCACHE_EXIST - * a load may be happening concurrently. - * it is safe and okay to not free the entry. - * it is also okay to return !0 - */ + * If we get here because the page is already in swapcache, a + * load may be happening concurrently. It is safe and okay to + * not free the entry. It is also okay to return !0. + */ return ret; } From 5d241789dfe1c0e5c9b4eb4ae6e48590964b4976 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 27 Jul 2023 19:59:34 +0800 Subject: [PATCH 254/489] mm/memcg: fix obsolete function name in mem_cgroup_protection() Commit 45c7f7e1ef17 ("mm, memcg: decouple e{low,min} state mutations from protection checks") changed the function name but not the corresponding comment. 
Link: https://lkml.kernel.org/r/20230727115934.657787-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Michal Hocko Cc: Roman Gushchin Cc: Johannes Weiner Cc: Shakeel Butt Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 058fb748e12844..419e001a02e401 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -582,7 +582,7 @@ static inline void mem_cgroup_protection(struct mem_cgroup *root, /* * There is no reclaim protection applied to a targeted reclaim. * We are special casing this specific case here because - * mem_cgroup_protected calculation is not robust enough to keep + * mem_cgroup_calculate_protection is not robust enough to keep * the protection invariant for calculated effective values for * parallel reclaimers with different reclaim target. This is * especially a problem for tail memcgs (as they have pages on LRU) From 6e412203eeae68b599fb0a0722961e68f90322df Mon Sep 17 00:00:00 2001 From: Yang Li Date: Thu, 27 Jul 2023 09:55:58 +0800 Subject: [PATCH 255/489] mm/memory.c: fix some kernel-doc comments Add description of @mas and @tree_end, remove @mt in unmap_vmas(). 
to silence the warnings: mm/memory.c:1837: warning: Function parameter or member 'mas' not described in 'unmap_vmas' mm/memory.c:1837: warning: Function parameter or member 'tree_end' not described in 'unmap_vmas' mm/memory.c:1837: warning: Excess function parameter 'mt' description in 'unmap_vmas' Link: https://lkml.kernel.org/r/20230727015558.69554-1-yang.lee@linux.alibaba.com Signed-off-by: Yang Li Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=5996 Cc: Liam Howlett Signed-off-by: Andrew Morton --- mm/memory.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index f06266464208df..1113ee625a94f5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1693,10 +1693,11 @@ static void unmap_single_vma(struct mmu_gather *tlb, /** * unmap_vmas - unmap a range of memory covered by a list of vma's * @tlb: address of the caller's struct mmu_gather - * @mt: the maple tree + * @mas: the maple state * @vma: the starting vma * @start_addr: virtual address at which to start unmapping * @end_addr: virtual address at which to end unmapping + * @tree_end: The maximum index to check * @mm_wr_locked: lock flag * * Unmap all pages in the vma list. From 5d7800d9cb9ad1cc98c414036f03cc029508d1c5 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Thu, 27 Jul 2023 09:16:10 +0800 Subject: [PATCH 256/489] mm: kmsan: use helper function page_size() Patch series "minor cleanups for kmsan". Use helper function and macros to improve code readability. No functional modification involved. This patch (of 3): Use function page_size() to improve code readability. No functional modification involved. 
Link: https://lkml.kernel.org/r/20230727011612.2721843-1-zhangpeng362@huawei.com Link: https://lkml.kernel.org/r/20230727011612.2721843-2-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Kefeng Wang Cc: Marco Elver Cc: Nanyong Sun Signed-off-by: Andrew Morton --- mm/kmsan/hooks.c | 2 +- mm/kmsan/shadow.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index ec0da72e65aa09..4e3c3e60ba9710 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -117,7 +117,7 @@ void kmsan_kfree_large(const void *ptr) page = virt_to_head_page((void *)ptr); KMSAN_WARN_ON(ptr != page_address(page)); kmsan_internal_poison_memory((void *)ptr, - PAGE_SIZE << compound_order(page), + page_size(page), GFP_KERNEL, KMSAN_POISON_CHECK | KMSAN_POISON_FREE); kmsan_leave_runtime(); diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index b8bb95eea5e3de..c7de991f6d7ffe 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -210,7 +210,7 @@ void kmsan_free_page(struct page *page, unsigned int order) return; kmsan_enter_runtime(); kmsan_internal_poison_memory(page_address(page), - PAGE_SIZE << compound_order(page), + page_size(page), GFP_KERNEL, KMSAN_POISON_CHECK | KMSAN_POISON_FREE); kmsan_leave_runtime(); From 4852a80524937db8215406cf2b2431b14f320000 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Thu, 27 Jul 2023 09:16:11 +0800 Subject: [PATCH 257/489] mm: kmsan: use helper macro offset_in_page() Use helper macro offset_in_page() to improve code readability. No functional modification involved. 
Link: https://lkml.kernel.org/r/20230727011612.2721843-3-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Kefeng Wang Cc: Marco Elver Cc: Nanyong Sun Signed-off-by: Andrew Morton --- mm/kmsan/hooks.c | 2 +- mm/kmsan/shadow.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 4e3c3e60ba9710..5d6e2dee5692a3 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -339,7 +339,7 @@ void kmsan_handle_dma(struct page *page, size_t offset, size_t size, * internal KMSAN checks. */ while (size > 0) { - page_offset = addr % PAGE_SIZE; + page_offset = offset_in_page(addr); to_go = min(PAGE_SIZE - page_offset, (u64)size); kmsan_handle_dma_page((void *)addr, to_go, dir); addr += to_go; diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index c7de991f6d7ffe..966994268a017e 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -145,7 +145,7 @@ void *kmsan_get_metadata(void *address, bool is_origin) return NULL; if (!page_has_metadata(page)) return NULL; - off = addr % PAGE_SIZE; + off = offset_in_page(addr); return (is_origin ? origin_ptr_for(page) : shadow_ptr_for(page)) + off; } From 108c3dc6cd3dbe36824469b7ea860337978e0439 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Thu, 27 Jul 2023 09:16:12 +0800 Subject: [PATCH 258/489] mm: kmsan: use helper macros PAGE_ALIGN and PAGE_ALIGN_DOWN Use helper macros PAGE_ALIGN and PAGE_ALIGN_DOWN to improve code readability. No functional modification involved. 
Link: https://lkml.kernel.org/r/20230727011612.2721843-4-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Kefeng Wang Cc: Marco Elver Cc: Nanyong Sun Signed-off-by: Andrew Morton --- mm/kmsan/shadow.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index 966994268a017e..87318f9170f197 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -281,8 +281,8 @@ void __init kmsan_init_alloc_meta_for_range(void *start, void *end) struct page *page; u64 size; - start = (void *)ALIGN_DOWN((u64)start, PAGE_SIZE); - size = ALIGN((u64)end - (u64)start, PAGE_SIZE); + start = (void *)PAGE_ALIGN_DOWN((u64)start); + size = PAGE_ALIGN((u64)end - (u64)start); shadow = memblock_alloc(size, PAGE_SIZE); origin = memblock_alloc(size, PAGE_SIZE); for (u64 addr = 0; addr < size; addr += PAGE_SIZE) { From 866ff80176aa1f9c0ba65f2164cf608c5cde4851 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 27 Jul 2023 21:39:44 +0100 Subject: [PATCH 259/489] mm: improve the comment in isolate_migratepages_block() A recent patch shows that not everybody understands that "stabilise the mapping" really means "prevent the mapping from being freed", so change the wording to hopefully make that more clear. Link: https://lkml.kernel.org/r/ZMLWEB4m3zvX6SBN@casper.infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton --- mm/compaction.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index e6ac0ef4c178f1..c4d3a3129fd535 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1100,13 +1100,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, bool migrate_dirty; /* - * Only pages without mappings or that have a - * ->migrate_folio callback are possible to migrate - * without blocking. 
However, we can be racing with - * truncation so it's necessary to lock the page - * to stabilise the mapping as truncation holds - * the page lock until after the page is removed - * from the page cache. + * Only folios without mappings or that have + * a ->migrate_folio callback are possible to + * migrate without blocking. However, we may + * be racing with truncation, which can free + * the mapping. Truncation holds the folio lock + * until after the folio is removed from the page + * cache so holding it ourselves is sufficient. */ if (!folio_trylock(folio)) goto isolate_fail_put; From e7ee3f9791f5601fc032b222a70a02b9798784be Mon Sep 17 00:00:00 2001 From: Levi Yun Date: Fri, 28 Jul 2023 06:21:57 +0900 Subject: [PATCH 260/489] damon: use pmdp_get instead of directly dereferencing pmd Like ptep_get, use the pmdp_get wrapper when accessing pmdval instead of directly dereferencing pmd. Link: https://lkml.kernel.org/r/20230727212157.2985025-1-ppbuk5246@gmail.com Signed-off-by: Levi Yun Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/ops-common.c | 2 +- mm/damon/paddr.c | 2 +- mm/damon/vaddr.c | 23 +++++++++++++++-------- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index e940802a15a412..ac1c3fa80f9847 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -54,7 +54,7 @@ void damon_ptep_mkold(pte_t *pte, struct vm_area_struct *vma, unsigned long addr void damon_pmdp_mkold(pmd_t *pmd, struct vm_area_struct *vma, unsigned long addr) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - struct folio *folio = damon_get_folio(pmd_pfn(*pmd)); + struct folio *folio = damon_get_folio(pmd_pfn(pmdp_get(pmd))); if (!folio) return; diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 40801e38fcf0e5..909db25efb35ee 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -94,7 +94,7 @@ static bool __damon_pa_young(struct folio *folio, struct vm_area_struct *vma, mmu_notifier_test_young(vma->vm_mm, 
addr); } else { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - *accessed = pmd_young(*pvmw.pmd) || + *accessed = pmd_young(pmdp_get(pvmw.pmd)) || !folio_test_idle(folio) || mmu_notifier_test_young(vma->vm_mm, addr); #else diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 2fcc9731528ac0..d01cc46f4bf420 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -301,16 +301,19 @@ static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { pte_t *pte; + pmd_t pmde; spinlock_t *ptl; - if (pmd_trans_huge(*pmd)) { + if (pmd_trans_huge(pmdp_get(pmd))) { ptl = pmd_lock(walk->mm, pmd); - if (!pmd_present(*pmd)) { + pmde = pmdp_get(pmd); + + if (!pmd_present(pmde)) { spin_unlock(ptl); return 0; } - if (pmd_trans_huge(*pmd)) { + if (pmd_trans_huge(pmde)) { damon_pmdp_mkold(pmd, walk->vma, addr); spin_unlock(ptl); return 0; @@ -439,21 +442,25 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, struct damon_young_walk_private *priv = walk->private; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (pmd_trans_huge(*pmd)) { + if (pmd_trans_huge(pmdp_get(pmd))) { + pmd_t pmde; + ptl = pmd_lock(walk->mm, pmd); - if (!pmd_present(*pmd)) { + pmde = pmdp_get(pmd); + + if (!pmd_present(pmde)) { spin_unlock(ptl); return 0; } - if (!pmd_trans_huge(*pmd)) { + if (!pmd_trans_huge(pmde)) { spin_unlock(ptl); goto regular_page; } - folio = damon_get_folio(pmd_pfn(*pmd)); + folio = damon_get_folio(pmd_pfn(pmde)); if (!folio) goto huge_out; - if (pmd_young(*pmd) || !folio_test_idle(folio) || + if (pmd_young(pmde) || !folio_test_idle(folio) || mmu_notifier_test_young(walk->mm, addr)) priv->young = true; From c456832e6a8d8bcdfde4e8b3f66895aaffbd2832 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Mon, 17 Jul 2023 19:32:25 +0800 Subject: [PATCH 261/489] mm/page_poison: remove unused page_ext.h from page_poison Patch series "minor cleanups to page_ext header". No page_ext function or structure is used in page_poison. 
Just remove page_ext header from page_poison. Link: https://lkml.kernel.org/r/20230717113227.1897173-1-shikemeng@huaweicloud.com Link: https://lkml.kernel.org/r/20230717113227.1897173-2-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Signed-off-by: Andrew Morton --- mm/page_poison.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/page_poison.c b/mm/page_poison.c index 98438985e1ed9c..b4f456437b7e5d 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include From c6493f4bd789eafc918a5e210e80256e2284a7c0 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Mon, 17 Jul 2023 19:32:26 +0800 Subject: [PATCH 262/489] mm/vmstat: remove unused page_ext.h from vmstat No page_ext function or structure is used in vmstat. Just remove page_ext header from vmstat. Link: https://lkml.kernel.org/r/20230717113227.1897173-3-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Signed-off-by: Andrew Morton --- mm/vmstat.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index b731d57996c522..00e81e99c6ee24 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include From 67311a36e5e1e56dfa7264c93db759b18217e114 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Mon, 17 Jul 2023 19:32:27 +0800 Subject: [PATCH 263/489] mm/page_ext: move page_ext_operations definition under CONFIG_PAGE_EXTENSION page_ext_operations should only be defined when CONFIG_PAGE_EXTENSION is enabled. Besides, this may detect missing reliance on CONFIG_PAGE_EXTENSION from future Page Extension clients at compile time. 
Link: https://lkml.kernel.org/r/20230717113227.1897173-4-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Signed-off-by: Andrew Morton --- include/linux/page_ext.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index 658d8d39a10e33..be98564191e676 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -8,6 +8,7 @@ struct pglist_data; +#ifdef CONFIG_PAGE_EXTENSION /** * struct page_ext_operations - per page_ext client operations * @offset: Offset to the client's data within page_ext. Offset is returned to @@ -29,8 +30,6 @@ struct page_ext_operations { bool need_shared_flags; }; -#ifdef CONFIG_PAGE_EXTENSION - /* * The page_ext_flags users must set need_shared_flags to true. */ From edb72f4e4fc2a41bb2dc313ac377d16b78d61dea Mon Sep 17 00:00:00 2001 From: Ayush Jain Date: Fri, 28 Jul 2023 22:11:02 +0530 Subject: [PATCH 264/489] selftests: mm: add KSM_MERGE_TIME tests Add KSM_MERGE_TIME and KSM_MERGE_TIME_HUGE_PAGES tests with size of 100. 
./run_vmtests.sh -t ksm ----------------------------- running ./ksm_tests -H -s 100 ----------------------------- Number of normal pages: 0 Number of huge pages: 50 Total size: 100 MiB Total time: 0.399844662 s Average speed: 250.097 MiB/s [PASS] ----------------------------- running ./ksm_tests -P -s 100 ----------------------------- Total size: 100 MiB Total time: 0.451931496 s Average speed: 221.272 MiB/s [PASS] Link: https://lkml.kernel.org/r/20230728164102.4655-1-ayush.jain3@amd.com Signed-off-by: Ayush Jain Reviewed-by: David Hildenbrand Cc: Stefan Roesch Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/run_vmtests.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index 6de90c0adf92ce..3e2bc818d566f6 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -305,6 +305,10 @@ CATEGORY="madv_populate" run_test ./madv_populate CATEGORY="memfd_secret" run_test ./memfd_secret +# KSM KSM_MERGE_TIME_HUGE_PAGES test with size of 100 +CATEGORY="ksm" run_test ./ksm_tests -H -s 100 +# KSM KSM_MERGE_TIME test with size of 100 +CATEGORY="ksm" run_test ./ksm_tests -P -s 100 # KSM MADV_MERGEABLE test with 10 identical pages CATEGORY="ksm" run_test ./ksm_tests -M -p 10 # KSM unmerge test From 11250fd12eb8a58205e69ea36f19fa8c084afb62 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 28 Jul 2023 13:00:40 +0800 Subject: [PATCH 265/489] mm: factor out VMA stack and heap checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: convert to vma_is_initial_heap/stack()", v3. Add vma_is_initial_stack() and vma_is_initial_heap() helpers and use them to simplify code. This patch (of 4): Factor out VMA stack and heap checks and name them vma_is_initial_stack() and vma_is_initial_heap() for general use. 
Link: https://lkml.kernel.org/r/20230728050043.59880-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20230728050043.59880-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: David Hildenbrand Acked-by: Peter Zijlstra (Intel) Cc: Christian Göttsche Cc: Alex Deucher Cc: Arnaldo Carvalho de Melo Cc: Christian Göttsche Cc: Christian König Cc: Daniel Vetter Cc: David Airlie Cc: Eric Paris Cc: Felix Kuehling Cc: "Pan, Xinhui" Cc: Paul Moore Cc: Stephen Smalley Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 24 ++++-------------------- fs/proc/task_nommu.c | 15 +-------------- include/linux/mm.h | 25 +++++++++++++++++++++++++ 3 files changed, 30 insertions(+), 34 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 507cd4e59d074d..bf25178ae66a93 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -236,21 +236,6 @@ static int do_maps_open(struct inode *inode, struct file *file, sizeof(struct proc_maps_private)); } -/* - * Indicate if the VMA is a stack for the given task; for - * /proc/PID/maps that is the stack of the main task. - */ -static int is_stack(struct vm_area_struct *vma) -{ - /* - * We make no effort to guess what a given thread considers to be - * its "stack". It's not even well-defined for programs written - * languages like Go. 
- */ - return vma->vm_start <= vma->vm_mm->start_stack && - vma->vm_end >= vma->vm_mm->start_stack; -} - static void show_vma_header_prefix(struct seq_file *m, unsigned long start, unsigned long end, vm_flags_t flags, unsigned long long pgoff, @@ -327,13 +312,12 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) goto done; } - if (vma->vm_start <= mm->brk && - vma->vm_end >= mm->start_brk) { + if (vma_is_initial_heap(vma)) { name = "[heap]"; goto done; } - if (is_stack(vma)) { + if (vma_is_initial_stack(vma)) { name = "[stack]"; goto done; } @@ -1971,9 +1955,9 @@ static int show_numa_map(struct seq_file *m, void *v) if (file) { seq_puts(m, " file="); seq_file_path(m, file, "\n\t= "); - } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { + } else if (vma_is_initial_heap(vma)) { seq_puts(m, " heap"); - } else if (is_stack(vma)) { + } else if (vma_is_initial_stack(vma)) { seq_puts(m, " stack"); } diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 2c8b6226598144..a8ac0dd8041ebc 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -121,19 +121,6 @@ unsigned long task_statm(struct mm_struct *mm, return size; } -static int is_stack(struct vm_area_struct *vma) -{ - struct mm_struct *mm = vma->vm_mm; - - /* - * We make no effort to guess what a given thread considers to be - * its "stack". It's not even well-defined for programs written - * languages like Go. 
- */ - return vma->vm_start <= mm->start_stack && - vma->vm_end >= mm->start_stack; -} - /* * display a single VMA to a sequenced file */ @@ -171,7 +158,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) if (file) { seq_pad(m, ' '); seq_file_path(m, file, ""); - } else if (mm && is_stack(vma)) { + } else if (mm && vma_is_initial_stack(vma)) { seq_pad(m, ' '); seq_puts(m, "[stack]"); } diff --git a/include/linux/mm.h b/include/linux/mm.h index b2520dd555f911..f64d1de3af09d7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -833,6 +833,31 @@ static inline bool vma_is_anonymous(struct vm_area_struct *vma) return !vma->vm_ops; } +/* + * Indicate if the VMA is a heap for the given task; for + * /proc/PID/maps that is the heap of the main task. + */ +static inline bool vma_is_initial_heap(const struct vm_area_struct *vma) +{ + return vma->vm_start <= vma->vm_mm->brk && + vma->vm_end >= vma->vm_mm->start_brk; +} + +/* + * Indicate if the VMA is a stack for the given task; for + * /proc/PID/maps that is the stack of the main task. + */ +static inline bool vma_is_initial_stack(const struct vm_area_struct *vma) +{ + /* + * We make no effort to guess what a given thread considers to be + * its "stack". It's not even well-defined for programs written + * languages like Go. + */ + return vma->vm_start <= vma->vm_mm->start_stack && + vma->vm_end >= vma->vm_mm->start_stack; +} + static inline bool vma_is_temporary_stack(struct vm_area_struct *vma) { int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); From f7992bfaf3e35059f26a7be13f42eefc1050ced9 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 28 Jul 2023 13:00:41 +0800 Subject: [PATCH 266/489] drm/amdkfd: use vma_is_initial_stack() and vma_is_initial_heap() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the helpers to simplify code. 
Link: https://lkml.kernel.org/r/20230728050043.59880-3-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: David Hildenbrand Reviewed-by: Felix Kuehling Acked-by: Peter Zijlstra (Intel) Cc: Alex Deucher Cc: "Christian König" Cc: "Pan, Xinhui" Cc: David Airlie Cc: Daniel Vetter Cc: Arnaldo Carvalho de Melo Cc: Christian Göttsche Cc: Eric Paris Cc: Paul Moore Cc: Stephen Smalley Signed-off-by: Andrew Morton --- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 5ff1a5a89d9681..0b7bfbd0cb66d2 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -2621,10 +2621,7 @@ svm_range_get_range_boundaries(struct kfd_process *p, int64_t addr, return -EFAULT; } - *is_heap_stack = (vma->vm_start <= vma->vm_mm->brk && - vma->vm_end >= vma->vm_mm->start_brk) || - (vma->vm_start <= vma->vm_mm->start_stack && - vma->vm_end >= vma->vm_mm->start_stack); + *is_heap_stack = vma_is_initial_heap(vma) || vma_is_initial_stack(vma); start_limit = max(vma->vm_start >> PAGE_SHIFT, (unsigned long)ALIGN_DOWN(addr, 2UL << 8)); From 68df1baf158fddc07b6f0333e4c81fe1ccecd6ff Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 28 Jul 2023 13:00:42 +0800 Subject: [PATCH 267/489] selinux: use vma_is_initial_stack() and vma_is_initial_heap() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the helpers to simplify code. 
Link: https://lkml.kernel.org/r/20230728050043.59880-4-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Paul Moore Reviewed-by: David Hildenbrand Acked-by: Peter Zijlstra (Intel) Cc: Stephen Smalley Cc: Eric Paris Cc: Alex Deucher Cc: Arnaldo Carvalho de Melo Cc: Christian Göttsche Cc: "Christian König" Cc: Daniel Vetter Cc: David Airlie Cc: Felix Kuehling Cc: "Pan, Xinhui" Signed-off-by: Andrew Morton --- security/selinux/hooks.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index d06e350fedee5f..ee8575540a8efc 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3762,13 +3762,10 @@ static int selinux_file_mprotect(struct vm_area_struct *vma, if (default_noexec && (prot & PROT_EXEC) && !(vma->vm_flags & VM_EXEC)) { int rc = 0; - if (vma->vm_start >= vma->vm_mm->start_brk && - vma->vm_end <= vma->vm_mm->brk) { + if (vma_is_initial_heap(vma)) { rc = avc_has_perm(sid, sid, SECCLASS_PROCESS, PROCESS__EXECHEAP, NULL); - } else if (!vma->vm_file && - ((vma->vm_start <= vma->vm_mm->start_stack && - vma->vm_end >= vma->vm_mm->start_stack) || + } else if (!vma->vm_file && (vma_is_initial_stack(vma) || vma_is_stack_for_current(vma))) { rc = avc_has_perm(sid, sid, SECCLASS_PROCESS, PROCESS__EXECSTACK, NULL); From 549f5c771e1be6985c29d5f1e0bbba11a8897ec8 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 28 Jul 2023 13:00:43 +0800 Subject: [PATCH 268/489] perf/core: use vma_is_initial_stack() and vma_is_initial_heap() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the helpers to simplify code, also kill unneeded goto cpy_name. 
Link: https://lkml.kernel.org/r/20230728050043.59880-5-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: David Hildenbrand Acked-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Alex Deucher Cc: Christian Göttsche Cc: "Christian König" Cc: Daniel Vetter Cc: David Airlie Cc: Eric Paris Cc: Felix Kuehling Cc: "Pan, Xinhui" Cc: Paul Moore Cc: Stephen Smalley Signed-off-by: Andrew Morton --- kernel/events/core.c | 33 +++++++++++---------------------- 1 file changed, 11 insertions(+), 22 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 78ae7b6f90fdbf..e78751fee7fe27 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8631,7 +8631,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) unsigned int size; char tmp[16]; char *buf = NULL; - char *name; + char *name = NULL; if (vma->vm_flags & VM_READ) prot |= PROT_READ; @@ -8678,29 +8678,18 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) goto got_name; } else { - if (vma->vm_ops && vma->vm_ops->name) { + if (vma->vm_ops && vma->vm_ops->name) name = (char *) vma->vm_ops->name(vma); - if (name) - goto cpy_name; + if (!name) + name = (char *)arch_vma_name(vma); + if (!name) { + if (vma_is_initial_heap(vma)) + name = "[heap]"; + else if (vma_is_initial_stack(vma)) + name = "[stack]"; + else + name = "//anon"; } - - name = (char *)arch_vma_name(vma); - if (name) - goto cpy_name; - - if (vma->vm_start <= vma->vm_mm->start_brk && - vma->vm_end >= vma->vm_mm->brk) { - name = "[heap]"; - goto cpy_name; - } - if (vma->vm_start <= vma->vm_mm->start_stack && - vma->vm_end >= vma->vm_mm->start_stack) { - name = "[stack]"; - goto cpy_name; - } - - name = "//anon"; - goto cpy_name; } cpy_name: From ebddd111fcd13fefd7350f77201dfc5605672909 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 1 Aug 2023 20:37:23 +0800 Subject: [PATCH 269/489] mm/page_alloc: avoid unneeded alike_pages calculation When free_pages is 0, alike_pages 
is not used. So alike_pages calculation can be avoided by checking free_pages early to save cpu cycles. Also fix typo 'comparable'. It should be 'compatible' here. Link: https://lkml.kernel.org/r/20230801123723.2225543-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Signed-off-by: Andrew Morton --- mm/page_alloc.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1eb3864e1dbc70..8b17dcbb925da9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1833,6 +1833,10 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, free_pages = move_freepages_block(zone, page, start_type, &movable_pages); + /* moving whole block can fail due to zone boundary conditions */ + if (!free_pages) + goto single_page; + /* * Determine how many pages are compatible with our allocation. * For movable allocation, it's the number of movable pages which @@ -1854,14 +1858,9 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, else alike_pages = 0; } - - /* moving whole block can fail due to zone boundary conditions */ - if (!free_pages) - goto single_page; - /* * If a sufficient number of pages in the block are either free or of - * comparable migratability as our allocation, claim the whole block. + * compatible migratability as our allocation, claim the whole block. */ if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || page_group_by_mobility_disabled) From 6a718bd2ed4a589e7e50e7d028d24e087139dd03 Mon Sep 17 00:00:00 2001 From: Yicong Yang Date: Tue, 1 Aug 2023 20:42:03 +0800 Subject: [PATCH 270/489] arm64: tlbflush: add some comments for TLB batched flushing Add comments for arch_flush_tlb_batched_pending() and arch_tlbbatch_flush() to illustrate why only a DSB is needed. 
Link: https://lkml.kernel.org/r/20230801124203.62164-1-yangyicong@huawei.com Cc: Catalin Marinas Signed-off-by: Yicong Yang Reviewed-by: Alistair Popple Reviewed-by: Catalin Marinas Cc: Barry Song <21cnbao@gmail.com> Signed-off-by: Andrew Morton --- arch/arm64/include/asm/tlbflush.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 84a05a0bd2b67a..55b50e1d4a84f2 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -304,11 +304,26 @@ static inline void arch_tlbbatch_add_pending(struct arch_tlbflush_unmap_batch *b __flush_tlb_page_nosync(mm, uaddr); } +/* + * If mprotect/munmap/etc occurs during TLB batched flushing, we need to + * synchronise all the TLBI issued with a DSB to avoid the race mentioned in + * flush_tlb_batched_pending(). + */ static inline void arch_flush_tlb_batched_pending(struct mm_struct *mm) { dsb(ish); } +/* + * To support TLB batched flush for multiple pages unmapping, we only send + * the TLBI for each page in arch_tlbbatch_add_pending() and wait for the + * completion at the end in arch_tlbbatch_flush(). Since we've already issued + * TLBI for each page so only a DSB is needed to synchronise its effect on the + * other CPUs. + * + * This will save the time waiting on DSB comparing issuing a TLBI;DSB sequence + * for each page. + */ static inline void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) { dsb(ish); From ca39c5e7d10f09983293f064caa447690cb3ec92 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 1 Aug 2023 20:43:59 +0800 Subject: [PATCH 271/489] mm/memcg: update obsolete comment above parent_mem_cgroup() Since commit bef8620cd8e0 ("mm: memcg: deprecate the non-hierarchical mode"), use_hierarchy is already deprecated. And it's further removed via commit 9d9d341df4d5 ("cgroup: remove obsoleted broken_hierarchy and warned_broken_hierarchy"). Update corresponding comment. 
Link: https://lkml.kernel.org/r/20230801124359.2266860-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Michal Hocko Cc: Roman Gushchin Cc: Johannes Weiner Cc: Shakeel Butt Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 419e001a02e401..163004ae334966 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -860,8 +860,7 @@ static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) * parent_mem_cgroup - find the accounting parent of a memcg * @memcg: memcg whose parent to find * - * Returns the parent memcg, or NULL if this is the root or the memory - * controller is in legacy no-hierarchy mode. + * Returns the parent memcg, or NULL if this is the root. */ static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) { From 2a158e956b98e0c5f37b5ce9953048654348ad6b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Sat, 29 Jul 2023 20:37:33 +0000 Subject: [PATCH 272/489] mm/damon/core-test: add a test for damos_new_filter() damos_new_filter() had a bug of not initializing the ->list field of the returned damos_filter struct, which resulted in access to uninitialized memory. Add a unit test for the function. 
Link: https://lkml.kernel.org/r/20230729203733.38949-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core-test.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index bb07721909e1d5..4bddbfe243c3a0 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -341,6 +341,18 @@ static void damon_test_set_attrs(struct kunit *test) KUNIT_EXPECT_EQ(test, damon_set_attrs(c, &invalid_attrs), -EINVAL); } +static void damos_test_new_filter(struct kunit *test) +{ + struct damos_filter *filter; + + filter = damos_new_filter(DAMOS_FILTER_TYPE_ANON, true); + KUNIT_EXPECT_EQ(test, filter->type, DAMOS_FILTER_TYPE_ANON); + KUNIT_EXPECT_EQ(test, filter->matching, true); + KUNIT_EXPECT_PTR_EQ(test, filter->list.prev, &filter->list); + KUNIT_EXPECT_PTR_EQ(test, filter->list.next, &filter->list); + damos_destroy_filter(filter); +} + static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_target), KUNIT_CASE(damon_test_regions), @@ -353,6 +365,7 @@ static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_set_regions), KUNIT_CASE(damon_test_update_monitoring_result), KUNIT_CASE(damon_test_set_attrs), + KUNIT_CASE(damos_test_new_filter), {}, }; From 0388536ac29104a478c79b3869541524caec28eb Mon Sep 17 00:00:00 2001 From: Efly Young Date: Fri, 21 Jul 2023 09:41:16 +0800 Subject: [PATCH 273/489] mm:vmscan: fix inaccurate reclaim during proactive reclaim Before commit f53af4285d77 ("mm: vmscan: fix extreme overreclaim and swap floods"), proactive reclaim would sometimes overreclaim extremely. But proactive reclaim is still inaccurate and overreclaims to some extent. Problematic case is easy to construct. Allocate lots of anonymous memory (e.g., 20G) in a memcg, then trigger swapping by writing memory.reclaim and there is a certain probability of overreclaim. For example, request 1G by writing memory.reclaim will eventually reclaim 1.7G or other values more than 1G.
The reason is that reclaimer may have already reclaimed part of requested memory in one loop, but before adjust sc->nr_to_reclaim in outer loop, call shrink_lruvec() again will still follow the current sc->nr_to_reclaim to work. It will eventually lead to overreclaim. In theory, the amount of reclaimed would be in [request, 2 * request). Reclaimer usually tends to reclaim more than request. But either direct or kswapd reclaim have much smaller nr_to_reclaim targets, so it is less noticeable and not have much impact. Proactive reclaim can usually come in with a larger value, so the error is difficult to ignore. Considering proactive reclaim is usually low frequency, handle the batching into smaller chunks is a better approach. Link: https://lkml.kernel.org/r/20230721014116.3388-1-yangyifei03@kuaishou.com Signed-off-by: Efly Young Suggested-by: Johannes Weiner Acked-by: Johannes Weiner Signed-off-by: Andrew Morton --- mm/memcontrol.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 062d925336cbdb..56abc4f426f4dc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6686,8 +6686,8 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, lru_add_drain_all(); reclaimed = try_to_free_mem_cgroup_pages(memcg, - nr_to_reclaim - nr_reclaimed, - GFP_KERNEL, reclaim_options); + min(nr_to_reclaim - nr_reclaimed, SWAP_CLUSTER_MAX), + GFP_KERNEL, reclaim_options); if (!reclaimed && !nr_retries--) return -EAGAIN; From 669281ee7ef731fb5204df9d948669bf32a5e68d Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Tue, 1 Aug 2023 19:56:02 -0700 Subject: [PATCH 274/489] Multi-gen LRU: fix per-zone reclaim MGLRU has a LRU list for each zone for each type (anon/file) in each generation: long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; The min_seq (oldest generation) can progress independently for each type but the max_seq (youngest generation) is shared for both anon and file. 
This is to maintain a common frame of reference. In order for eviction to advance the min_seq of a type, all the per-zone lists in the oldest generation of that type must be empty. The eviction logic only considers pages from eligible zones for eviction or promotion. scan_folios() { ... for (zone = sc->reclaim_idx; zone >= 0; zone--) { ... sort_folio(); // Promote ... isolate_folio(); // Evict } ... } Consider the system has the movable zone configured and default 4 generations. The current state of the system is as shown below (only illustrating one type for simplicity): Type: ANON Zone DMA32 Normal Movable Device Gen 0 0 0 4GB 0 Gen 1 0 1GB 1MB 0 Gen 2 1MB 4GB 1MB 0 Gen 3 1MB 1MB 1MB 0 Now consider there is a GFP_KERNEL allocation request (eligible zone index <= Normal), evict_folios() will return without doing any work since there are no pages to scan in the eligible zones of the oldest generation. Reclaim won't make progress until triggered from a ZONE_MOVABLE allocation request; which may not happen soon if there is a lot of free memory in the movable zone. This can lead to OOM kills, although there is 1GB pages in the Normal zone of Gen 1 that we have not yet tried to reclaim. This issue is not seen in the conventional active/inactive LRU since there are no per-zone lists. If there are no (not enough) folios to scan in the eligible zones, move folios from ineligible zone (zone_index > reclaim_index) to the next generation. This allows for the progression of min_seq and reclaiming from the next generation (Gen 1). Qualcomm, Mediatek and raspberrypi [1] discovered this issue independently. 
[1] https://github.com/raspberrypi/linux/issues/5395 Link: https://lkml.kernel.org/r/20230802025606.346758-1-kaleshsingh@google.com Fixes: ac35a4902374 ("mm: multi-gen LRU: minimal implementation") Signed-off-by: Kalesh Singh Reported-by: Charan Teja Kalla Reported-by: Lecopzer Chen Tested-by: AngeloGioacchino Del Regno [mediatek] Tested-by: Charan Teja Kalla Cc: Yu Zhao Cc: Barry Song Cc: Brian Geffon Cc: Jan Alexander Steffens (heftig) Cc: Matthias Brugger Cc: Oleksandr Natalenko Cc: Qi Zheng Cc: Steven Barrett Cc: Suleiman Souhlal Cc: Suren Baghdasaryan Cc: Aneesh Kumar K V Cc: Signed-off-by: Andrew Morton --- mm/vmscan.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 4039620d30fe4a..489a4fc7d9b113 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4889,7 +4889,8 @@ static int lru_gen_memcg_seg(struct lruvec *lruvec) * the eviction ******************************************************************************/ -static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) +static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc, + int tier_idx) { bool success; int gen = folio_lru_gen(folio); @@ -4939,6 +4940,13 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) return true; } + /* ineligible */ + if (zone > sc->reclaim_idx) { + gen = folio_inc_gen(lruvec, folio, false); + list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); + return true; + } + /* waiting for writeback */ if (folio_test_locked(folio) || folio_test_writeback(folio) || (type == LRU_GEN_FILE && folio_test_dirty(folio))) { @@ -4987,7 +4995,8 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, int type, int tier, struct list_head *list) { - int gen, zone; + int i; + int gen; enum vm_event_item item; int sorted = 0; int scanned = 0; @@ 
-5003,9 +5012,10 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, gen = lru_gen_from_seq(lrugen->min_seq[type]); - for (zone = sc->reclaim_idx; zone >= 0; zone--) { + for (i = MAX_NR_ZONES; i > 0; i--) { LIST_HEAD(moved); int skipped = 0; + int zone = (sc->reclaim_idx + i) % MAX_NR_ZONES; struct list_head *head = &lrugen->folios[gen][type][zone]; while (!list_empty(head)) { @@ -5019,7 +5029,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, scanned += delta; - if (sort_folio(lruvec, folio, tier)) + if (sort_folio(lruvec, folio, sc, tier)) sorted += delta; else if (isolate_folio(lruvec, folio, sc)) { list_add(&folio->lru, list); From bb5e7f234eacf34b65be67ebb3613e3b8cf11b87 Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Tue, 1 Aug 2023 19:56:03 -0700 Subject: [PATCH 275/489] Multi-gen LRU: avoid race in inc_min_seq() inc_max_seq() will try to inc_min_seq() if nr_gens == MAX_NR_GENS. This is because the generations are reused (the last oldest now empty generation will become the next youngest generation). inc_min_seq() is retried until successful, dropping the lru_lock and yielding the CPU on each failure, and retaking the lock before trying again: while (!inc_min_seq(lruvec, type, can_swap)) { spin_unlock_irq(&lruvec->lru_lock); cond_resched(); spin_lock_irq(&lruvec->lru_lock); } However, the initial condition that required incrementing the min_seq (nr_gens == MAX_NR_GENS) is not retested. This can change by another call to inc_max_seq() from run_aging() with force_scan=true from the debugfs interface. Since the eviction stalls when the nr_gens == MIN_NR_GENS, avoid unnecessarily incrementing the min_seq by rechecking the number of generations before each attempt. This issue was uncovered in previous discussion on the list by Yu Zhao and Aneesh Kumar [1]. 
[1] https://lore.kernel.org/linux-mm/CAOUHufbO7CaVm=xjEb1avDhHVvnC8pJmGyKcFf2iY_dpf+zR3w@mail.gmail.com/ Link: https://lkml.kernel.org/r/20230802025606.346758-2-kaleshsingh@google.com Fixes: d6c3af7d8a2b ("mm: multi-gen LRU: debugfs interface") Signed-off-by: Kalesh Singh Tested-by: AngeloGioacchino Del Regno [mediatek] Tested-by: Charan Teja Kalla Cc: Yu Zhao Cc: Aneesh Kumar K V Cc: Barry Song Cc: Brian Geffon Cc: Jan Alexander Steffens (heftig) Cc: Lecopzer Chen Cc: Matthias Brugger Cc: Oleksandr Natalenko Cc: Qi Zheng Cc: Steven Barrett Cc: Suleiman Souhlal Cc: Suren Baghdasaryan Cc: Signed-off-by: Andrew Morton --- mm/vmscan.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 489a4fc7d9b113..6eecd291756cde 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4439,7 +4439,7 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan) int prev, next; int type, zone; struct lru_gen_folio *lrugen = &lruvec->lrugen; - +restart: spin_lock_irq(&lruvec->lru_lock); VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); @@ -4450,11 +4450,12 @@ static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan) VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap)); - while (!inc_min_seq(lruvec, type, can_swap)) { - spin_unlock_irq(&lruvec->lru_lock); - cond_resched(); - spin_lock_irq(&lruvec->lru_lock); - } + if (inc_min_seq(lruvec, type, can_swap)) + continue; + + spin_unlock_irq(&lruvec->lru_lock); + cond_resched(); + goto restart; } /* From a3235ea2a88b7874204c39ebb20feb712f4dba9d Mon Sep 17 00:00:00 2001 From: Kalesh Singh Date: Tue, 1 Aug 2023 19:56:04 -0700 Subject: [PATCH 276/489] Multi-gen LRU: fix can_swap in lru_gen_look_around() walk->can_swap might be invalid since it's not guaranteed to be initialized for the particular lruvec. Instead deduce it from the folio type (anon/file). 
Link: https://lkml.kernel.org/r/20230802025606.346758-3-kaleshsingh@google.com Fixes: 018ee47f1489 ("mm: multi-gen LRU: exploit locality in rmap") Signed-off-by: Kalesh Singh Tested-by: AngeloGioacchino Del Regno [mediatek] Tested-by: Charan Teja Kalla Cc: Yu Zhao Cc: Aneesh Kumar K V Cc: Barry Song Cc: Brian Geffon Cc: Jan Alexander Steffens (heftig) Cc: Lecopzer Chen Cc: Matthias Brugger Cc: Oleksandr Natalenko Cc: Qi Zheng Cc: Steven Barrett Cc: Suleiman Souhlal Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/vmscan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 6eecd291756cde..b4329f93a682e3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4656,6 +4656,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) pte_t *pte = pvmw->pte; unsigned long addr = pvmw->address; struct folio *folio = pfn_folio(pvmw->pfn); + bool can_swap = !folio_is_file_lru(folio); struct mem_cgroup *memcg = folio_memcg(folio); struct pglist_data *pgdat = folio_pgdat(folio); struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); @@ -4704,7 +4705,7 @@ void lru_gen_look_around(struct page_vma_mapped_walk *pvmw) if (!pte_young(ptent)) continue; - folio = get_pfn_folio(pfn, memcg, pgdat, !walk || walk->can_swap); + folio = get_pfn_folio(pfn, memcg, pgdat, can_swap); if (!folio) continue; From b69f92a741405336fb17a8a3d67fc144192fe8e2 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 2 Aug 2023 21:32:17 +0000 Subject: [PATCH 277/489] mm/damon/sysfs-schemes: implement DAMOS tried total bytes file Patch series "mm/damon/sysfs-schemes: implement DAMOS tried total bytes file". The tried_regions directory of DAMON sysfs interface is useful for retrieving monitoring results snapshot or DAMOS debugging. 
However, for common use case that need to monitor only the total size of the scheme tried regions (e.g., monitoring working set size), the kernel overhead for directory construction and user overhead for reading the content could be high if the number of monitoring region is not small. This patchset implements DAMON sysfs files for efficient support of the use case. The first patch implements the sysfs file to reduce the user space overhead, and the second patch implements a command for reducing the kernel space overhead. The third patch adds a selftest for the new file, and following two patches update documents. [1] https://lore.kernel.org/damon/20230728201817.70602-1-sj@kernel.org/ This patch (of 5): The tried_regions directory can be used for retrieving the monitoring results snapshot for regions of specific access pattern, by setting the scheme's action as 'stat' and the access pattern as required. While the interface provides every detail of the monitoring results, some use cases including working set size monitoring requires only the total size of the regions. For such cases, users should read all the information and calculate the total size of the regions. However, it could incur high overhead if the number of regions is high. Add a file for retrieving only the information, namely 'total_bytes' file. It allows users to get the total size by reading only the file. 
Link: https://lkml.kernel.org/r/20230802213222.109841-1-sj@kernel.org Link: https://lkml.kernel.org/r/20230802213222.109841-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 50cf89dcd898b3..6d3462eb31f2d3 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -117,6 +117,7 @@ struct damon_sysfs_scheme_regions { struct kobject kobj; struct list_head regions_list; int nr_regions; + unsigned long total_bytes; }; static struct damon_sysfs_scheme_regions * @@ -128,9 +129,19 @@ damon_sysfs_scheme_regions_alloc(void) regions->kobj = (struct kobject){}; INIT_LIST_HEAD(&regions->regions_list); regions->nr_regions = 0; + regions->total_bytes = 0; return regions; } +static ssize_t total_bytes_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_regions *regions = container_of(kobj, + struct damon_sysfs_scheme_regions, kobj); + + return sysfs_emit(buf, "%lu\n", regions->total_bytes); +} + static void damon_sysfs_scheme_regions_rm_dirs( struct damon_sysfs_scheme_regions *regions) { @@ -148,7 +159,11 @@ static void damon_sysfs_scheme_regions_release(struct kobject *kobj) kfree(container_of(kobj, struct damon_sysfs_scheme_regions, kobj)); } +static struct kobj_attribute damon_sysfs_scheme_regions_total_bytes_attr = + __ATTR_RO_MODE(total_bytes, 0400); + static struct attribute *damon_sysfs_scheme_regions_attrs[] = { + &damon_sysfs_scheme_regions_total_bytes_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_scheme_regions); @@ -1648,6 +1663,7 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, return 0; sysfs_regions = sysfs_schemes->schemes_arr[schemes_idx]->tried_regions; + sysfs_regions->total_bytes += r->ar.end - r->ar.start; region = damon_sysfs_scheme_region_alloc(r); 
list_add_tail(&region->list, &sysfs_regions->regions_list); sysfs_regions->nr_regions++; @@ -1678,6 +1694,7 @@ int damon_sysfs_schemes_clear_regions( sysfs_scheme = sysfs_schemes->schemes_arr[schemes_idx++]; damon_sysfs_scheme_regions_rm_dirs( sysfs_scheme->tried_regions); + sysfs_scheme->tried_regions->total_bytes = 0; } return 0; } From 6ad243b83b5094026fdb3171711ddb25246b3d8a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 2 Aug 2023 21:32:18 +0000 Subject: [PATCH 278/489] mm/damon/sysfs: implement a command for updating only schemes tried total bytes Using tried_regions/total_bytes file, users can efficiently retrieve the total size of memory regions having specific access pattern. However, DAMON sysfs interface in kernel still populates all the information on the tried_regions subdirectories. That means the kernel part overhead for the construction of tried regions directories still exists. To remove the overhead, implement yet another command input for 'state' DAMON sysfs file. Writing the input to the file makes the DAMON sysfs interface update only the total_bytes file. 
Link: https://lkml.kernel.org/r/20230802213222.109841-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/sysfs-common.h | 2 +- mm/damon/sysfs-schemes.c | 7 ++++++- mm/damon/sysfs.c | 26 ++++++++++++++++++++------ 3 files changed, 27 insertions(+), 8 deletions(-) diff --git a/mm/damon/sysfs-common.h b/mm/damon/sysfs-common.h index db677eba78fd33..fd482a0639b474 100644 --- a/mm/damon/sysfs-common.h +++ b/mm/damon/sysfs-common.h @@ -47,7 +47,7 @@ void damon_sysfs_schemes_update_stats( int damon_sysfs_schemes_update_regions_start( struct damon_sysfs_schemes *sysfs_schemes, - struct damon_ctx *ctx); + struct damon_ctx *ctx, bool total_bytes_only); int damon_sysfs_schemes_update_regions_stop(struct damon_ctx *ctx); diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 6d3462eb31f2d3..9a015079f3a46f 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -1635,6 +1635,7 @@ void damon_sysfs_schemes_update_stats( */ static struct damon_sysfs_schemes *damon_sysfs_schemes_for_damos_callback; static int damon_sysfs_schemes_region_idx; +static bool damos_regions_upd_total_bytes_only; /* * DAMON callback that called before damos apply. 
While this callback is @@ -1664,6 +1665,9 @@ static int damon_sysfs_before_damos_apply(struct damon_ctx *ctx, sysfs_regions = sysfs_schemes->schemes_arr[schemes_idx]->tried_regions; sysfs_regions->total_bytes += r->ar.end - r->ar.start; + if (damos_regions_upd_total_bytes_only) + return 0; + region = damon_sysfs_scheme_region_alloc(r); list_add_tail(&region->list, &sysfs_regions->regions_list); sysfs_regions->nr_regions++; @@ -1702,10 +1706,11 @@ int damon_sysfs_schemes_clear_regions( /* Called from damon_sysfs_cmd_request_callback under damon_sysfs_lock */ int damon_sysfs_schemes_update_regions_start( struct damon_sysfs_schemes *sysfs_schemes, - struct damon_ctx *ctx) + struct damon_ctx *ctx, bool total_bytes_only) { damon_sysfs_schemes_clear_regions(sysfs_schemes, ctx); damon_sysfs_schemes_for_damos_callback = sysfs_schemes; + damos_regions_upd_total_bytes_only = total_bytes_only; ctx->callback.before_damos_apply = damon_sysfs_before_damos_apply; return 0; } diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 33e1d5c9cb5497..b86ba7b0a9214c 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -999,6 +999,11 @@ enum damon_sysfs_cmd { * files. */ DAMON_SYSFS_CMD_UPDATE_SCHEMES_STATS, + /* + * @DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES: Update + * tried_regions/total_bytes sysfs files for each scheme. 
+ */ + DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES, /* * @DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS: Update schemes tried * regions @@ -1021,6 +1026,7 @@ static const char * const damon_sysfs_cmd_strs[] = { "off", "commit", "update_schemes_stats", + "update_schemes_tried_bytes", "update_schemes_tried_regions", "clear_schemes_tried_regions", }; @@ -1206,12 +1212,14 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; struct damon_sysfs_kdamond *kdamond; + enum damon_sysfs_cmd cmd; /* damon_sysfs_schemes_update_regions_stop() might not yet called */ kdamond = damon_sysfs_cmd_request.kdamond; - if (kdamond && damon_sysfs_cmd_request.cmd == - DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS && - ctx == kdamond->damon_ctx) { + cmd = damon_sysfs_cmd_request.cmd; + if (kdamond && ctx == kdamond->damon_ctx && + (cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS || + cmd == DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES)) { damon_sysfs_schemes_update_regions_stop(ctx); mutex_unlock(&damon_sysfs_lock); } @@ -1248,14 +1256,15 @@ static int damon_sysfs_upd_schemes_stats(struct damon_sysfs_kdamond *kdamond) } static int damon_sysfs_upd_schemes_regions_start( - struct damon_sysfs_kdamond *kdamond) + struct damon_sysfs_kdamond *kdamond, bool total_bytes_only) { struct damon_ctx *ctx = kdamond->damon_ctx; if (!ctx) return -EINVAL; return damon_sysfs_schemes_update_regions_start( - kdamond->contexts->contexts_arr[0]->schemes, ctx); + kdamond->contexts->contexts_arr[0]->schemes, ctx, + total_bytes_only); } static int damon_sysfs_upd_schemes_regions_stop( @@ -1332,6 +1341,7 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) { struct damon_sysfs_kdamond *kdamond; static bool damon_sysfs_schemes_regions_updating; + bool total_bytes_only = false; int err = 0; /* avoid deadlock due to concurrent state_store('off') */ @@ -1348,9 +1358,13 @@ static int damon_sysfs_cmd_request_callback(struct damon_ctx *c) case 
DAMON_SYSFS_CMD_COMMIT: err = damon_sysfs_commit_input(kdamond); break; + case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_BYTES: + total_bytes_only = true; + fallthrough; case DAMON_SYSFS_CMD_UPDATE_SCHEMES_TRIED_REGIONS: if (!damon_sysfs_schemes_regions_updating) { - err = damon_sysfs_upd_schemes_regions_start(kdamond); + err = damon_sysfs_upd_schemes_regions_start(kdamond, + total_bytes_only); if (!err) { damon_sysfs_schemes_regions_updating = true; goto keep_lock_out; From b823cb08e66212f5d90b5eaad59b2371b930e33f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 2 Aug 2023 21:32:19 +0000 Subject: [PATCH 279/489] selftests/damon/sysfs: test tried_regions/total_bytes file Update sysfs.sh DAMON selftest for checking existence of 'total_bytes' file under the 'tried_regions' directory of DAMON sysfs interface. Link: https://lkml.kernel.org/r/20230802213222.109841-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/sysfs.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/damon/sysfs.sh b/tools/testing/selftests/damon/sysfs.sh index bcd4734ca09432..967e2726754998 100644 --- a/tools/testing/selftests/damon/sysfs.sh +++ b/tools/testing/selftests/damon/sysfs.sh @@ -84,6 +84,7 @@ test_tried_regions() { tried_regions_dir=$1 ensure_dir "$tried_regions_dir" "exist" + ensure_file "$tried_regions_dir/total_bytes" "exist" "400" } test_stats() From e91b5ccf1f1b92cb699c2f8b392cf10598183239 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 2 Aug 2023 21:32:20 +0000 Subject: [PATCH 280/489] Docs/ABI/damon: update for tried_regions/total_bytes Update the DAMON ABI document for newly added schemes/.../tried_regions/total_bytes file and the update_schemes_tried_bytes command. 
Link: https://lkml.kernel.org/r/20230802213222.109841-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-kernel-mm-damon | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon index 2744f21b5a6b36..3d9aaa1cafa9bd 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-damon +++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon @@ -29,8 +29,10 @@ Description: Writing 'on' or 'off' to this file makes the kdamond starts or file updates contents of schemes stats files of the kdamond. Writing 'update_schemes_tried_regions' to the file updates contents of 'tried_regions' directory of every scheme directory - of this kdamond. Writing 'clear_schemes_tried_regions' to the - file removes contents of the 'tried_regions' directory. + of this kdamond. Writing 'update_schemes_tried_bytes' to the + file updates only '.../tried_regions/total_bytes' files of this + kdamond. Writing 'clear_schemes_tried_regions' to the file + removes contents of the 'tried_regions' directory. What: /sys/kernel/mm/damon/admin/kdamonds//pid Date: Mar 2022 @@ -317,6 +319,13 @@ Contact: SeongJae Park Description: Reading this file returns the number of the exceed events of the scheme's quotas. +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//tried_regions/total_bytes +Date: Jul 2023 +Contact: SeongJae Park +Description: Reading this file returns the total amount of memory that + corresponding DAMON-based Operation Scheme's action has tried + to be applied. 
+ What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//tried_regions//start Date: Oct 2022 Contact: SeongJae Park From ea7f03a441b58abcc9b619570a628c53e80665c6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 2 Aug 2023 21:32:21 +0000 Subject: [PATCH 281/489] Docs/admin-guide/mm/damon/usage: update for tried_regions/total_bytes Update the DAMON usage document for newly added schemes/.../tried_regions/total_bytes file and the update_schemes_tried_bytes command. Link: https://lkml.kernel.org/r/20230802213222.109841-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 42 +++++++++++++------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 2d495fa85a0ee9..1859dd6c383477 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -87,7 +87,7 @@ comma (","). :: │ │ │ │ │ │ │ filters/nr_filters │ │ │ │ │ │ │ │ 0/type,matching,memcg_id │ │ │ │ │ │ │ stats/nr_tried,sz_tried,nr_applied,sz_applied,qt_exceeds - │ │ │ │ │ │ │ tried_regions/ + │ │ │ │ │ │ │ tried_regions/total_bytes │ │ │ │ │ │ │ │ 0/start,end,nr_accesses,age │ │ │ │ │ │ │ │ ... │ │ │ │ │ │ ... @@ -127,14 +127,18 @@ in the state. Writing ``commit`` to the ``state`` file makes kdamond reads the user inputs in the sysfs files except ``state`` file again. Writing ``update_schemes_stats`` to ``state`` file updates the contents of stats files for each DAMON-based operation scheme of the kdamond. For details of the -stats, please refer to :ref:`stats section `. Writing -``update_schemes_tried_regions`` to ``state`` file updates the DAMON-based -operation scheme action tried regions directory for each DAMON-based operation -scheme of the kdamond. 
Writing ``clear_schemes_tried_regions`` to ``state`` -file clears the DAMON-based operating scheme action tried regions directory for -each DAMON-based operation scheme of the kdamond. For details of the -DAMON-based operation scheme action tried regions directory, please refer to -:ref:`tried_regions section `. +stats, please refer to :ref:`stats section `. + +Writing ``update_schemes_tried_regions`` to ``state`` file updates the +DAMON-based operation scheme action tried regions directory for each +DAMON-based operation scheme of the kdamond. Writing +``update_schemes_tried_bytes`` to ``state`` file updates only +``.../tried_regions/total_bytes`` files. Writing +``clear_schemes_tried_regions`` to ``state`` file clears the DAMON-based +operating scheme action tried regions directory for each DAMON-based operation +scheme of the kdamond. For details of the DAMON-based operation scheme action +tried regions directory, please refer to :ref:`tried_regions section +`. If the state is ``on``, reading ``pid`` shows the pid of the kdamond thread. @@ -406,13 +410,21 @@ stats by writing a special keyword, ``update_schemes_stats`` to the relevant schemes//tried_regions/ -------------------------- +This directory initially has one file, ``total_bytes``. + When a special keyword, ``update_schemes_tried_regions``, is written to the -relevant ``kdamonds//state`` file, DAMON creates directories named integer -starting from ``0`` under this directory. Each directory contains files -exposing detailed information about each of the memory region that the -corresponding scheme's ``action`` has tried to be applied under this directory, -during next :ref:`aggregation interval `. The -information includes address range, ``nr_accesses``, and ``age`` of the region. +relevant ``kdamonds//state`` file, DAMON updates the ``total_bytes`` file so +that reading it returns the total size of the scheme tried regions, and creates +directories named integer starting from ``0`` under this directory. 
Each +directory contains files exposing detailed information about each of the memory +region that the corresponding scheme's ``action`` has tried to be applied under +this directory, during next :ref:`aggregation interval +`. The information includes address range, +``nr_accesses``, and ``age`` of the region. + +Writing ``update_schemes_tried_bytes`` to the relevant ``kdamonds//state`` +file will only update the ``total_bytes`` file, and will not create the +subdirectories. The directories will be removed when another special keyword, ``clear_schemes_tried_regions``, is written to the relevant From ab9bda001b681c293fb72ef21f083adfbcd78028 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 2 Aug 2023 21:43:00 +0000 Subject: [PATCH 282/489] mm/damon/core: introduce address range type damos filter Patch series "Extend DAMOS filters for address ranges and DAMON monitoring targets" There are use cases that need to apply DAMOS schemes to specific address ranges or DAMON monitoring targets. NUMA nodes in the physical address space, special memory objects in the virtual address space, and monitoring target specific efficient monitoring results snapshot retrieval could be examples of such use cases. This patchset extends DAMOS filters feature for such cases, by implementing two more filter types, namely address ranges and DAMON monitoring types. Patches sequence ---------------- The first seven patches are for the address ranges based DAMOS filter. The first patch implements the filter feature and expose it via DAMON kernel API. The second patch further expose the feature to users via DAMON sysfs interface. The third and fourth patches implement unit tests and selftests for the feature. Three patches (fifth to seventh) updating the documents follow. The following six patches are for the DAMON monitoring target based DAMOS filter. The eighth patch implements the feature in the core layer and expose it via DAMON's kernel API. 
The ninth patch further exposes it to users via DAMON sysfs interface. The tenth patch adds a selftest, and two patches (eleventh and twelfth) update documents. [1] https://lore.kernel.org/damon/20230728203444.70703-1-sj@kernel.org/ This patch (of 13): Users can know special characteristics of specific address ranges. NUMA nodes or special objects or buffers in virtual address space could be such examples. For such cases, DAMOS schemes could be required to be applied to only specific address ranges. Implement yet another type of DAMOS filter for the purpose. Note that the existing filter types, namely anon pages and memcg DAMOS filters, need a page level type check. Because such a check can be done efficiently in the operations set layer, those filters are handled in the operations set layer. Specifically, only the paddr operations set implementation supports these filters. Also, because statistics counting is done in the DAMON core layer, the regions that are filtered out by these filters are counted as tried but failed in the statistics. Unlike those, address range based filters can be efficiently handled in the core layer. Hence, do the handling in the core layer, and count the regions that are filtered out by those as regions the scheme has not tried. This difference should be clearly documented. 
Link: https://lkml.kernel.org/r/20230802214312.110532-1-sj@kernel.org Link: https://lkml.kernel.org/r/20230802214312.110532-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/damon.h | 22 +++++++++++++----- mm/damon/core.c | 52 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+), 6 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index d5d4d19928e0ac..476f37a883a415 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -226,16 +226,24 @@ struct damos_stat { * enum damos_filter_type - Type of memory for &struct damos_filter * @DAMOS_FILTER_TYPE_ANON: Anonymous pages. * @DAMOS_FILTER_TYPE_MEMCG: Specific memcg's pages. + * @DAMOS_FILTER_TYPE_ADDR: Address range. * @NR_DAMOS_FILTER_TYPES: Number of filter types. * - * The support of each filter type is up to running &struct damon_operations. - * &enum DAMON_OPS_PADDR is supporting all filter types, while - * &enum DAMON_OPS_VADDR and &enum DAMON_OPS_FVADDR are not supporting any - * filter types. + * The anon pages type and memcg type filters are handled by underlying + * &struct damon_operations as a part of scheme action trying, and therefore + * accounted as 'tried'. In contrast, other types are handled by core layer + * before trying of the action and therefore not accounted as 'tried'. + * + * The support of the filters that handled by &struct damon_operations depend + * on the running &struct damon_operations. + * &enum DAMON_OPS_PADDR supports both anon pages type and memcg type filters, + * while &enum DAMON_OPS_VADDR and &enum DAMON_OPS_FVADDR don't support any of + * the two types. */ enum damos_filter_type { DAMOS_FILTER_TYPE_ANON, DAMOS_FILTER_TYPE_MEMCG, + DAMOS_FILTER_TYPE_ADDR, NR_DAMOS_FILTER_TYPES, }; @@ -244,18 +252,20 @@ enum damos_filter_type { * @type: Type of the page. * @matching: If the matching page should filtered out or in. 
* @memcg_id: Memcg id of the question if @type is DAMOS_FILTER_MEMCG. + * @addr_range: Address range if @type is DAMOS_FILTER_TYPE_ADDR. * @list: List head for siblings. * * Before applying the &damos->action to a memory region, DAMOS checks if each * page of the region matches to this and avoid applying the action if so. - * Note that the check support is up to &struct damon_operations - * implementation. + * Support of each filter type depends on the running &struct damon_operations + * and the type. Refer to &enum damos_filter_type for more detai. */ struct damos_filter { enum damos_filter_type type; bool matching; union { unsigned short memcg_id; + struct damon_addr_range addr_range; }; struct list_head list; }; diff --git a/mm/damon/core.c b/mm/damon/core.c index 91cff7f2997efe..68a5fb1c039d90 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -877,6 +877,56 @@ static void damos_update_stat(struct damos *s, s->stat.sz_applied += sz_applied; } +static bool __damos_filter_out(struct damon_target *t, struct damon_region *r, + struct damos_filter *filter) +{ + bool matched = false; + unsigned long start, end; + + switch (filter->type) { + case DAMOS_FILTER_TYPE_ADDR: + start = ALIGN_DOWN(filter->addr_range.start, DAMON_MIN_REGION); + end = ALIGN_DOWN(filter->addr_range.end, DAMON_MIN_REGION); + + /* inside the range */ + if (start <= r->ar.start && r->ar.end <= end) { + matched = true; + break; + } + /* outside of the range */ + if (r->ar.end <= start || end <= r->ar.start) { + matched = false; + break; + } + /* start before the range and overlap */ + if (r->ar.start < start) { + damon_split_region_at(t, r, start - r->ar.start); + matched = false; + break; + } + /* start inside the range */ + damon_split_region_at(t, r, end - r->ar.start); + matched = true; + break; + default: + break; + } + + return matched == filter->matching; +} + +static bool damos_filter_out(struct damon_target *t, struct damon_region *r, + struct damos *s) +{ + struct damos_filter 
*filter; + + damos_for_each_filter(filter, s) { + if (__damos_filter_out(t, r, filter)) + return true; + } + return false; +} + static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, struct damon_region *r, struct damos *s) { @@ -894,6 +944,8 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, goto update_stat; damon_split_region_at(t, r, sz); } + if (damos_filter_out(t, r, s)) + return; ktime_get_coarse_ts64(&begin); if (c->callback.before_damos_apply) err = c->callback.before_damos_apply(c, t, r, s); From 2f1abcfccd86826777b2bcb2bb4e0d149a90ccf5 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 2 Aug 2023 21:43:01 +0000 Subject: [PATCH 283/489] mm/damon/sysfs-schemes: support address range type DAMOS filter Extend DAMON sysfs interface to support address range based DAMOS filters, by adding a special keyword for the filter//type file, namely 'addr', and two files under filter// for specifying the start and the end addresses of the range, namely 'addr_start' and 'addr_end'. 
Link: https://lkml.kernel.org/r/20230802214312.110532-3-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 56 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 9a015079f3a46f..03ddba3e216d13 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -282,6 +282,7 @@ struct damon_sysfs_scheme_filter { enum damos_filter_type type; bool matching; char *memcg_path; + struct damon_addr_range addr_range; }; static struct damon_sysfs_scheme_filter *damon_sysfs_scheme_filter_alloc(void) @@ -293,6 +294,7 @@ static struct damon_sysfs_scheme_filter *damon_sysfs_scheme_filter_alloc(void) static const char * const damon_sysfs_scheme_filter_type_strs[] = { "anon", "memcg", + "addr", }; static ssize_t type_show(struct kobject *kobj, @@ -373,6 +375,44 @@ static ssize_t memcg_path_store(struct kobject *kobj, return count; } +static ssize_t addr_start_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + + return sysfs_emit(buf, "%lu\n", filter->addr_range.start); +} + +static ssize_t addr_start_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + int err = kstrtoul(buf, 0, &filter->addr_range.start); + + return err ? 
err : count; +} + +static ssize_t addr_end_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + + return sysfs_emit(buf, "%lu\n", filter->addr_range.end); +} + +static ssize_t addr_end_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + int err = kstrtoul(buf, 0, &filter->addr_range.end); + + return err ? err : count; +} + static void damon_sysfs_scheme_filter_release(struct kobject *kobj) { struct damon_sysfs_scheme_filter *filter = container_of(kobj, @@ -391,10 +431,18 @@ static struct kobj_attribute damon_sysfs_scheme_filter_matching_attr = static struct kobj_attribute damon_sysfs_scheme_filter_memcg_path_attr = __ATTR_RW_MODE(memcg_path, 0600); +static struct kobj_attribute damon_sysfs_scheme_filter_addr_start_attr = + __ATTR_RW_MODE(addr_start, 0600); + +static struct kobj_attribute damon_sysfs_scheme_filter_addr_end_attr = + __ATTR_RW_MODE(addr_end, 0600); + static struct attribute *damon_sysfs_scheme_filter_attrs[] = { &damon_sysfs_scheme_filter_type_attr.attr, &damon_sysfs_scheme_filter_matching_attr.attr, &damon_sysfs_scheme_filter_memcg_path_attr.attr, + &damon_sysfs_scheme_filter_addr_start_attr.attr, + &damon_sysfs_scheme_filter_addr_end_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_scheme_filter); @@ -1484,7 +1532,15 @@ static int damon_sysfs_set_scheme_filters(struct damos *scheme, damos_destroy_filter(filter); return err; } + } else if (filter->type == DAMOS_FILTER_TYPE_ADDR) { + if (sysfs_filter->addr_range.end < + sysfs_filter->addr_range.start) { + damos_destroy_filter(filter); + return -EINVAL; + } + filter->addr_range = sysfs_filter->addr_range; } + damos_add_filter(scheme, filter); } return 0; From 26713c8908752a7edca18dcafe88e36dccfb41a2 Mon Sep 17 00:00:00 2001 From: 
SeongJae Park Date: Wed, 2 Aug 2023 21:43:02 +0000 Subject: [PATCH 284/489] mm/damon/core-test: add a unit test for __damos_filter_out() Implement a kunit test for the core of address range DAMOS filter handling, namely __damos_filter_out(). The test especially focus on regions that overlap with given filter's target address range. Link: https://lkml.kernel.org/r/20230802214312.110532-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/core-test.h | 61 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index 4bddbfe243c3a0..6cc8b245586d7d 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -353,6 +353,66 @@ static void damos_test_new_filter(struct kunit *test) damos_destroy_filter(filter); } +static void damos_test_filter_out(struct kunit *test) +{ + struct damon_target *t; + struct damon_region *r, *r2; + struct damos_filter *f; + + f = damos_new_filter(DAMOS_FILTER_TYPE_ADDR, true); + f->addr_range = (struct damon_addr_range){ + .start = DAMON_MIN_REGION * 2, .end = DAMON_MIN_REGION * 6}; + + t = damon_new_target(); + r = damon_new_region(DAMON_MIN_REGION * 3, DAMON_MIN_REGION * 5); + damon_add_region(r, t); + + /* region in the range */ + KUNIT_EXPECT_TRUE(test, __damos_filter_out(NULL, t, r, f)); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); + + /* region before the range */ + r->ar.start = DAMON_MIN_REGION * 1; + r->ar.end = DAMON_MIN_REGION * 2; + KUNIT_EXPECT_FALSE(test, __damos_filter_out(NULL, t, r, f)); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); + + /* region after the range */ + r->ar.start = DAMON_MIN_REGION * 6; + r->ar.end = DAMON_MIN_REGION * 8; + KUNIT_EXPECT_FALSE(test, __damos_filter_out(NULL, t, r, f)); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1); + + /* region started before the range */ + r->ar.start = DAMON_MIN_REGION * 1; + r->ar.end = 
DAMON_MIN_REGION * 4; + KUNIT_EXPECT_FALSE(test, __damos_filter_out(NULL, t, r, f)); + /* filter should have split the region */ + KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 1); + KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 2); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 2); + r2 = damon_next_region(r); + KUNIT_EXPECT_EQ(test, r2->ar.start, DAMON_MIN_REGION * 2); + KUNIT_EXPECT_EQ(test, r2->ar.end, DAMON_MIN_REGION * 4); + damon_destroy_region(r2, t); + + /* region started in the range */ + r->ar.start = DAMON_MIN_REGION * 2; + r->ar.end = DAMON_MIN_REGION * 8; + KUNIT_EXPECT_TRUE(test, __damos_filter_out(NULL, t, r, f)); + /* filter should have split the region */ + KUNIT_EXPECT_EQ(test, r->ar.start, DAMON_MIN_REGION * 2); + KUNIT_EXPECT_EQ(test, r->ar.end, DAMON_MIN_REGION * 6); + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 2); + r2 = damon_next_region(r); + KUNIT_EXPECT_EQ(test, r2->ar.start, DAMON_MIN_REGION * 6); + KUNIT_EXPECT_EQ(test, r2->ar.end, DAMON_MIN_REGION * 8); + damon_destroy_region(r2, t); + + damon_free_target(t); + damos_free_filter(f); +} + static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_target), KUNIT_CASE(damon_test_regions), @@ -366,6 +426,7 @@ static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_update_monitoring_result), KUNIT_CASE(damon_test_set_attrs), KUNIT_CASE(damos_test_new_filter), + KUNIT_CASE(damos_test_filter_out), {}, }; From 4c45c20d53488d05da6e240f3ac4dffbd3963f94 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 2 Aug 2023 21:43:03 +0000 Subject: [PATCH 285/489] selftests/damon/sysfs: test address range damos filter Add a selftest for checking existence of addr_{start,end} files under DAMOS filter directory, and 'addr' damos filter type input of DAMON sysfs interface. 
Link: https://lkml.kernel.org/r/20230802214312.110532-5-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/sysfs.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/damon/sysfs.sh b/tools/testing/selftests/damon/sysfs.sh index 967e2726754998..5677cfd342fcca 100644 --- a/tools/testing/selftests/damon/sysfs.sh +++ b/tools/testing/selftests/damon/sysfs.sh @@ -103,9 +103,12 @@ test_filter() ensure_file "$filter_dir/type" "exist" "600" ensure_write_succ "$filter_dir/type" "anon" "valid input" ensure_write_succ "$filter_dir/type" "memcg" "valid input" + ensure_write_succ "$filter_dir/type" "addr" "valid input" ensure_write_fail "$filter_dir/type" "foo" "invalid input" ensure_file "$filter_dir/matching" "exist" "600" ensure_file "$filter_dir/memcg_path" "exist" "600" + ensure_file "$filter_dir/addr_start" "exist" "600" + ensure_file "$filter_dir/addr_end" "exist" "600" } test_filters() From 96a7cb23778a4b2f8fbe714cbcfa15d1475d430f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 2 Aug 2023 21:43:04 +0000 Subject: [PATCH 286/489] Docs/mm/damon/design: update for address range filters Update DAMON design document's DAMOS filters section for address range DAMOS filters. Because address range filters are handled by the core layer and it makes difference in schemes tried regions and schemes statistics, clearly describe it. 
Link: https://lkml.kernel.org/r/20230802214312.110532-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/mm/damon/design.rst | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index 4bfdf1d30c4aed..134912166f5acf 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -380,12 +380,21 @@ number of filters for each scheme. Each filter specifies the type of target memory, and whether it should exclude the memory of the type (filter-out), or all except the memory of the type (filter-in). -As of this writing, anonymous page type and memory cgroup type are supported by -the feature. Some filter target types can require additional arguments. For -example, the memory cgroup filter type asks users to specify the file path of -the memory cgroup for the filter. Hence, users can apply specific schemes to -only anonymous pages, non-anonymous pages, pages of specific cgroups, all pages -excluding those of specific cgroups, and any combination of those. +Currently, anonymous page, memory cgroup, and address range type filters are +supported by the feature. Some filter target types can require additional +arguments. For example, the memory cgroup filter type asks users to specify +the file path of the memory cgroup for the filter, while the address range type +asks the start and end addresses of the range. Hence, users can apply specific +schemes to only anonymous pages, non-anonymous pages, pages of specific +cgroups, all pages excluding those of specific cgroups, pages in specific +address range, and any combination of those. + +To handle filters efficiently, the address range type filter is handled by the +core layer, while others are handled by operations set. 
If a memory region is +filtered by the core layer-handled filter, it is not counted as the scheme has +tried to the region. In contrast, if a memory regions is filtered by an +operations set layer-handled filter, it is counted as the scheme has tried. +The difference in accounting leads to changes in the statistics. Application Programming Interface From 2beb97fcbf87d7424ee95c76ba08a2b2454406e0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 2 Aug 2023 21:43:05 +0000 Subject: [PATCH 287/489] Docs/ABI/damon: update for address range DAMOS filter Update DAMON ABI document for address ranges type DAMOS filter files. Link: https://lkml.kernel.org/r/20230802214312.110532-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- .../ABI/testing/sysfs-kernel-mm-damon | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon index 3d9aaa1cafa9bd..0bc074d4618ce5 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-damon +++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon @@ -271,8 +271,9 @@ What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters/ Date: Dec 2022 Contact: SeongJae Park Description: Writing to and reading from this file sets and gets the type of - the memory of the interest. 'anon' for anonymous pages, or - 'memcg' for specific memory cgroup can be written and read. + the memory of the interest. 'anon' for anonymous pages, + 'memcg' for specific memory cgroup, or 'addr' for address range + (an open-ended interval) can be written and read. What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters//memcg_path Date: Dec 2022 @@ -281,6 +282,21 @@ Description: If 'memcg' is written to the 'type' file, writing to and reading from this file sets and gets the path to the memory cgroup of the interest. 
+What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters//addr_start +Date: Jul 2023 +Contact: SeongJae Park +Description: If 'addr' is written to the 'type' file, writing to or reading + from this file sets or gets the start address of the address + range for the filter. + +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters//addr_end +Date: Jul 2023 +Contact: SeongJae Park +Description: If 'addr' is written to the 'type' file, writing to or reading + from this file sets or gets the end address of the address + range for the filter. + + What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters//matching Date: Dec 2022 Contact: SeongJae Park From 375af850385c787fc7115bf304c48b475818e5e4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 2 Aug 2023 21:43:06 +0000 Subject: [PATCH 288/489] Docs/admin-guide/mm/damon/usage: update for address range type DAMOS filter Update DAMON usage document for the newly added address range type DAMOS filter. Link: https://lkml.kernel.org/r/20230802214312.110532-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 31 +++++++++++++------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index 1859dd6c383477..a9cb9949b79686 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -363,15 +363,18 @@ number (``N``) to the file creates the number of child directories named ``0`` to ``N-1``. Each directory represents each filter. The filters are evaluated in the numeric order. -Each filter directory contains three files, namely ``type``, ``matcing``, and -``memcg_path``. You can write one of two special keywords, ``anon`` for -anonymous pages, or ``memcg`` for specific memory cgroup filtering. 
In case of -the memory cgroup filtering, you can specify the memory cgroup of the interest -by writing the path of the memory cgroup from the cgroups mount point to -``memcg_path`` file. You can write ``Y`` or ``N`` to ``matching`` file to -filter out pages that does or does not match to the type, respectively. Then, -the scheme's action will not be applied to the pages that specified to be -filtered out. +Each filter directory contains five files, namely ``type``, ``matcing``, +``memcg_path``, ``addr_start``, and ``addr_end``. To ``type`` file, you can +write one of three special keywords: ``anon`` for anonymous pages, ``memcg`` +for specific memory cgroup, or ``addr`` for specific address range (an +open-ended interval) filtering. In case of the memory cgroup filtering, you +can specify the memory cgroup of the interest by writing the path of the memory +cgroup from the cgroups mount point to ``memcg_path`` file. In case of the +address range filtering, you can specify the start and end address of the range +to ``addr_start`` and ``addr_end`` files, respectively. You can write ``Y`` or +``N`` to ``matching`` file to filter out pages that does or does not match to +the type, respectively. Then, the scheme's action will not be applied to the +pages that specified to be filtered out. For example, below restricts a DAMOS action to be applied to only non-anonymous pages of all memory cgroups except ``/having_care_already``.:: @@ -385,8 +388,14 @@ pages of all memory cgroups except ``/having_care_already``.:: echo /having_care_already > 1/memcg_path echo N > 1/matching -Note that filters are currently supported only when ``paddr`` -`implementation ` is being used. +Note that ``anon`` and ``memcg`` filters are currently supported only when +``paddr`` `implementation ` is being used. 
+ +Also, memory regions that are filtered out by ``addr`` filters are not counted +as the scheme has tried to those, while regions that filtered out by other type +filters are counted as the scheme has tried to. The difference is applied to +:ref:`stats ` and :ref:`tried regions +`. .. _sysfs_schemes_stats: From 17e7c724d3c2e622c4d9969b7a473e8ed1d14ff0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 2 Aug 2023 21:43:07 +0000 Subject: [PATCH 289/489] mm/damon/core: implement target type damos filter One DAMON context can have multiple monitoring targets, and DAMOS schemes are applied to all targets. In some cases, users need to apply different schemes to different targets. Retrieving monitoring results via DAMON sysfs interface's 'tried_regions' directory could be one good example. Also, there could be cases where the cgroup DAMOS filter is not enough. All such use cases can be worked around by having multiple DAMON contexts each having only a single target, but it is inefficient in terms of resource usage, though the overhead is not estimated to be huge. Implement DAMON monitoring target based DAMOS filter for the case. Like address range target DAMOS filter, handle these filters in the DAMON core layer, since it is more efficient than doing it in the operations set layer. This also means that regions that are filtered out by monitoring target type DAMOS filters are counted as not tried by the scheme. Hence, target granularity monitoring results retrieval via DAMON sysfs interface becomes available.
Link: https://lkml.kernel.org/r/20230802214312.110532-9-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 ++++++ mm/damon/core.c | 22 ++++++++++++++++------ 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 476f37a883a415..ae2664d1d5f1de 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -227,6 +227,7 @@ struct damos_stat { * @DAMOS_FILTER_TYPE_ANON: Anonymous pages. * @DAMOS_FILTER_TYPE_MEMCG: Specific memcg's pages. * @DAMOS_FILTER_TYPE_ADDR: Address range. + * @DAMOS_FILTER_TYPE_TARGET: Data Access Monitoring target. * @NR_DAMOS_FILTER_TYPES: Number of filter types. * * The anon pages type and memcg type filters are handled by underlying @@ -244,6 +245,7 @@ enum damos_filter_type { DAMOS_FILTER_TYPE_ANON, DAMOS_FILTER_TYPE_MEMCG, DAMOS_FILTER_TYPE_ADDR, + DAMOS_FILTER_TYPE_TARGET, NR_DAMOS_FILTER_TYPES, }; @@ -253,6 +255,9 @@ enum damos_filter_type { * @matching: If the matching page should filtered out or in. * @memcg_id: Memcg id of the question if @type is DAMOS_FILTER_MEMCG. * @addr_range: Address range if @type is DAMOS_FILTER_TYPE_ADDR. + * @target_idx: Index of the &struct damon_target of + * &damon_ctx->adaptive_targets if @type is + * DAMOS_FILTER_TYPE_TARGET. * @list: List head for siblings. 
* * Before applying the &damos->action to a memory region, DAMOS checks if each @@ -266,6 +271,7 @@ struct damos_filter { union { unsigned short memcg_id; struct damon_addr_range addr_range; + int target_idx; }; struct list_head list; }; diff --git a/mm/damon/core.c b/mm/damon/core.c index 68a5fb1c039d90..c1f1483c5082fd 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -877,13 +877,23 @@ static void damos_update_stat(struct damos *s, s->stat.sz_applied += sz_applied; } -static bool __damos_filter_out(struct damon_target *t, struct damon_region *r, - struct damos_filter *filter) +static bool __damos_filter_out(struct damon_ctx *ctx, struct damon_target *t, + struct damon_region *r, struct damos_filter *filter) { bool matched = false; + struct damon_target *ti; + int target_idx = 0; unsigned long start, end; switch (filter->type) { + case DAMOS_FILTER_TYPE_TARGET: + damon_for_each_target(ti, ctx) { + if (ti == t) + break; + target_idx++; + } + matched = target_idx == filter->target_idx; + break; case DAMOS_FILTER_TYPE_ADDR: start = ALIGN_DOWN(filter->addr_range.start, DAMON_MIN_REGION); end = ALIGN_DOWN(filter->addr_range.end, DAMON_MIN_REGION); @@ -915,13 +925,13 @@ static bool __damos_filter_out(struct damon_target *t, struct damon_region *r, return matched == filter->matching; } -static bool damos_filter_out(struct damon_target *t, struct damon_region *r, - struct damos *s) +static bool damos_filter_out(struct damon_ctx *ctx, struct damon_target *t, + struct damon_region *r, struct damos *s) { struct damos_filter *filter; damos_for_each_filter(filter, s) { - if (__damos_filter_out(t, r, filter)) + if (__damos_filter_out(ctx, t, r, filter)) return true; } return false; @@ -944,7 +954,7 @@ static void damos_apply_scheme(struct damon_ctx *c, struct damon_target *t, goto update_stat; damon_split_region_at(t, r, sz); } - if (damos_filter_out(t, r, s)) + if (damos_filter_out(c, t, r, s)) return; ktime_get_coarse_ts64(&begin); if (c->callback.before_damos_apply) From 
9f6e47abfcb40c2f97f6987fca086ff463de2381 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 2 Aug 2023 21:43:08 +0000 Subject: [PATCH 290/489] mm/damon/sysfs-schemes: support target damos filter Extend DAMON sysfs interface to support the DAMON monitoring target based DAMOS filter. Users can use it via writing 'target' to the filter's 'type' file and specifying the index of the target from the corresponding DAMON context's monitoring targets list to 'target_idx' sysfs file. Link: https://lkml.kernel.org/r/20230802214312.110532-10-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/sysfs-schemes.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c index 03ddba3e216d13..527e7d17eb3b23 100644 --- a/mm/damon/sysfs-schemes.c +++ b/mm/damon/sysfs-schemes.c @@ -283,6 +283,7 @@ struct damon_sysfs_scheme_filter { bool matching; char *memcg_path; struct damon_addr_range addr_range; + int target_idx; }; static struct damon_sysfs_scheme_filter *damon_sysfs_scheme_filter_alloc(void) @@ -295,6 +296,7 @@ static const char * const damon_sysfs_scheme_filter_type_strs[] = { "anon", "memcg", "addr", + "target", }; static ssize_t type_show(struct kobject *kobj, @@ -413,6 +415,25 @@ static ssize_t addr_end_store(struct kobject *kobj, return err ? 
err : count; } +static ssize_t damon_target_idx_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + + return sysfs_emit(buf, "%d\n", filter->target_idx); +} + +static ssize_t damon_target_idx_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + struct damon_sysfs_scheme_filter *filter = container_of(kobj, + struct damon_sysfs_scheme_filter, kobj); + int err = kstrtoint(buf, 0, &filter->target_idx); + + return err ? err : count; +} + static void damon_sysfs_scheme_filter_release(struct kobject *kobj) { struct damon_sysfs_scheme_filter *filter = container_of(kobj, @@ -437,12 +458,16 @@ static struct kobj_attribute damon_sysfs_scheme_filter_addr_start_attr = static struct kobj_attribute damon_sysfs_scheme_filter_addr_end_attr = __ATTR_RW_MODE(addr_end, 0600); +static struct kobj_attribute damon_sysfs_scheme_filter_damon_target_idx_attr = + __ATTR_RW_MODE(damon_target_idx, 0600); + static struct attribute *damon_sysfs_scheme_filter_attrs[] = { &damon_sysfs_scheme_filter_type_attr.attr, &damon_sysfs_scheme_filter_matching_attr.attr, &damon_sysfs_scheme_filter_memcg_path_attr.attr, &damon_sysfs_scheme_filter_addr_start_attr.attr, &damon_sysfs_scheme_filter_addr_end_attr.attr, + &damon_sysfs_scheme_filter_damon_target_idx_attr.attr, NULL, }; ATTRIBUTE_GROUPS(damon_sysfs_scheme_filter); @@ -1539,6 +1564,8 @@ static int damon_sysfs_set_scheme_filters(struct damos *scheme, return -EINVAL; } filter->addr_range = sysfs_filter->addr_range; + } else if (filter->type == DAMOS_FILTER_TYPE_TARGET) { + filter->target_idx = sysfs_filter->target_idx; } damos_add_filter(scheme, filter); From 9628ace840614abcdd99abc0b313edc4c6d0b1ad Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 2 Aug 2023 21:43:09 +0000 Subject: [PATCH 291/489] selftests/damon/sysfs: test damon_target filter Test existence of files and 
validity of input keyword for DAMON monitoring target based DAMOS filter on DAMON sysfs interface. Link: https://lkml.kernel.org/r/20230802214312.110532-11-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/sysfs.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/selftests/damon/sysfs.sh b/tools/testing/selftests/damon/sysfs.sh index 5677cfd342fcca..60a9a305aef071 100644 --- a/tools/testing/selftests/damon/sysfs.sh +++ b/tools/testing/selftests/damon/sysfs.sh @@ -104,11 +104,13 @@ test_filter() ensure_write_succ "$filter_dir/type" "anon" "valid input" ensure_write_succ "$filter_dir/type" "memcg" "valid input" ensure_write_succ "$filter_dir/type" "addr" "valid input" + ensure_write_succ "$filter_dir/type" "target" "valid input" ensure_write_fail "$filter_dir/type" "foo" "invalid input" ensure_file "$filter_dir/matching" "exist" "600" ensure_file "$filter_dir/memcg_path" "exist" "600" ensure_file "$filter_dir/addr_start" "exist" "600" ensure_file "$filter_dir/addr_end" "exist" "600" + ensure_file "$filter_dir/damon_target_idx" "exist" "600" } test_filters() From 08ad3bb3edc08b64762dadbfee5bd00143d7f40e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 2 Aug 2023 21:43:10 +0000 Subject: [PATCH 292/489] Docs/mm/damon/design: update for DAMON monitoring target type DAMOS filter Update DAMON design document for the newly added DAMON monitoring target type DAMOS filter. 
Link: https://lkml.kernel.org/r/20230802214312.110532-12-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/mm/damon/design.rst | 33 +++++++++++++++++-------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/Documentation/mm/damon/design.rst b/Documentation/mm/damon/design.rst index 134912166f5acf..a20383d01a95d6 100644 --- a/Documentation/mm/damon/design.rst +++ b/Documentation/mm/damon/design.rst @@ -380,21 +380,24 @@ number of filters for each scheme. Each filter specifies the type of target memory, and whether it should exclude the memory of the type (filter-out), or all except the memory of the type (filter-in). -Currently, anonymous page, memory cgroup, and address range type filters are -supported by the feature. Some filter target types can require additional -arguments. For example, the memory cgroup filter type asks users to specify -the file path of the memory cgroup for the filter, while the address range type -asks the start and end addresses of the range. Hence, users can apply specific -schemes to only anonymous pages, non-anonymous pages, pages of specific -cgroups, all pages excluding those of specific cgroups, pages in specific -address range, and any combination of those. - -To handle filters efficiently, the address range type filter is handled by the -core layer, while others are handled by operations set. If a memory region is -filtered by the core layer-handled filter, it is not counted as the scheme has -tried to the region. In contrast, if a memory regions is filtered by an -operations set layer-handled filter, it is counted as the scheme has tried. -The difference in accounting leads to changes in the statistics. +Currently, anonymous page, memory cgroup, address range, and DAMON monitoring +target type filters are supported by the feature. Some filter target types +require additional arguments. 
The memory cgroup filter type asks users to +specify the file path of the memory cgroup for the filter. The address range +type asks the start and end addresses of the range. The DAMON monitoring +target type asks the index of the target from the context's monitoring targets +list. Hence, users can apply specific schemes to only anonymous pages, +non-anonymous pages, pages of specific cgroups, all pages excluding those of +specific cgroups, pages in specific address range, pages in specific DAMON +monitoring targets, and any combination of those. + +To handle filters efficiently, the address range and DAMON monitoring target +type filters are handled by the core layer, while others are handled by +operations set. If a memory region is filtered by a core layer-handled filter, +it is not counted as the scheme has tried to the region. In contrast, if a +memory regions is filtered by an operations set layer-handled filter, it is +counted as the scheme has tried. The difference in accounting leads to changes +in the statistics. Application Programming Interface From d3d21d91ae93210bcc8e7bf9af81bda5124a3b3b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 2 Aug 2023 21:43:11 +0000 Subject: [PATCH 293/489] Docs/ABI/damon: update for DAMON monitoring target type DAMOS filter Update DAMON ABI document for the newly added DAMON monitoring target type DAMOS filter. 
Link: https://lkml.kernel.org/r/20230802214312.110532-13-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/ABI/testing/sysfs-kernel-mm-damon | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-damon b/Documentation/ABI/testing/sysfs-kernel-mm-damon index 0bc074d4618ce5..334352d198f8fc 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-damon +++ b/Documentation/ABI/testing/sysfs-kernel-mm-damon @@ -272,8 +272,9 @@ Date: Dec 2022 Contact: SeongJae Park Description: Writing to and reading from this file sets and gets the type of the memory of the interest. 'anon' for anonymous pages, - 'memcg' for specific memory cgroup, or 'addr' for address range - (an open-ended interval) can be written and read. + 'memcg' for specific memory cgroup, 'addr' for address range + (an open-ended interval), or 'target' for DAMON monitoring + target can be written and read. What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters//memcg_path Date: Dec 2022 @@ -296,6 +297,12 @@ Description: If 'addr' is written to the 'type' file, writing to or reading from this file sets or gets the end address of the address range for the filter. +What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters//target_idx +Date: Dec 2022 +Contact: SeongJae Park +Description: If 'target' is written to the 'type' file, writing to or + reading from this file sets or gets the index of the DAMON + monitoring target of the interest. 
What: /sys/kernel/mm/damon/admin/kdamonds//contexts//schemes//filters//matching Date: Dec 2022 From 41a7ed8cfd54d85adbb5f48cd3673dd7ff0eb450 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Wed, 2 Aug 2023 21:43:12 +0000 Subject: [PATCH 294/489] Docs/admin-guide/mm/damon/usage: update for DAMON monitoring target type DAMOS filter Update DAMON usage document for newly added DAMON monitoring target type DAMOS filter. Link: https://lkml.kernel.org/r/20230802214312.110532-14-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 37 +++++++++++--------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index a9cb9949b79686..084f0a32b421ab 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -363,18 +363,21 @@ number (``N``) to the file creates the number of child directories named ``0`` to ``N-1``. Each directory represents each filter. The filters are evaluated in the numeric order. -Each filter directory contains five files, namely ``type``, ``matcing``, -``memcg_path``, ``addr_start``, and ``addr_end``. To ``type`` file, you can -write one of three special keywords: ``anon`` for anonymous pages, ``memcg`` -for specific memory cgroup, or ``addr`` for specific address range (an -open-ended interval) filtering. In case of the memory cgroup filtering, you -can specify the memory cgroup of the interest by writing the path of the memory -cgroup from the cgroups mount point to ``memcg_path`` file. In case of the -address range filtering, you can specify the start and end address of the range -to ``addr_start`` and ``addr_end`` files, respectively. You can write ``Y`` or -``N`` to ``matching`` file to filter out pages that does or does not match to -the type, respectively. 
Then, the scheme's action will not be applied to the -pages that specified to be filtered out. +Each filter directory contains six files, namely ``type``, ``matcing``, +``memcg_path``, ``addr_start``, ``addr_end``, and ``target_idx``. To ``type`` +file, you can write one of four special keywords: ``anon`` for anonymous pages, +``memcg`` for specific memory cgroup, ``addr`` for specific address range (an +open-ended interval), or ``target`` for specific DAMON monitoring target +filtering. In case of the memory cgroup filtering, you can specify the memory +cgroup of the interest by writing the path of the memory cgroup from the +cgroups mount point to ``memcg_path`` file. In case of the address range +filtering, you can specify the start and end address of the range to +``addr_start`` and ``addr_end`` files, respectively. For the DAMON monitoring +target filtering, you can specify the index of the target between the list of +the DAMON context's monitoring targets list to ``target_idx`` file. You can +write ``Y`` or ``N`` to ``matching`` file to filter out pages that does or does +not match to the type, respectively. Then, the scheme's action will not be +applied to the pages that specified to be filtered out. For example, below restricts a DAMOS action to be applied to only non-anonymous pages of all memory cgroups except ``/having_care_already``.:: @@ -391,11 +394,11 @@ pages of all memory cgroups except ``/having_care_already``.:: Note that ``anon`` and ``memcg`` filters are currently supported only when ``paddr`` `implementation ` is being used. -Also, memory regions that are filtered out by ``addr`` filters are not counted -as the scheme has tried to those, while regions that filtered out by other type -filters are counted as the scheme has tried to. The difference is applied to -:ref:`stats ` and :ref:`tried regions -`. 
+Also, memory regions that are filtered out by ``addr`` or ``target`` filters +are not counted as the scheme has tried to those, while regions that filtered +out by other type filters are counted as the scheme has tried to. The +difference is applied to :ref:`stats ` and +:ref:`tried regions `. .. _sysfs_schemes_stats: From 73d4719363371afdcd63143c3532fa6a9443de13 Mon Sep 17 00:00:00 2001 From: Ruan Jinjie Date: Thu, 3 Aug 2023 19:38:23 +0800 Subject: [PATCH 295/489] mm/z3fold: use helper function put_z3fold_locked() and put_z3fold_locked_list() This code is already duplicated six times, use helper function put_z3fold_locked() to release z3fold page instead of open code it to help improve code readability a bit. And add put_z3fold_locked_list() helper function to be consistent with it. No functional change involved. Link: https://lkml.kernel.org/r/20230803113824.886413-1-ruanjinjie@huawei.com Signed-off-by: Ruan Jinjie Reviewed-by: Miaohe Lin Cc: Vitaly Wool Signed-off-by: Andrew Morton --- mm/z3fold.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index e84de91ecccb03..7952adf9bede6f 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -480,6 +480,16 @@ static void release_z3fold_page_locked_list(struct kref *ref) __release_z3fold_page(zhdr, true); } +static inline int put_z3fold_locked(struct z3fold_header *zhdr) +{ + return kref_put(&zhdr->refcount, release_z3fold_page_locked); +} + +static inline int put_z3fold_locked_list(struct z3fold_header *zhdr) +{ + return kref_put(&zhdr->refcount, release_z3fold_page_locked_list); +} + static void free_pages_work(struct work_struct *w) { struct z3fold_pool *pool = container_of(w, struct z3fold_pool, work); @@ -666,7 +676,7 @@ static struct z3fold_header *compact_single_buddy(struct z3fold_header *zhdr) return new_zhdr; out_fail: - if (new_zhdr && !kref_put(&new_zhdr->refcount, release_z3fold_page_locked)) { + if (new_zhdr && !put_z3fold_locked(new_zhdr)) { 
add_to_unbuddied(pool, new_zhdr); z3fold_page_unlock(new_zhdr); } @@ -741,7 +751,7 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked) list_del_init(&zhdr->buddy); spin_unlock(&pool->lock); - if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) + if (put_z3fold_locked(zhdr)) return; if (test_bit(PAGE_STALE, &page->private) || @@ -752,7 +762,7 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked) if (!zhdr->foreign_handles && buddy_single(zhdr) && zhdr->mapped_count == 0 && compact_single_buddy(zhdr)) { - if (!kref_put(&zhdr->refcount, release_z3fold_page_locked)) { + if (!put_z3fold_locked(zhdr)) { clear_bit(PAGE_CLAIMED, &page->private); z3fold_page_unlock(zhdr); } @@ -878,7 +888,7 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool, return zhdr; out_fail: - if (!kref_put(&zhdr->refcount, release_z3fold_page_locked)) { + if (!put_z3fold_locked(zhdr)) { add_to_unbuddied(pool, zhdr); z3fold_page_unlock(zhdr); } @@ -1012,8 +1022,7 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, if (zhdr) { bud = get_free_buddy(zhdr, chunks); if (bud == HEADLESS) { - if (!kref_put(&zhdr->refcount, - release_z3fold_page_locked)) + if (!put_z3fold_locked(zhdr)) z3fold_page_unlock(zhdr); pr_err("No free chunks in unbuddied\n"); WARN_ON(1); @@ -1129,7 +1138,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) if (!page_claimed) free_handle(handle, zhdr); - if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) + if (put_z3fold_locked_list(zhdr)) return; if (page_claimed) { /* the page has not been claimed by us */ @@ -1346,7 +1355,7 @@ static void z3fold_page_putback(struct page *page) if (!list_empty(&zhdr->buddy)) list_del_init(&zhdr->buddy); INIT_LIST_HEAD(&page->lru); - if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) + if (put_z3fold_locked(zhdr)) return; if (list_empty(&zhdr->buddy)) add_to_unbuddied(pool, zhdr); From 
c1dc69e6ce65d95e3d4d080868c3007f1a6fc4fe Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 3 Aug 2023 19:49:34 +0800 Subject: [PATCH 296/489] mm/page_alloc: remove unneeded variable base Since commit 5d0a661d808f ("mm/page_alloc: use only one PCP list for THP-sized allocations"), the local variable base is always the same as order, so remove it. No functional change intended. Link: https://lkml.kernel.org/r/20230803114934.693989-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- mm/page_alloc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8b17dcbb925da9..94f9e159a18df7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -538,8 +538,6 @@ static void bad_page(struct page *page, const char *reason) static inline unsigned int order_to_pindex(int migratetype, int order) { - int base = order; - #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (order > PAGE_ALLOC_COSTLY_ORDER) { VM_BUG_ON(order != pageblock_order); @@ -549,7 +547,7 @@ static inline unsigned int order_to_pindex(int migratetype, int order) VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); #endif - return (MIGRATE_PCPTYPES * base) + migratetype; + return (MIGRATE_PCPTYPES * order) + migratetype; } static inline int pindex_to_order(unsigned int pindex) From 3a1060c2615874bd4d66d72dfdabbc48496ef040 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 3 Aug 2023 20:00:21 +0800 Subject: [PATCH 297/489] mm/memcg: fix wrong function name above obj_cgroup_charge_zswap() The correct function name is obj_cgroup_may_zswap(). Correct the comment.
Link: https://lkml.kernel.org/r/20230803120021.762279-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Johannes Weiner Reviewed-by: Muchun Song Signed-off-by: Andrew Morton --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 56abc4f426f4dc..da9f983a090e1d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7777,7 +7777,7 @@ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg) * @objcg: the object cgroup * @size: size of compressed object * - * This forces the charge after obj_cgroup_may_swap() allowed + * This forces the charge after obj_cgroup_may_zswap() allowed * compression and storage in zwap for this cgroup to go ahead. */ void obj_cgroup_charge_zswap(struct obj_cgroup *objcg, size_t size) From 16951789008dc0029b1e073fb1c20c1abb4c6504 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Thu, 3 Aug 2023 17:48:58 +0800 Subject: [PATCH 298/489] mm/compaction: set compact_cached_free_pfn correctly in update_pageblock_skip Patch series "Fixes and cleanups to compaction", v2. This series contains random fixes and cleanups to free page isolation in compaction. This is based on another compact series[1]. More details can be found in respective patches. This patch (of 4): Since the skip hint is set on the page block of block_start_pfn, it is more reasonable to set compact_cached_free_pfn to the page block before block_start_pfn.
Link: https://lkml.kernel.org/r/20230803094901.2915942-1-shikemeng@huaweicloud.com Link: https://lkml.kernel.org/r/20230803094901.2915942-2-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Baolin Wang Cc: David Hildenbrand Cc: Mel Gorman Cc: Kemeng Shi Signed-off-by: Andrew Morton --- mm/compaction.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index c4d3a3129fd535..6e0c7456026b55 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1717,7 +1717,8 @@ static void isolate_freepages(struct compact_control *cc) /* Update the skip hint if the full pageblock was scanned */ if (isolate_start_pfn == block_end_pfn) - update_pageblock_skip(cc, page, block_start_pfn); + update_pageblock_skip(cc, page, block_start_pfn - + pageblock_nr_pages); /* Are enough freepages isolated? */ if (cc->nr_freepages >= cc->nr_migratepages) { From a2864a67452ec6e378e57cbe151aad62ccdcc03f Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Thu, 3 Aug 2023 17:48:59 +0800 Subject: [PATCH 299/489] mm/compaction: merge end_pfn boundary check in isolate_freepages_range Merge the end_pfn boundary checks for the single page block forward and multiple page blocks forward cases to avoid doing the boundary check twice in the multiple page blocks forward case.
Link: https://lkml.kernel.org/r/20230803094901.2915942-3-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Baolin Wang Cc: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/compaction.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 6e0c7456026b55..d32929f39dc472 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -740,8 +740,6 @@ isolate_freepages_range(struct compact_control *cc, /* Protect pfn from changing by isolate_freepages_block */ unsigned long isolate_start_pfn = pfn; - block_end_pfn = min(block_end_pfn, end_pfn); - /* * pfn could pass the block_end_pfn if isolated freepage * is more than pageblock order. In this case, we adjust @@ -750,9 +748,10 @@ isolate_freepages_range(struct compact_control *cc, if (pfn >= block_end_pfn) { block_start_pfn = pageblock_start_pfn(pfn); block_end_pfn = pageblock_end_pfn(pfn); - block_end_pfn = min(block_end_pfn, end_pfn); } + block_end_pfn = min(block_end_pfn, end_pfn); + if (!pageblock_pfn_to_page(block_start_pfn, block_end_pfn, cc->zone)) break; From dc13292cccfd50916af00a471208fb48deb4d72f Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Thu, 3 Aug 2023 17:49:00 +0800 Subject: [PATCH 300/489] mm/compaction: remove unnecessary cursor page in isolate_freepages_block The cursor is currently only used to move the page forward. We can simply move page forward directly and remove the unnecessary cursor.
Link: https://lkml.kernel.org/r/20230803094901.2915942-4-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Baolin Wang Cc: David Hildenbrand Cc: Kemeng Shi Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/compaction.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index d32929f39dc472..82fd543b410e1e 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -589,7 +589,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, bool strict) { int nr_scanned = 0, total_isolated = 0; - struct page *cursor; + struct page *page; unsigned long flags = 0; bool locked = false; unsigned long blockpfn = *start_pfn; @@ -599,12 +599,11 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, if (strict) stride = 1; - cursor = pfn_to_page(blockpfn); + page = pfn_to_page(blockpfn); /* Isolate free pages. */ - for (; blockpfn < end_pfn; blockpfn += stride, cursor += stride) { + for (; blockpfn < end_pfn; blockpfn += stride, page += stride) { int isolated; - struct page *page = cursor; /* * Periodically drop the lock (if held) regardless of its @@ -629,7 +628,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, if (likely(order <= MAX_ORDER)) { blockpfn += (1UL << order) - 1; - cursor += (1UL << order) - 1; + page += (1UL << order) - 1; nr_scanned += (1UL << order) - 1; } goto isolate_fail; @@ -666,7 +665,7 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, } /* Advance to the end of split page */ blockpfn += isolated - 1; - cursor += isolated - 1; + page += isolated - 1; continue; isolate_fail: From 13cfd63f3fec403ca8966079972aac4565fcf379 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Thu, 3 Aug 2023 17:49:01 +0800 Subject: [PATCH 301/489] mm/compaction: remove unnecessary "else continue" at end of loop in isolate_freepages_block There is no behavior change to remove "else continue" code at end of scan loop. 
Just remove it to make code cleaner. Link: https://lkml.kernel.org/r/20230803094901.2915942-5-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Baolin Wang Cc: David Hildenbrand Cc: Kemeng Shi Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/compaction.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 82fd543b410e1e..dc16efd5fac56a 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -671,8 +671,6 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, isolate_fail: if (strict) break; - else - continue; } From f720b471fdb35619402293dcd421761fb1942e27 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 1 Aug 2023 10:31:44 +0800 Subject: [PATCH 302/489] mm: hugetlb: use flush_hugetlb_tlb_range() in move_hugetlb_page_tables() Archs may need to do special things when flushing hugepage tlb, so use the more applicable flush_hugetlb_tlb_range() instead of flush_tlb_range(). Link: https://lkml.kernel.org/r/20230801023145.17026-2-wangkefeng.wang@huawei.com Fixes: 550a7d60bd5e ("mm, hugepages: add mremap() support for hugepage backed vma") Signed-off-by: Kefeng Wang Reviewed-by: Mike Kravetz Acked-by: Muchun Song Cc: Barry Song <21cnbao@gmail.com> Cc: Catalin Marinas Cc: Joel Fernandes (Google) Cc: Kalesh Singh Cc: "Kirill A. 
Shutemov" Cc: Mina Almasry Cc: Will Deacon Cc: William Kucharski Signed-off-by: Andrew Morton --- mm/hugetlb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 26e87d6cc92f93..102f83bd3a9f4a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5279,9 +5279,9 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, } if (shared_pmd) - flush_tlb_range(vma, range.start, range.end); + flush_hugetlb_tlb_range(vma, range.start, range.end); else - flush_tlb_range(vma, old_end - len, old_end); + flush_hugetlb_tlb_range(vma, old_end - len, old_end); mmu_notifier_invalidate_range_end(&range); i_mmap_unlock_write(mapping); hugetlb_vma_unlock_write(vma); From 9cf6a060f95578c8147bdacdf55a1eaaa182ce49 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 2 Aug 2023 09:27:31 +0800 Subject: [PATCH 303/489] arm64: hugetlb: enable __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE It is better to use the huge page size instead of PAGE_SIZE as the stride when flushing hugepages, which reduces the number of loop iterations in __flush_tlb_range(). Let's support arch's flush_hugetlb_tlb_range(), which is used in hugetlb_unshare_all_pmds(), move_hugetlb_page_tables() and hugetlb_change_protection() for now. Note: hugepages based on the contiguous bit have to be invalidated individually since the contiguous PTE bit is just a hint; the hardware may or may not take it into account. Link: https://lkml.kernel.org/r/20230802012731.62512-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Muchun Song Reviewed-by: Catalin Marinas Cc: Barry Song <21cnbao@gmail.com> Cc: Joel Fernandes (Google) Cc: Kalesh Singh Cc: "Kirill A.
Shutemov" Cc: Mike Kravetz Cc: Mina Almasry Cc: Will Deacon Cc: William Kucharski Signed-off-by: Andrew Morton --- arch/arm64/include/asm/hugetlb.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 6a4a1ab8eb238f..a91d6219aa7883 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -60,4 +60,19 @@ extern void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, #include +#define __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE +static inline void flush_hugetlb_tlb_range(struct vm_area_struct *vma, + unsigned long start, + unsigned long end) +{ + unsigned long stride = huge_page_size(hstate_vma(vma)); + + if (stride == PMD_SIZE) + __flush_tlb_range(vma, start, end, stride, false, 2); + else if (stride == PUD_SIZE) + __flush_tlb_range(vma, start, end, stride, false, 1); + else + __flush_tlb_range(vma, start, end, PAGE_SIZE, false, 0); +} + #endif /* __ASM_HUGETLB_H */ From dbdd2a989f2357d40f0c5a440ca81bf1390f11ba Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 4 Aug 2023 08:43:37 +0200 Subject: [PATCH 304/489] mm: no need to export mm_kobj There are no modules using mm_kobj, so do not export it. Link: https://lkml.kernel.org/r/2023080436-algebra-cabana-417d@gregkh Signed-off-by: Greg Kroah-Hartman Reviewed-by: Mike Rapoport (IBM) Reviewed-by: Miaohe Lin Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- mm/mm_init.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/mm_init.c b/mm/mm_init.c index 641c56fd08a286..a2fbaa8d917fc9 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -154,7 +154,6 @@ early_param("mminit_loglevel", set_mminit_loglevel); #endif /* CONFIG_DEBUG_MEMORY_INIT */ struct kobject *mm_kobj; -EXPORT_SYMBOL_GPL(mm_kobj); #ifdef CONFIG_SMP s32 vm_committed_as_batch = 32; From 83d97f620f611ab3fbf2de585bf34bd9dab513c2 Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Fri, 4 Aug 2023 12:59:46 -0400 Subject: [PATCH 305/489] maple_tree: add hex output to maple_arange64 dump Patch series "maple_tree: Change replacement strategy". The maple tree marks nodes dead as soon as they are going to be replaced. This could be problematic when used in the RCU context since the writer may be starved of CPU time by the readers. This patch set addresses the issue by switching the data replacement strategy to one that will only mark data as dead once the new data is available. This series changes the ordering of the node replacement so that the new data is live before the old data is marked 'dead'. When readers hit 'dead' nodes, they will restart from the top of the tree and end up in the new data. In more complex scenarios, the replacement strategy means a subtree is built and graphed into the tree leaving some nodes to point to the old parent. The view of tasks into the old data will either remain with the old data, or see the new data once the old data is marked 'dead'. Iterators will see the 'dead' node and restart on their own and switch to the new data. There is no risk of the reader seeing old data in these cases. The 'dead' subtree of data is then fully marked dead, but reused nodes will still point to the dead nodes until the parent pointer is updated. Walking up to a 'dead' node will cause a re-walk from the top of the tree and enter the new data area where old data is not reachable. Once the parent pointers are fully up to date in the active data, the 'dead' subtree is iterated to collect entirely 'dead' subtrees, and dead nodes (nodes that partially contained reused data). This patch (of 6): When dumping the tree, honour formatting request to output hex for the maple node type arange64. Link: https://lkml.kernel.org/r/20230804165951.2661157-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20230804165951.2661157-2-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Paul E. 
McKenney Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index a3d602cfd03029..880ce0fcdcac01 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -6833,11 +6833,27 @@ static void mt_dump_arange64(const struct maple_tree *mt, void *entry, int i; pr_cont(" contents: "); - for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) - pr_cont("%lu ", node->gap[i]); + for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) { + switch (format) { + case mt_dump_hex: + pr_cont("%lx ", node->gap[i]); + break; + default: + case mt_dump_dec: + pr_cont("%lu ", node->gap[i]); + } + } pr_cont("| %02X %02X| ", node->meta.end, node->meta.gap); - for (i = 0; i < MAPLE_ARANGE64_SLOTS - 1; i++) - pr_cont("%p %lu ", node->slot[i], node->pivot[i]); + for (i = 0; i < MAPLE_ARANGE64_SLOTS - 1; i++) { + switch (format) { + case mt_dump_hex: + pr_cont("%p %lX ", node->slot[i], node->pivot[i]); + break; + default: + case mt_dump_dec: + pr_cont("%p %lu ", node->slot[i], node->pivot[i]); + } + } pr_cont("%p\n", node->slot[i]); for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) { unsigned long last = max; From 72bcf4aa86ece2b49fbdc7fe83d3a05c7ebcfc97 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 4 Aug 2023 12:59:47 -0400 Subject: [PATCH 306/489] maple_tree: reorder replacement of nodes to avoid live lock Replacing nodes may cause a live lock-up if CPU resources are saturated by write operations on the tree by continuously retrying on dead nodes. To avoid the continuous retry scenario, ensure the new node is inserted into the tree prior to marking the old data as dead. This will define a window where old and new data is swapped. When reusing lower level nodes, ensure the parent pointer is updated after the parent is marked dead. 
This ensures that the child is still reachable from the top of the tree, but walking up to a dead node will result in a single retry that will start a fresh walk from the top down through the new node. Link: https://lkml.kernel.org/r/20230804165951.2661157-3-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Paul E. McKenney Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 56 +++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 46 insertions(+), 10 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 880ce0fcdcac01..0d4573a8d1345f 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1756,6 +1756,36 @@ static inline void mas_replace(struct ma_state *mas, bool advanced) } } +/* + * mas_replace_node() - Replace a node by putting it in the tree, marking it + * dead, and freeing it. + * the parent encoding to locate the maple node in the tree. + * @mas - the ma_state with @mas->node pointing to the new node. + * @old_enode - The old maple encoded node. + */ +static inline void mas_replace_node(struct ma_state *mas, + struct maple_enode *old_enode) + __must_hold(mas->tree->ma_lock) +{ + if (mte_is_root(mas->node)) { + mas_mn(mas)->parent = ma_parent_ptr( + ((unsigned long)mas->tree | MA_ROOT_PARENT)); + rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); + mas_set_height(mas); + } else { + unsigned char offset = 0; + void __rcu **slots = NULL; + + offset = mte_parent_slot(mas->node); + slots = ma_slots(mte_parent(mas->node), + mas_parent_type(mas, mas->node)); + rcu_assign_pointer(slots[offset], mas->node); + } + + mte_set_node_dead(old_enode); + mas_free(mas, old_enode); +} + /* * mas_new_child() - Find the new child of a node. 
* @mas: the maple state @@ -3176,7 +3206,7 @@ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end { enum maple_type mt = mte_node_type(mas->node); struct maple_node reuse, *newnode, *parent, *new_left, *left, *node; - struct maple_enode *eparent; + struct maple_enode *eparent, *old_eparent; unsigned char offset, tmp, split = mt_slots[mt] / 2; void __rcu **l_slots, **slots; unsigned long *l_pivs, *pivs, gap; @@ -3218,7 +3248,7 @@ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end l_mas.max = l_pivs[split]; mas->min = l_mas.max + 1; - eparent = mt_mk_node(mte_parent(l_mas.node), + old_eparent = mt_mk_node(mte_parent(l_mas.node), mas_parent_type(&l_mas, l_mas.node)); tmp += end; if (!in_rcu) { @@ -3234,7 +3264,7 @@ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end memcpy(node, newnode, sizeof(struct maple_node)); ma_set_meta(node, mt, 0, tmp - 1); - mte_set_pivot(eparent, mte_parent_slot(l_mas.node), + mte_set_pivot(old_eparent, mte_parent_slot(l_mas.node), l_pivs[split]); /* Remove data from l_pivs. 
*/ @@ -3242,6 +3272,7 @@ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end memset(l_pivs + tmp, 0, sizeof(unsigned long) * (max_p - tmp)); memset(l_slots + tmp, 0, sizeof(void *) * (max_s - tmp)); ma_set_meta(left, mt, 0, split); + eparent = old_eparent; goto done; } @@ -3266,7 +3297,7 @@ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end parent = mas_pop_node(mas); slots = ma_slots(parent, mt); pivs = ma_pivots(parent, mt); - memcpy(parent, mte_to_node(eparent), sizeof(struct maple_node)); + memcpy(parent, mte_to_node(old_eparent), sizeof(struct maple_node)); rcu_assign_pointer(slots[offset], mas->node); rcu_assign_pointer(slots[offset - 1], l_mas.node); pivs[offset - 1] = l_mas.max; @@ -3278,8 +3309,10 @@ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end mte_set_gap(eparent, mte_parent_slot(l_mas.node), gap); mas_ascend(mas); - if (in_rcu) - mas_replace(mas, false); + if (in_rcu) { + mas_replace_node(mas, old_eparent); + mas_adopt_children(mas, mas->node); + } mas_update_gap(mas); } @@ -3596,11 +3629,13 @@ static noinline_for_kasan int mas_commit_b_node(struct ma_wr_state *wr_mas, struct maple_big_node *b_node, unsigned char end) { struct maple_node *node; + struct maple_enode *old_enode; unsigned char b_end = b_node->b_end; enum maple_type b_type = b_node->type; + old_enode = wr_mas->mas->node; if ((b_end < mt_min_slots[b_type]) && - (!mte_is_root(wr_mas->mas->node)) && + (!mte_is_root(old_enode)) && (mas_mt_height(wr_mas->mas) > 1)) return mas_rebalance(wr_mas->mas, b_node); @@ -3618,7 +3653,7 @@ static noinline_for_kasan int mas_commit_b_node(struct ma_wr_state *wr_mas, node->parent = mas_mn(wr_mas->mas)->parent; wr_mas->mas->node = mt_mk_node(node, b_type); mab_mas_cp(b_node, 0, b_end, wr_mas->mas, false); - mas_replace(wr_mas->mas, false); + mas_replace_node(wr_mas->mas, old_enode); reuse_node: mas_update_gap(wr_mas->mas); return 1; @@ -4117,9 +4152,10 @@ static 
inline bool mas_wr_node_store(struct ma_wr_state *wr_mas, done: mas_leaf_set_meta(mas, newnode, dst_pivots, maple_leaf_64, new_end); if (in_rcu) { - mte_set_node_dead(mas->node); + struct maple_enode *old_enode = mas->node; + mas->node = mt_mk_node(newnode, wr_mas->type); - mas_replace(mas, false); + mas_replace_node(mas, old_enode); } else { memcpy(wr_mas->node, newnode, sizeof(struct maple_node)); } From 1238f6a226dc27ec34d229b71b02f0d6c46bbf11 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 4 Aug 2023 12:59:48 -0400 Subject: [PATCH 307/489] maple_tree: introduce mas_put_in_tree() mas_replace() has a single user that takes a flag which is now always true. Replace this function with mas_put_in_tree() to better align with mas_replace_node(). Inline the remaining logic into the only caller; mas_wmb_replace(). Link: https://lkml.kernel.org/r/20230804165951.2661157-4-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Paul E. McKenney Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 73 ++++++++++++++++++------------------------------ 1 file changed, 27 insertions(+), 46 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 0d4573a8d1345f..c01b1be1480c7d 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1715,45 +1715,32 @@ static inline void mas_adopt_children(struct ma_state *mas, } /* - * mas_replace() - Replace a maple node in the tree with mas->node. Uses the - * parent encoding to locate the maple node in the tree. - * @mas - the ma_state to use for operations. - * @advanced - boolean to adopt the child nodes and free the old node (false) or - * leave the node (true) and handle the adoption and free elsewhere. + * mas_put_in_tree() - Put a new node in the tree, smp_wmb(), and mark the old + * node as dead. + * @mas - the maple state with the new node + * @old_enode - The old maple encoded node to replace. 
*/ -static inline void mas_replace(struct ma_state *mas, bool advanced) +static inline void mas_put_in_tree(struct ma_state *mas, + struct maple_enode *old_enode) __must_hold(mas->tree->ma_lock) { - struct maple_node *mn = mas_mn(mas); - struct maple_enode *old_enode; - unsigned char offset = 0; - void __rcu **slots = NULL; - - if (ma_is_root(mn)) { - old_enode = mas_root_locked(mas); - } else { - offset = mte_parent_slot(mas->node); - slots = ma_slots(mte_parent(mas->node), - mas_parent_type(mas, mas->node)); - old_enode = mas_slot_locked(mas, slots, offset); - } - - if (!advanced && !mte_is_leaf(mas->node)) - mas_adopt_children(mas, mas->node); + unsigned char offset; + void __rcu **slots; if (mte_is_root(mas->node)) { - mn->parent = ma_parent_ptr( + mas_mn(mas)->parent = ma_parent_ptr( ((unsigned long)mas->tree | MA_ROOT_PARENT)); rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); mas_set_height(mas); } else { + + offset = mte_parent_slot(mas->node); + slots = ma_slots(mte_parent(mas->node), + mas_parent_type(mas, mas->node)); rcu_assign_pointer(slots[offset], mas->node); } - if (!advanced) { - mte_set_node_dead(old_enode); - mas_free(mas, old_enode); - } + mte_set_node_dead(old_enode); } /* @@ -1767,22 +1754,7 @@ static inline void mas_replace_node(struct ma_state *mas, struct maple_enode *old_enode) __must_hold(mas->tree->ma_lock) { - if (mte_is_root(mas->node)) { - mas_mn(mas)->parent = ma_parent_ptr( - ((unsigned long)mas->tree | MA_ROOT_PARENT)); - rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); - mas_set_height(mas); - } else { - unsigned char offset = 0; - void __rcu **slots = NULL; - - offset = mte_parent_slot(mas->node); - slots = ma_slots(mte_parent(mas->node), - mas_parent_type(mas, mas->node)); - rcu_assign_pointer(slots[offset], mas->node); - } - - mte_set_node_dead(old_enode); + mas_put_in_tree(mas, old_enode); mas_free(mas, old_enode); } @@ -2789,11 +2761,20 @@ static inline void mas_wmb_replace(struct ma_state *mas, 
struct ma_topiary *free, struct ma_topiary *destroy) { - /* All nodes must see old data as dead prior to replacing that data */ - smp_wmb(); /* Needed for RCU */ + struct maple_enode *old_enode; + + if (mte_is_root(mas->node)) { + old_enode = mas_root_locked(mas); + } else { + unsigned char offset = mte_parent_slot(mas->node); + void __rcu **slots = ma_slots(mte_parent(mas->node), + mas_parent_type(mas, mas->node)); + + old_enode = mas_slot_locked(mas, slots, offset); + } /* Insert the new data in the tree */ - mas_replace(mas, true); + mas_put_in_tree(mas, old_enode); if (!mte_is_leaf(mas->node)) mas_descend_adopt(mas); From 4ffc2ee2cf01f3d03977fbeb1b43da2dc22a95f4 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 4 Aug 2023 12:59:49 -0400 Subject: [PATCH 308/489] maple_tree: introduce mas_tree_parent() definition Add a definition to shorten long code lines and clarify what the code is doing. Use the new definition to get the maple tree parent pointer from the maple state where possible. Link: https://lkml.kernel.org/r/20230804165951.2661157-5-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Paul E. 
McKenney Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index c01b1be1480c7d..cf41e0dbb87b22 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -75,6 +75,7 @@ #define MA_STATE_PREALLOC 4 #define ma_parent_ptr(x) ((struct maple_pnode *)(x)) +#define mas_tree_parent(x) ((unsigned long)(x->tree) | MA_ROOT_PARENT) #define ma_mnode_ptr(x) ((struct maple_node *)(x)) #define ma_enode_ptr(x) ((struct maple_enode *)(x)) static struct kmem_cache *maple_node_cache; @@ -1728,8 +1729,7 @@ static inline void mas_put_in_tree(struct ma_state *mas, void __rcu **slots; if (mte_is_root(mas->node)) { - mas_mn(mas)->parent = ma_parent_ptr( - ((unsigned long)mas->tree | MA_ROOT_PARENT)); + mas_mn(mas)->parent = ma_parent_ptr(mas_tree_parent(mas)); rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); mas_set_height(mas); } else { @@ -2798,8 +2798,7 @@ static inline void mas_wmb_replace(struct ma_state *mas, static inline void mast_new_root(struct maple_subtree_state *mast, struct ma_state *mas) { - mas_mn(mast->l)->parent = - ma_parent_ptr(((unsigned long)mas->tree | MA_ROOT_PARENT)); + mas_mn(mast->l)->parent = ma_parent_ptr(mas_tree_parent(mas)); if (!mte_dead_node(mast->orig_l->node) && !mte_is_root(mast->orig_l->node)) { do { @@ -3661,8 +3660,7 @@ static inline int mas_root_expand(struct ma_state *mas, void *entry) node = mas_pop_node(mas); pivots = ma_pivots(node, type); slots = ma_slots(node, type); - node->parent = ma_parent_ptr( - ((unsigned long)mas->tree | MA_ROOT_PARENT)); + node->parent = ma_parent_ptr(mas_tree_parent(mas)); mas->node = mt_mk_node(node, type); if (mas->index) { @@ -3938,8 +3936,7 @@ static inline int mas_new_root(struct ma_state *mas, void *entry) node = mas_pop_node(mas); pivots = ma_pivots(node, type); slots = ma_slots(node, type); - node->parent = ma_parent_ptr( - ((unsigned long)mas->tree | 
MA_ROOT_PARENT)); + node->parent = ma_parent_ptr(mas_tree_parent(mas)); mas->node = mt_mk_node(node, type); rcu_assign_pointer(slots[0], entry); pivots[0] = mas->last; From 068bafcac0b89ee5b1616793231eb4b3dd41e3f0 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 4 Aug 2023 12:59:50 -0400 Subject: [PATCH 309/489] maple_tree: change mas_adopt_children() parent usage All calls to mas_adopt_children() currently pass the parent as the node in the maple state. Allow for the parent pointer that is passed in to be used instead. Link: https://lkml.kernel.org/r/20230804165951.2661157-6-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Paul E. McKenney Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index cf41e0dbb87b22..8e94f5495a9717 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1702,7 +1702,7 @@ static inline void mas_adopt_children(struct ma_state *mas, struct maple_enode *parent) { enum maple_type type = mte_node_type(parent); - struct maple_node *node = mas_mn(mas); + struct maple_node *node = mte_to_node(parent); void __rcu **slots = ma_slots(node, type); unsigned long *pivots = ma_pivots(node, type); struct maple_enode *child; From 530f745c7620af288b71b3d667cb90f10df3defe Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 4 Aug 2023 12:59:51 -0400 Subject: [PATCH 310/489] maple_tree: replace data before marking dead in split and spanning store Reorder the operations for split and spanning stores so that new data is placed in the tree prior to marking the old data as dead. This will limit re-walks on dead data to just once instead of a retry loop. The order of operations is as follows: Create the new data, put the new data in place, mark the top node of the old data as dead. 
Then repair parent links in the reused nodes through all levels of the tree, following the new nodes downwards. Finally walk the top dead node looking for nodes that are no longer used, or subtrees that should be destroyed (marked dead throughout then freed), follow the partially used nodes downwards to discover other dead nodes and subtrees. Link: https://lkml.kernel.org/r/20230804165951.2661157-7-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Paul E. McKenney Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 493 ++++++++++++++++------------------------------- 1 file changed, 168 insertions(+), 325 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 8e94f5495a9717..ffb9d15bd81527 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -982,27 +982,9 @@ static inline void mat_add(struct ma_topiary *mat, mat->tail = dead_enode; } -static void mte_destroy_walk(struct maple_enode *, struct maple_tree *); -static inline void mas_free(struct ma_state *mas, struct maple_enode *used); - -/* - * mas_mat_free() - Free all nodes in a dead list. - * @mas - the maple state - * @mat - the ma_topiary linked list of dead nodes to free. - * - * Free walk a dead list. - */ -static void mas_mat_free(struct ma_state *mas, struct ma_topiary *mat) -{ - struct maple_enode *next; - - while (mat->head) { - next = mte_to_mat(mat->head)->next; - mas_free(mas, mat->head); - mat->head = next; - } -} - +static void mt_free_walk(struct rcu_head *head); +static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt, + bool free); /* * mas_mat_destroy() - Free all nodes and subtrees in a dead list. 
* @mas - the maple state @@ -1013,10 +995,15 @@ static void mas_mat_free(struct ma_state *mas, struct ma_topiary *mat) static void mas_mat_destroy(struct ma_state *mas, struct ma_topiary *mat) { struct maple_enode *next; + struct maple_node *node; + bool in_rcu = mt_in_rcu(mas->tree); while (mat->head) { next = mte_to_mat(mat->head)->next; - mte_destroy_walk(mat->head, mat->mtree); + node = mte_to_node(mat->head); + mt_destroy_walk(mat->head, mas->tree, !in_rcu); + if (in_rcu) + call_rcu(&node->rcu, mt_free_walk); mat->head = next; } } @@ -1759,11 +1746,11 @@ static inline void mas_replace_node(struct ma_state *mas, } /* - * mas_new_child() - Find the new child of a node. - * @mas: the maple state + * mas_find_child() - Find a child who has the parent @mas->node. + * @mas: the maple state with the parent. * @child: the maple state to store the child. */ -static inline bool mas_new_child(struct ma_state *mas, struct ma_state *child) +static inline bool mas_find_child(struct ma_state *mas, struct ma_state *child) __must_hold(mas->tree->ma_lock) { enum maple_type mt; @@ -2065,56 +2052,6 @@ static inline void mab_mas_cp(struct maple_big_node *b_node, } } -/* - * mas_descend_adopt() - Descend through a sub-tree and adopt children. - * @mas: the maple state with the maple encoded node of the sub-tree. - * - * Descend through a sub-tree and adopt children who do not have the correct - * parents set. Follow the parents which have the correct parents as they are - * the new entries which need to be followed to find other incorrectly set - * parents. - */ -static inline void mas_descend_adopt(struct ma_state *mas) -{ - struct ma_state list[3], next[3]; - int i, n; - - /* - * At each level there may be up to 3 correct parent pointers which indicates - * the new nodes which need to be walked to find any new nodes at a lower level. 
- */ - - for (i = 0; i < 3; i++) { - list[i] = *mas; - list[i].offset = 0; - next[i].offset = 0; - } - next[0] = *mas; - - while (!mte_is_leaf(list[0].node)) { - n = 0; - for (i = 0; i < 3; i++) { - if (mas_is_none(&list[i])) - continue; - - if (i && list[i-1].node == list[i].node) - continue; - - while ((n < 3) && (mas_new_child(&list[i], &next[n]))) - n++; - - mas_adopt_children(&list[i], list[i].node); - } - - while (n < 3) - next[n++].node = MAS_NONE; - - /* descend by setting the list to the children */ - for (i = 0; i < 3; i++) - list[i] = next[i]; - } -} - /* * mas_bulk_rebalance() - Rebalance the end of a tree after a bulk insert. * @mas: The maple state @@ -2304,98 +2241,6 @@ static inline void mas_wr_node_walk(struct ma_wr_state *wr_mas) wr_mas->offset_end = mas->offset = offset; } -/* - * mas_topiary_range() - Add a range of slots to the topiary. - * @mas: The maple state - * @destroy: The topiary to add the slots (usually destroy) - * @start: The starting slot inclusively - * @end: The end slot inclusively - */ -static inline void mas_topiary_range(struct ma_state *mas, - struct ma_topiary *destroy, unsigned char start, unsigned char end) -{ - void __rcu **slots; - unsigned char offset; - - MAS_BUG_ON(mas, mte_is_leaf(mas->node)); - - slots = ma_slots(mas_mn(mas), mte_node_type(mas->node)); - for (offset = start; offset <= end; offset++) { - struct maple_enode *enode = mas_slot_locked(mas, slots, offset); - - if (mte_dead_node(enode)) - continue; - - mat_add(destroy, enode); - } -} - -/* - * mast_topiary() - Add the portions of the tree to the removal list; either to - * be freed or discarded (destroy walk). - * @mast: The maple_subtree_state. 
- */ -static inline void mast_topiary(struct maple_subtree_state *mast) -{ - MA_WR_STATE(wr_mas, mast->orig_l, NULL); - unsigned char r_start, r_end; - unsigned char l_start, l_end; - void __rcu **l_slots, **r_slots; - - wr_mas.type = mte_node_type(mast->orig_l->node); - mast->orig_l->index = mast->orig_l->last; - mas_wr_node_walk(&wr_mas); - l_start = mast->orig_l->offset + 1; - l_end = mas_data_end(mast->orig_l); - r_start = 0; - r_end = mast->orig_r->offset; - - if (r_end) - r_end--; - - l_slots = ma_slots(mas_mn(mast->orig_l), - mte_node_type(mast->orig_l->node)); - - r_slots = ma_slots(mas_mn(mast->orig_r), - mte_node_type(mast->orig_r->node)); - - if ((l_start < l_end) && - mte_dead_node(mas_slot_locked(mast->orig_l, l_slots, l_start))) { - l_start++; - } - - if (mte_dead_node(mas_slot_locked(mast->orig_r, r_slots, r_end))) { - if (r_end) - r_end--; - } - - if ((l_start > r_end) && (mast->orig_l->node == mast->orig_r->node)) - return; - - /* At the node where left and right sides meet, add the parts between */ - if (mast->orig_l->node == mast->orig_r->node) { - return mas_topiary_range(mast->orig_l, mast->destroy, - l_start, r_end); - } - - /* mast->orig_r is different and consumed. */ - if (mte_is_leaf(mast->orig_r->node)) - return; - - if (mte_dead_node(mas_slot_locked(mast->orig_l, l_slots, l_end))) - l_end--; - - - if (l_start <= l_end) - mas_topiary_range(mast->orig_l, mast->destroy, l_start, l_end); - - if (mte_dead_node(mas_slot_locked(mast->orig_r, r_slots, r_start))) - r_start++; - - if (r_start <= r_end) - mas_topiary_range(mast->orig_r, mast->destroy, 0, r_end); -} - /* * mast_rebalance_next() - Rebalance against the next node * @mast: The maple subtree state @@ -2431,7 +2276,7 @@ static inline void mast_rebalance_prev(struct maple_subtree_state *mast) /* * mast_spanning_rebalance() - Rebalance nodes with nearest neighbour favouring * the node to the right. 
Checking the nodes to the right then the left at each - * level upwards until root is reached. Free and destroy as needed. + * level upwards until root is reached. * Data is copied into the @mast->bn. * @mast: The maple_subtree_state. */ @@ -2440,8 +2285,6 @@ bool mast_spanning_rebalance(struct maple_subtree_state *mast) { struct ma_state r_tmp = *mast->orig_r; struct ma_state l_tmp = *mast->orig_l; - struct maple_enode *ancestor = NULL; - unsigned char start, end; unsigned char depth = 0; r_tmp = *mast->orig_r; @@ -2450,87 +2293,25 @@ bool mast_spanning_rebalance(struct maple_subtree_state *mast) mas_ascend(mast->orig_r); mas_ascend(mast->orig_l); depth++; - if (!ancestor && - (mast->orig_r->node == mast->orig_l->node)) { - ancestor = mast->orig_r->node; - end = mast->orig_r->offset - 1; - start = mast->orig_l->offset + 1; - } - if (mast->orig_r->offset < mas_data_end(mast->orig_r)) { - if (!ancestor) { - ancestor = mast->orig_r->node; - start = 0; - } - mast->orig_r->offset++; do { mas_descend(mast->orig_r); mast->orig_r->offset = 0; - depth--; - } while (depth); + } while (--depth); mast_rebalance_next(mast); - do { - unsigned char l_off = 0; - struct maple_enode *child = r_tmp.node; - - mas_ascend(&r_tmp); - if (ancestor == r_tmp.node) - l_off = start; - - if (r_tmp.offset) - r_tmp.offset--; - - if (l_off < r_tmp.offset) - mas_topiary_range(&r_tmp, mast->destroy, - l_off, r_tmp.offset); - - if (l_tmp.node != child) - mat_add(mast->free, child); - - } while (r_tmp.node != ancestor); - *mast->orig_l = l_tmp; return true; - } else if (mast->orig_l->offset != 0) { - if (!ancestor) { - ancestor = mast->orig_l->node; - end = mas_data_end(mast->orig_l); - } - mast->orig_l->offset--; do { mas_descend(mast->orig_l); mast->orig_l->offset = mas_data_end(mast->orig_l); - depth--; - } while (depth); + } while (--depth); mast_rebalance_prev(mast); - do { - unsigned char r_off; - struct maple_enode *child = l_tmp.node; - - mas_ascend(&l_tmp); - if (ancestor == l_tmp.node) - 
r_off = end; - else - r_off = mas_data_end(&l_tmp); - - if (l_tmp.offset < r_off) - l_tmp.offset++; - - if (l_tmp.offset < r_off) - mas_topiary_range(&l_tmp, mast->destroy, - l_tmp.offset, r_off); - - if (r_tmp.node != child) - mat_add(mast->free, child); - - } while (l_tmp.node != ancestor); - *mast->orig_r = r_tmp; return true; } @@ -2542,36 +2323,24 @@ bool mast_spanning_rebalance(struct maple_subtree_state *mast) } /* - * mast_ascend_free() - Add current original maple state nodes to the free list - * and ascend. + * mast_ascend() - Ascend the original left and right maple states. * @mast: the maple subtree state. * - * Ascend the original left and right sides and add the previous nodes to the - * free list. Set the slots to point to the correct location in the new nodes. + * Ascend the original left and right sides. Set the offsets to point to the + * data already in the new tree (@mast->l and @mast->r). */ -static inline void -mast_ascend_free(struct maple_subtree_state *mast) +static inline void mast_ascend(struct maple_subtree_state *mast) { MA_WR_STATE(wr_mas, mast->orig_r, NULL); - struct maple_enode *left = mast->orig_l->node; - struct maple_enode *right = mast->orig_r->node; - mas_ascend(mast->orig_l); mas_ascend(mast->orig_r); - mat_add(mast->free, left); - - if (left != right) - mat_add(mast->free, right); mast->orig_r->offset = 0; mast->orig_r->index = mast->r->max; /* last should be larger than or equal to index */ if (mast->orig_r->last < mast->orig_r->index) mast->orig_r->last = mast->orig_r->index; - /* - * The node may not contain the value so set slot to ensure all - * of the nodes contents are freed or destroyed. 
- */ + wr_mas.type = mte_node_type(mast->orig_r->node); mas_wr_node_walk(&wr_mas); /* Set up the left side of things */ @@ -2750,66 +2519,152 @@ static inline void mast_set_split_parents(struct maple_subtree_state *mast, } /* - * mas_wmb_replace() - Write memory barrier and replace - * @mas: The maple state - * @free: the maple topiary list of nodes to free - * @destroy: The maple topiary list of nodes to destroy (walk and free) + * mas_topiary_node() - Dispose of a single node + * @mas: The maple state for pushing nodes + * @enode: The encoded maple node + * @in_rcu: If the tree is in rcu mode * - * Updates gap as necessary. + * The node will either be RCU freed or pushed back on the maple state. */ -static inline void mas_wmb_replace(struct ma_state *mas, - struct ma_topiary *free, - struct ma_topiary *destroy) +static inline void mas_topiary_node(struct ma_state *mas, + struct maple_enode *enode, bool in_rcu) { - struct maple_enode *old_enode; + struct maple_node *tmp; - if (mte_is_root(mas->node)) { - old_enode = mas_root_locked(mas); - } else { - unsigned char offset = mte_parent_slot(mas->node); - void __rcu **slots = ma_slots(mte_parent(mas->node), - mas_parent_type(mas, mas->node)); + if (enode == MAS_NONE) + return; - old_enode = mas_slot_locked(mas, slots, offset); - } + tmp = mte_to_node(enode); + mte_set_node_dead(enode); + if (in_rcu) + ma_free_rcu(tmp); + else + mas_push_node(mas, tmp); +} - /* Insert the new data in the tree */ +/* + * mas_topiary_replace() - Replace the data with new data, then repair the + * parent links within the new tree. Iterate over the dead sub-tree and collect + * the dead subtrees and topiary the nodes that are no longer of use. + * + * The new tree will have up to three children with the correct parent. Keep + * track of the new entries as they need to be followed to find the next level + * of new entries. + * + * The old tree will have up to three children with the old parent.
Keep track + * of the old entries as they may have more nodes below replaced. Nodes within + * [index, last] are dead subtrees, others need to be freed and followed. + * + * @mas: The maple state pointing at the new data + * @old_enode: The maple encoded node being replaced + * + */ +static inline void mas_topiary_replace(struct ma_state *mas, + struct maple_enode *old_enode) +{ + struct ma_state tmp[3], tmp_next[3]; + MA_TOPIARY(subtrees, mas->tree); + bool in_rcu; + int i, n; + + /* Place data in tree & then mark node as old */ mas_put_in_tree(mas, old_enode); - if (!mte_is_leaf(mas->node)) - mas_descend_adopt(mas); + /* Update the parent pointers in the tree */ + tmp[0] = *mas; + tmp[0].offset = 0; + tmp[1].node = MAS_NONE; + tmp[2].node = MAS_NONE; + while (!mte_is_leaf(tmp[0].node)) { + n = 0; + for (i = 0; i < 3; i++) { + if (mas_is_none(&tmp[i])) + continue; + + while (n < 3) { + if (!mas_find_child(&tmp[i], &tmp_next[n])) + break; + n++; + } + + mas_adopt_children(&tmp[i], tmp[i].node); + } - mas_mat_free(mas, free); + if (MAS_WARN_ON(mas, n == 0)) + break; - if (destroy) - mas_mat_destroy(mas, destroy); + while (n < 3) + tmp_next[n++].node = MAS_NONE; - if (mte_is_leaf(mas->node)) - return; + for (i = 0; i < 3; i++) + tmp[i] = tmp_next[i]; + } - mas_update_gap(mas); + /* Collect the old nodes that need to be discarded */ + if (mte_is_leaf(old_enode)) + return mas_free(mas, old_enode); + + tmp[0] = *mas; + tmp[0].offset = 0; + tmp[0].node = old_enode; + tmp[1].node = MAS_NONE; + tmp[2].node = MAS_NONE; + in_rcu = mt_in_rcu(mas->tree); + do { + n = 0; + for (i = 0; i < 3; i++) { + if (mas_is_none(&tmp[i])) + continue; + + while (n < 3) { + if (!mas_find_child(&tmp[i], &tmp_next[n])) + break; + + if ((tmp_next[n].min >= tmp_next->index) && + (tmp_next[n].max <= tmp_next->last)) { + mat_add(&subtrees, tmp_next[n].node); + tmp_next[n].node = MAS_NONE; + } else { + n++; + } + } + } + + if (MAS_WARN_ON(mas, n == 0)) + break; + + while (n < 3) + tmp_next[n++].node 
= MAS_NONE; + + for (i = 0; i < 3; i++) { + mas_topiary_node(mas, tmp[i].node, in_rcu); + tmp[i] = tmp_next[i]; + } + } while (!mte_is_leaf(tmp[0].node)); + + for (i = 0; i < 3; i++) + mas_topiary_node(mas, tmp[i].node, in_rcu); + + mas_mat_destroy(mas, &subtrees); } /* - * mast_new_root() - Set a new tree root during subtree creation - * @mast: The maple subtree state + * mas_wmb_replace() - Write memory barrier and replace * @mas: The maple state + * @old_enode: The old maple encoded node that is being replaced. + * + * Updates gap as necessary. */ -static inline void mast_new_root(struct maple_subtree_state *mast, - struct ma_state *mas) +static inline void mas_wmb_replace(struct ma_state *mas, + struct maple_enode *old_enode) { - mas_mn(mast->l)->parent = ma_parent_ptr(mas_tree_parent(mas)); - if (!mte_dead_node(mast->orig_l->node) && - !mte_is_root(mast->orig_l->node)) { - do { - mast_ascend_free(mast); - mast_topiary(mast); - } while (!mte_is_root(mast->orig_l->node)); - } - if ((mast->orig_l->node != mas->node) && - (mast->l->depth > mas_mt_height(mas))) { - mat_add(mast->free, mas->node); - } + /* Insert the new data in the tree */ + mas_topiary_replace(mas, old_enode); + + if (mte_is_leaf(mas->node)) + return; + + mas_update_gap(mas); } /* @@ -2995,12 +2850,11 @@ static int mas_spanning_rebalance(struct ma_state *mas, unsigned char split, mid_split; unsigned char slot = 0; struct maple_enode *left = NULL, *middle = NULL, *right = NULL; + struct maple_enode *old_enode; MA_STATE(l_mas, mas->tree, mas->index, mas->index); MA_STATE(r_mas, mas->tree, mas->index, mas->last); MA_STATE(m_mas, mas->tree, mas->index, mas->index); - MA_TOPIARY(free, mas->tree); - MA_TOPIARY(destroy, mas->tree); /* * The tree needs to be rebalanced and leaves need to be kept at the same level.
@@ -3009,8 +2863,6 @@ static int mas_spanning_rebalance(struct ma_state *mas, mast->l = &l_mas; mast->m = &m_mas; mast->r = &r_mas; - mast->free = &free; - mast->destroy = &destroy; l_mas.node = r_mas.node = m_mas.node = MAS_NONE; /* Check if this is not root and has sufficient data. */ @@ -3018,7 +2870,7 @@ static int mas_spanning_rebalance(struct ma_state *mas, unlikely(mast->bn->b_end <= mt_min_slots[mast->bn->type])) mast_spanning_rebalance(mast); - mast->orig_l->depth = 0; + l_mas.depth = 0; /* * Each level of the tree is examined and balanced, pushing data to the left or @@ -3029,7 +2881,7 @@ static int mas_spanning_rebalance(struct ma_state *mas, * original tree and the partially new tree. To remedy the parent pointers in * the old tree, the new data is swapped into the active tree and a walk down * the tree is performed and the parent pointers are updated. - * See mas_descend_adopt() for more information.. + * See mas_topiary_replace() for more information. */ while (count--) { mast->bn->b_end--; @@ -3046,13 +2898,13 @@ static int mas_spanning_rebalance(struct ma_state *mas, */ memset(mast->bn, 0, sizeof(struct maple_big_node)); mast->bn->type = mte_node_type(left); - mast->orig_l->depth++; + l_mas.depth++; /* Root already stored in l->node. */ if (mas_is_root_limits(mast->l)) goto new_root; - mast_ascend_free(mast); + mast_ascend(mast); mast_combine_cp_left(mast); l_mas.offset = mast->bn->b_end; mab_set_b_end(mast->bn, &l_mas, left); @@ -3061,7 +2913,6 @@ static int mas_spanning_rebalance(struct ma_state *mas, /* Copy anything necessary out of the right node. 
*/ mast_combine_cp_right(mast); - mast_topiary(mast); mast->orig_l->last = mast->orig_l->max; if (mast_sufficient(mast)) @@ -3083,7 +2934,7 @@ static int mas_spanning_rebalance(struct ma_state *mas, l_mas.node = mt_mk_node(ma_mnode_ptr(mas_pop_node(mas)), mte_node_type(mast->orig_l->node)); - mast->orig_l->depth++; + l_mas.depth++; mab_mas_cp(mast->bn, 0, mt_slots[mast->bn->type] - 1, &l_mas, true); mas_set_parent(mas, left, l_mas.node, slot); if (middle) @@ -3094,23 +2945,20 @@ static int mas_spanning_rebalance(struct ma_state *mas, if (mas_is_root_limits(mast->l)) { new_root: - mast_new_root(mast, mas); + mas_mn(mast->l)->parent = ma_parent_ptr(mas_tree_parent(mas)); + while (!mte_is_root(mast->orig_l->node)) + mast_ascend(mast); } else { mas_mn(&l_mas)->parent = mas_mn(mast->orig_l)->parent; } - if (!mte_dead_node(mast->orig_l->node)) - mat_add(&free, mast->orig_l->node); - - mas->depth = mast->orig_l->depth; - *mast->orig_l = l_mas; - mte_set_node_dead(mas->node); - - /* Set up mas for insertion. 
*/ - mast->orig_l->depth = mas->depth; - mast->orig_l->alloc = mas->alloc; - *mas = *mast->orig_l; - mas_wmb_replace(mas, &free, &destroy); + old_enode = mast->orig_l->node; + mas->depth = l_mas.depth; + mas->node = l_mas.node; + mas->min = l_mas.min; + mas->max = l_mas.max; + mas->offset = l_mas.offset; + mas_wmb_replace(mas, old_enode); mtree_range_walk(mas); return mast->bn->b_end; } @@ -3341,7 +3189,6 @@ static inline void mast_fill_bnode(struct maple_subtree_state *mast, unsigned char skip) { bool cp = true; - struct maple_enode *old = mas->node; unsigned char split; memset(mast->bn->gap, 0, sizeof(unsigned long) * ARRAY_SIZE(mast->bn->gap)); @@ -3353,7 +3200,6 @@ static inline void mast_fill_bnode(struct maple_subtree_state *mast, cp = false; } else { mas_ascend(mas); - mat_add(mast->free, old); mas->offset = mte_parent_slot(mas->node); } @@ -3457,13 +3303,11 @@ static inline bool mas_push_data(struct ma_state *mas, int height, split = mt_slots[mast->bn->type] - 2; if (left) { /* Switch mas to prev node */ - mat_add(mast->free, mas->node); *mas = tmp_mas; /* Start using mast->l for the left side. 
*/ tmp_mas.node = mast->l->node; *mast->l = tmp_mas; } else { - mat_add(mast->free, tmp_mas.node); tmp_mas.node = mast->r->node; *mast->r = tmp_mas; split = slot_total - split; @@ -3490,6 +3334,7 @@ static int mas_split(struct ma_state *mas, struct maple_big_node *b_node) struct maple_subtree_state mast; int height = 0; unsigned char mid_split, split = 0; + struct maple_enode *old; /* * Splitting is handled differently from any other B-tree; the Maple @@ -3512,7 +3357,6 @@ static int mas_split(struct ma_state *mas, struct maple_big_node *b_node) MA_STATE(r_mas, mas->tree, mas->index, mas->last); MA_STATE(prev_l_mas, mas->tree, mas->index, mas->last); MA_STATE(prev_r_mas, mas->tree, mas->index, mas->last); - MA_TOPIARY(mat, mas->tree); trace_ma_op(__func__, mas); mas->depth = mas_mt_height(mas); @@ -3525,7 +3369,6 @@ static int mas_split(struct ma_state *mas, struct maple_big_node *b_node) mast.r = &r_mas; mast.orig_l = &prev_l_mas; mast.orig_r = &prev_r_mas; - mast.free = &mat; mast.bn = b_node; while (height++ <= mas->depth) { @@ -3565,9 +3408,9 @@ static int mas_split(struct ma_state *mas, struct maple_big_node *b_node) } /* Set the original node as dead */ - mat_add(mast.free, mas->node); + old = mas->node; mas->node = l_mas.node; - mas_wmb_replace(mas, mast.free, NULL); + mas_wmb_replace(mas, old); mtree_range_walk(mas); return 1; } @@ -3903,6 +3746,7 @@ static inline void *mtree_lookup_walk(struct ma_state *mas) return NULL; } +static void mte_destroy_walk(struct maple_enode *, struct maple_tree *); /* * mas_new_root() - Create a new root node that only contains the entry passed * in. 
@@ -3969,7 +3813,6 @@ static inline int mas_wr_spanning_store(struct ma_wr_state *wr_mas) /* Left and Right side of spanning store */ MA_STATE(l_mas, NULL, 0, 0); MA_STATE(r_mas, NULL, 0, 0); - MA_WR_STATE(r_wr_mas, &r_mas, wr_mas->entry); MA_WR_STATE(l_wr_mas, &l_mas, wr_mas->entry); From 7c0a84bd0dc214a710305fbc0f407b8e7c410762 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 4 Aug 2023 19:04:48 +0800 Subject: [PATCH 311/489] mm/compaction: correct last_migrated_pfn update in compact_zone We record the start pfn of the last isolated page block in last_migrated_pfn. And then: 1. We check whether we marked the page block skip for exclusive access in isolate_migratepages_block by testing whether the next migrate pfn is still in the last isolated page block. If so, we will set finish_pageblock to do the rescan. 2. We check whether a full cc->order block has been scanned by testing whether the last scan range passes the cc->order block boundary. If so, we flush the pages that were freed. We treat cc->migrate_pfn before isolate_migratepages as the start pfn of the last isolated page range. However, we always align migrate_pfn to a page block, or move to another page block in fast_find_migrateblock or in the linear forward scan in isolate_migratepages, before doing page isolation in isolate_migratepages_block. Update last_migrated_pfn with pageblock_start_pfn(cc->migrate_pfn - 1) after the scan to correctly set the start pfn of the last isolated page range. This avoids: 1. Missing a rescan with finish_pageblock set, because last_migrated_pfn does not point to the right pageblock and the migrate pfn will not be in the pageblock of last_migrated_pfn as it should be. 2. Wrongly issuing a flush by testing the cc->order block boundary against a wrong last_migrated_pfn.
Link: https://lkml.kernel.org/r/20230804110454.2935878-3-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Baolin Wang Cc: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/compaction.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index dc16efd5fac56a..bc2eda71179ae4 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2507,7 +2507,8 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) goto check_drain; case ISOLATE_SUCCESS: update_cached = false; - last_migrated_pfn = iteration_start_pfn; + last_migrated_pfn = max(cc->zone->zone_start_pfn, + pageblock_start_pfn(cc->migrate_pfn - 1)); } err = migrate_pages(&cc->migratepages, compaction_alloc, From 7545e2f20aebf4da413be00384c4245eda5beb4d Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 4 Aug 2023 19:04:49 +0800 Subject: [PATCH 312/489] mm/compaction: skip page block marked skip in isolate_migratepages_block Move migrate_pfn to the page block end when the block is marked skip, to avoid an unnecessary scan retry of that block from the upper caller. For example, compact_zone may wrongly rescan a skipped page block with finish_pageblock set as follows: 1. cc->migrate_pfn points to the start of the page block 2. compact_zone records cc->migrate_pfn in last_migrated_pfn 3. compact_zone->isolate_migratepages->isolate_migratepages_block tries to scan the block. The low_pfn may be moved forward to the middle of the block because of free pages at the beginning of the block. 4. we find the first lru page that could be isolated, but the block was exclusively marked skip. 5. abort isolate_migratepages_block and make cc->migrate_pfn point to the found lru page in the middle of the block. 6. compact_zone finds that cc->migrate_pfn and last_migrated_pfn are in the same block and wrongly rescans the block with finish_pageblock set.
Link: https://lkml.kernel.org/r/20230804110454.2935878-4-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Baolin Wang Cc: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/compaction.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/compaction.c b/mm/compaction.c index bc2eda71179ae4..78826c433ef693 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1140,6 +1140,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, skip_updated = true; if (test_and_set_skip(cc, valid_page) && !cc->finish_pageblock) { + low_pfn = end_pfn; goto isolate_abort; } } From 0aa8ea3c5d353d5f0aa1e607f8dc5f43bf6cdf05 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 4 Aug 2023 19:04:50 +0800 Subject: [PATCH 313/489] mm/compaction: correct comment of fast_find_migrateblock in isolate_migratepages After 90ed667c03fe5 ("Revert "Revert "mm/compaction: fix set skip in fast_find_migrateblock"""), we remove skip set in fast_find_migrateblock. Correct comment that fast_find_block is used to avoid isolation_suitable check for pageblock returned from fast_find_migrateblock because fast_find_migrateblock will mark found pageblock skipped. Instead, comment that fast_find_block is used to avoid a redundant check of fast found pageblock which is already checked skip flag inside fast_find_migrateblock. 
Link: https://lkml.kernel.org/r/20230804110454.2935878-5-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: Baolin Wang Cc: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/compaction.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 78826c433ef693..3b204fbaa47020 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1987,9 +1987,9 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc) block_start_pfn = cc->zone->zone_start_pfn; /* - * fast_find_migrateblock marks a pageblock skipped so to avoid - * the isolation_suitable check below, check whether the fast - * search was successful. + * fast_find_migrateblock() has already ensured the pageblock is not + * set with a skipped flag, so to avoid the isolation_suitable check + * below again, check whether the fast search was successful. */ fast_find_block = low_pfn != cc->migrate_pfn && !cc->fast_search_fail; From cf043a007e00ae7fe5a4aa5447068fcd13ce031b Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 4 Aug 2023 19:04:51 +0800 Subject: [PATCH 314/489] mm/compaction: correct comment of cached migrate pfn update Commit e380bebe47715 ("mm, compaction: keep migration source private to a single compaction instance") moved update of async and sync compact_cached_migrate_pfn from update_pageblock_skip to update_cached_migrate but left the comment behind. Move the relevant comment to correct this. 
Link: https://lkml.kernel.org/r/20230804110454.2935878-6-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: David Hildenbrand Reviewed-by: Baolin Wang Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/compaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index 3b204fbaa47020..db44319dc716c4 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -469,6 +469,7 @@ static void update_cached_migrate(struct compact_control *cc, unsigned long pfn) pfn = pageblock_end_pfn(pfn); + /* Update where async and sync compaction should restart */ if (pfn > zone->compact_cached_migrate_pfn[0]) zone->compact_cached_migrate_pfn[0] = pfn; if (cc->mode != MIGRATE_ASYNC && @@ -490,7 +491,6 @@ static void update_pageblock_skip(struct compact_control *cc, set_pageblock_skip(page); - /* Update where async and sync compaction should restart */ if (pfn < zone->compact_cached_free_pfn) zone->compact_cached_free_pfn = pfn; } From c3750cc7725af8da06f2f36ddce7adc52a3a51d6 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 4 Aug 2023 19:04:52 +0800 Subject: [PATCH 315/489] mm/compaction: correct comment to complete migration failure Commit cfccd2e63e7e0 ("mm, compaction: finish pageblocks on complete migration failure") convert cc->order aligned check to page block order aligned check. Correct comment relevant with it. 
Link: https://lkml.kernel.org/r/20230804110454.2935878-7-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Baolin Wang Cc: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/compaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index db44319dc716c4..d0d3fea64b4089 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2532,7 +2532,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) } /* * If an ASYNC or SYNC_LIGHT fails to migrate a page - * within the current order-aligned block and + * within the pageblock_order-aligned block and * fast_find_migrateblock may be used then scan the * remainder of the pageblock. This will mark the * pageblock "skip" to avoid rescanning in the near From f82024cbfa3a410d947b588658949a8a391da8a7 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 4 Aug 2023 19:04:53 +0800 Subject: [PATCH 316/489] mm/compaction: remove unnecessary return for void function Remove unnecessary return for void function Link: https://lkml.kernel.org/r/20230804110454.2935878-8-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: David Hildenbrand Reviewed-by: Baolin Wang Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/compaction.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index d0d3fea64b4089..91a9dfa41ef473 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1444,8 +1444,6 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn) /* Skip this pageblock in the future as it's full or nearly full */ if (start_pfn == end_pfn) set_pageblock_skip(page); - - return; } /* Search orders in round-robin fashion */ @@ -2898,7 +2896,7 @@ int compaction_register_node(struct node *node) void compaction_unregister_node(struct node *node) { - return device_remove_file(&node->dev, &dev_attr_compact); + device_remove_file(&node->dev, &dev_attr_compact); } #endif /* CONFIG_SYSFS && 
CONFIG_NUMA */ From 18c59d58baa60a8bfaec58d29b6b94877664eed8 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 4 Aug 2023 19:04:54 +0800 Subject: [PATCH 317/489] mm/compaction: only set skip flag if cc->no_set_skip_hint is false Keep the same logic as update_pageblock_skip, only set skip if no_set_skip_hint is false which is more reasonable. Link: https://lkml.kernel.org/r/20230804110454.2935878-9-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: David Hildenbrand Cc: Baolin Wang Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/compaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index 91a9dfa41ef473..fe7b4e7c5d2401 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1442,7 +1442,7 @@ fast_isolate_around(struct compact_control *cc, unsigned long pfn) isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false); /* Skip this pageblock in the future as it's full or nearly full */ - if (start_pfn == end_pfn) + if (start_pfn == end_pfn && !cc->no_set_skip_hint) set_pageblock_skip(page); } From 0db31d63f27e5b8ca84b9fd5a3cff5b12ac88abf Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Wed, 2 Aug 2023 15:23:28 +0800 Subject: [PATCH 318/489] mm: disable kernelcore=mirror when no mirror memory On a system with kernelcore=mirror enabled but with no mirrored memory reported by EFI, the kernel can OOM during startup, since all memory besides zone DMA is in the movable zone and this prevents the kernel from using it. Zone DMA/DMA32 initialization is independent of mirrored memory and their max pfn is set in zone_sizes_init(). Since the kernel can fall back to zone DMA/DMA32 if there is no memory in zone Normal, these zones are seen as mirrored memory no matter what their memory attributes are. To solve this problem, disable kernelcore=mirror when no real mirrored memory exists.
Link: https://lkml.kernel.org/r/20230802072328.2107981-1-mawupeng1@huawei.com Signed-off-by: Ma Wupeng Suggested-by: Kefeng Wang Suggested-by: Mike Rapoport Reviewed-by: Mike Rapoport (IBM) Reviewed-by: Kefeng Wang Cc: Levi Yun Signed-off-by: Andrew Morton --- mm/internal.h | 1 + mm/memblock.c | 5 +++++ mm/mm_init.c | 5 +++++ 3 files changed, 11 insertions(+) diff --git a/mm/internal.h b/mm/internal.h index 5a03bc4782a28b..a037b1b37f6d78 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1022,6 +1022,7 @@ static inline bool gup_must_unshare(struct vm_area_struct *vma, } extern bool mirrored_kernelcore; +extern bool memblock_has_mirror(void); static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) { diff --git a/mm/memblock.c b/mm/memblock.c index f9e61e565a5343..913b2520a9a002 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -161,6 +161,11 @@ static int memblock_can_resize __initdata_memblock; static int memblock_memory_in_slab __initdata_memblock; static int memblock_reserved_in_slab __initdata_memblock; +bool __init_memblock memblock_has_mirror(void) +{ + return system_has_some_mirror; +} + static enum memblock_flags __init_memblock choose_memblock_flags(void) { return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE; diff --git a/mm/mm_init.c b/mm/mm_init.c index a2fbaa8d917fc9..2a19f3151661a0 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -376,6 +376,11 @@ static void __init find_zone_movable_pfns_for_nodes(void) if (mirrored_kernelcore) { bool mem_below_4gb_not_mirrored = false; + if (!memblock_has_mirror()) { + pr_warn("The system has no mirror memory, ignore kernelcore=mirror.\n"); + goto out; + } + for_each_mem_region(r) { if (memblock_is_mirror(r)) continue; From 61f297380118060a70888e0c1f5c534b74ab78fe Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 4 Aug 2023 09:25:53 +0800 Subject: [PATCH 319/489] mm: remove redundant K() macro definition Patch series "cleanup with helper macro K()". 
Use helper macro K() to improve code readability. No functional modification involved. Remove redundant K() macro definition. This patch (of 7): Since commit eb8589b4f8c1 ("mm: move mem_init_print_info() to mm_init.c"), the K() macro definition has been moved to mm/internal.h. Therefore, the definitions in mm/memcontrol.c, mm/backing-dev.c and mm/oom_kill.c are redundant. Drop redundant definitions. [akpm@linux-foundation.org: oom_kill.c: remove "#undef K", per Kefeng] Link: https://lkml.kernel.org/r/20230804012559.2617515-1-zhangpeng362@huawei.com Link: https://lkml.kernel.org/r/20230804012559.2617515-2-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Cc: Kefeng Wang Cc: Nanyong Sun Signed-off-by: Andrew Morton --- mm/backing-dev.c | 3 +-- mm/memcontrol.c | 1 - mm/oom_kill.c | 3 --- 3 files changed, 1 insertion(+), 6 deletions(-) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 3ffc3cfa7a1479..fc44bfbf785ed2 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -16,6 +16,7 @@ #include #include #include +#include "internal.h" struct backing_dev_info noop_backing_dev_info; EXPORT_SYMBOL_GPL(noop_backing_dev_info); @@ -34,8 +35,6 @@ LIST_HEAD(bdi_list); /* bdi_wq serves all asynchronous writeback tasks */ struct workqueue_struct *bdi_wq; -#define K(x) ((x) << (PAGE_SHIFT - 10)) - #ifdef CONFIG_DEBUG_FS #include #include diff --git a/mm/memcontrol.c b/mm/memcontrol.c index da9f983a090e1d..d83f995809009d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1629,7 +1629,6 @@ static void memory_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) WARN_ON_ONCE(seq_buf_has_overflowed(s)); } -#define K(x) ((x) << (PAGE_SHIFT-10)) /** * mem_cgroup_print_oom_context: Print OOM information relevant to * memory controller. 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 612b5597d3af44..44bde56ecd025a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -479,8 +479,6 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait); static bool oom_killer_disabled __read_mostly; -#define K(x) ((x) << (PAGE_SHIFT-10)) - /* * task->mm can be NULL if the task is the exited group leader. So to * determine whether the task is using a particular mm, we examine all the @@ -994,7 +992,6 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) mmdrop(mm); put_task_struct(victim); } -#undef K /* * Kill provided task unless it's secured by setting From 00cde0429bc50792ed8786e909e70141acf3741a Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 4 Aug 2023 09:25:54 +0800 Subject: [PATCH 320/489] mm/swapfile.c: use helper macro K() Use helper macro K() to improve code readability. No functional modification involved. Link: https://lkml.kernel.org/r/20230804012559.2617515-3-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Cc: Kefeng Wang Cc: Nanyong Sun Signed-off-by: Andrew Morton --- mm/swapfile.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index e04eb9c0482db2..b52145c6bac220 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -46,6 +46,7 @@ #include #include #include +#include "internal.h" #include "swap.h" static bool swap_count_continued(struct swap_info_struct *, pgoff_t, @@ -2635,8 +2636,8 @@ static int swap_show(struct seq_file *swap, void *v) return 0; } - bytes = si->pages << (PAGE_SHIFT - 10); - inuse = READ_ONCE(si->inuse_pages) << (PAGE_SHIFT - 10); + bytes = K(si->pages); + inuse = K(READ_ONCE(si->inuse_pages)); file = si->swap_file; len = seq_file_path(swap, file, " \t\n\\"); @@ -2861,8 +2862,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p, } if (last_page > maxpages) { pr_warn("Truncating oversized swap area, only using 
%luk out of %luk\n", - maxpages << (PAGE_SHIFT - 10), - last_page << (PAGE_SHIFT - 10)); + K(maxpages), K(last_page)); } if (maxpages > last_page) { maxpages = last_page + 1; @@ -3184,8 +3184,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) enable_swap_info(p, prio, swap_map, cluster_info); pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s\n", - p->pages<<(PAGE_SHIFT-10), name->name, p->prio, - nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), + K(p->pages), name->name, p->prio, nr_extents, + K((unsigned long long)span), (p->flags & SWP_SOLIDSTATE) ? "SS" : "", (p->flags & SWP_DISCARDABLE) ? "D" : "", (p->flags & SWP_AREA_DISCARD) ? "s" : "", From 3cb8eaa4558e4d1cde9641e1cb0dcbbd74ae5723 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 4 Aug 2023 09:25:55 +0800 Subject: [PATCH 321/489] mm/swap_state.c: use helper macro K() Use helper macro K() to improve code readability. No functional modification involved. Link: https://lkml.kernel.org/r/20230804012559.2617515-4-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Cc: Kefeng Wang Cc: Nanyong Sun Signed-off-by: Andrew Morton --- mm/swap_state.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index f8ea7015bad487..d157862ba0a698 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -63,9 +63,8 @@ static atomic_t swapin_readahead_hits = ATOMIC_INIT(4); void show_swap_cache_info(void) { printk("%lu pages in swap cache\n", total_swapcache_pages()); - printk("Free swap = %ldkB\n", - get_nr_swap_pages() << (PAGE_SHIFT - 10)); - printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); + printk("Free swap = %ldkB\n", K(get_nr_swap_pages())); + printk("Total swap = %lukB\n", K(total_swap_pages)); } void *get_shadow_from_swap_cache(swp_entry_t entry) From b91742d84d29c39b643992b95560cfb7337eab18 Mon Sep 17 00:00:00 2001 From: 
ZhangPeng Date: Fri, 4 Aug 2023 09:25:56 +0800 Subject: [PATCH 322/489] mm/shmem.c: use helper macro K() Use helper macro K() to improve code readability. No functional modification involved. Link: https://lkml.kernel.org/r/20230804012559.2617515-5-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Cc: Kefeng Wang Cc: Nanyong Sun Signed-off-by: Andrew Morton --- mm/shmem.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 235f2b2fd20251..20daa207d8bf30 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3864,8 +3864,7 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) struct mempolicy *mpol; if (sbinfo->max_blocks != shmem_default_max_blocks()) - seq_printf(seq, ",size=%luk", - sbinfo->max_blocks << (PAGE_SHIFT - 10)); + seq_printf(seq, ",size=%luk", K(sbinfo->max_blocks)); if (sbinfo->max_inodes != shmem_default_max_inodes()) seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes); if (sbinfo->mode != (0777 | S_ISVTX)) From d5a6474d3d36623b368c19dd3e9f5a09c6013120 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 4 Aug 2023 09:25:57 +0800 Subject: [PATCH 323/489] mm/nommu.c: use helper macro K() Use helper macro K() to improve code readability. No functional modification involved. 
Link: https://lkml.kernel.org/r/20230804012559.2617515-6-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Cc: Kefeng Wang Cc: Nanyong Sun Signed-off-by: Andrew Morton --- mm/nommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/nommu.c b/mm/nommu.c index 1fe0ee2398600c..8dba41cfc44d12 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1800,7 +1800,7 @@ static int __meminit init_user_reserve(void) { unsigned long free_kbytes; - free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + free_kbytes = K(global_zone_page_state(NR_FREE_PAGES)); sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); return 0; @@ -1821,7 +1821,7 @@ static int __meminit init_admin_reserve(void) { unsigned long free_kbytes; - free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + free_kbytes = K(global_zone_page_state(NR_FREE_PAGES)); sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); return 0; From b1773e0ea30a46cf23238ff38d5ca4756c41ad2e Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 4 Aug 2023 09:25:58 +0800 Subject: [PATCH 324/489] mm/mmap.c: use helper macro K() Use helper macro K() to improve code readability. No functional modification involved. 
Link: https://lkml.kernel.org/r/20230804012559.2617515-7-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Cc: Kefeng Wang Cc: Nanyong Sun Signed-off-by: Andrew Morton --- mm/mmap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index bc91d91261ab79..35b6bc9c7c9510 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3785,7 +3785,7 @@ static int init_user_reserve(void) { unsigned long free_kbytes; - free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + free_kbytes = K(global_zone_page_state(NR_FREE_PAGES)); sysctl_user_reserve_kbytes = min(free_kbytes / 32, 1UL << 17); return 0; @@ -3806,7 +3806,7 @@ static int init_admin_reserve(void) { unsigned long free_kbytes; - free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + free_kbytes = K(global_zone_page_state(NR_FREE_PAGES)); sysctl_admin_reserve_kbytes = min(free_kbytes / 32, 1UL << 13); return 0; @@ -3850,7 +3850,7 @@ static int reserve_mem_notifier(struct notifier_block *nb, break; case MEM_OFFLINE: - free_kbytes = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - 10); + free_kbytes = K(global_zone_page_state(NR_FREE_PAGES)); if (sysctl_user_reserve_kbytes > free_kbytes) { init_user_reserve(); From 6c1aa2d37f7677609c74a4ff120f99a07b90ba08 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Fri, 4 Aug 2023 09:25:59 +0800 Subject: [PATCH 325/489] mm/hugetlb.c: use helper macro K() Use helper macro K() to improve code readability. No functional modification involved. 
Link: https://lkml.kernel.org/r/20230804012559.2617515-8-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Cc: Kefeng Wang Cc: Nanyong Sun Signed-off-by: Andrew Morton --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 102f83bd3a9f4a..851457af0869cf 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4748,7 +4748,7 @@ void hugetlb_show_meminfo_node(int nid) void hugetlb_report_usage(struct seq_file *m, struct mm_struct *mm) { seq_printf(m, "HugetlbPages:\t%8lu kB\n", - atomic_long_read(&mm->hugetlb_usage) << (PAGE_SHIFT - 10)); + K(atomic_long_read(&mm->hugetlb_usage))); } /* Return the number pages of memory we physically have, in PAGE_SIZE units. */ From ce2fc5fffdfa9fc1412aff108afa102ddf82fd2b Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 4 Aug 2023 08:27:20 -0700 Subject: [PATCH 326/489] mm: for !CONFIG_PER_VMA_LOCK equate write lock assertion for vma and mmap When CONFIG_PER_VMA_LOCK=n, vma_assert_write_locked() should be equivalent to mmap_assert_write_locked(). Link: https://lkml.kernel.org/r/20230804152724.3090321-3-surenb@google.com Suggested-by: Jann Horn Signed-off-by: Suren Baghdasaryan Reviewed-by: Liam R. 
Howlett Cc: Linus Torvalds Signed-off-by: Andrew Morton --- include/linux/mm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index f64d1de3af09d7..49eafc62b4e65f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -738,7 +738,8 @@ static inline bool vma_start_read(struct vm_area_struct *vma) { return false; } static inline void vma_end_read(struct vm_area_struct *vma) {} static inline void vma_start_write(struct vm_area_struct *vma) {} -static inline void vma_assert_write_locked(struct vm_area_struct *vma) {} +static inline void vma_assert_write_locked(struct vm_area_struct *vma) + { mmap_assert_write_locked(vma->vm_mm); } static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) {} From e727bfd5e73a35ecbc4a01a15c659b9fafaa97c0 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 4 Aug 2023 08:27:21 -0700 Subject: [PATCH 327/489] mm: replace mmap with vma write lock assertions when operating on a vma Vma write lock assertion always includes mmap write lock assertion and additional vma lock checks when per-VMA locks are enabled. Replace weaker mmap_assert_write_locked() assertions with stronger vma_assert_write_locked() ones when we are operating on a vma which is expected to be locked. Link: https://lkml.kernel.org/r/20230804152724.3090321-4-surenb@google.com Suggested-by: Jann Horn Signed-off-by: Suren Baghdasaryan Reviewed-by: Liam R. 
Howlett Cc: Linus Torvalds Signed-off-by: Andrew Morton --- mm/hugetlb.c | 2 +- mm/memory.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 851457af0869cf..abfdcaf114f138 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5029,7 +5029,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, src_vma->vm_start, src_vma->vm_end); mmu_notifier_invalidate_range_start(&range); - mmap_assert_write_locked(src); + vma_assert_write_locked(src_vma); raw_write_seqcount_begin(&src->write_protect_seq); } else { /* diff --git a/mm/memory.c b/mm/memory.c index 1113ee625a94f5..039dcbbcc7d2ea 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1312,7 +1312,7 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) * Use the raw variant of the seqcount_t write API to avoid * lockdep complaining about preemptibility. */ - mmap_assert_write_locked(src_mm); + vma_assert_write_locked(src_vma); raw_write_seqcount_begin(&src_mm->write_protect_seq); } From 60081bf19b0ec8fa40c589bd361fa2bc763f1050 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 4 Aug 2023 08:27:22 -0700 Subject: [PATCH 328/489] mm: lock vma explicitly before doing vm_flags_reset and vm_flags_reset_once Implicit vma locking inside vm_flags_reset() and vm_flags_reset_once() is not obvious and makes it hard to understand where vma locking is happening. Also in some cases (like in dup_userfaultfd()) vma should be locked earlier than vma_flags modification. To make locking more visible, change these functions to assert that the vma write lock is taken and explicitly lock the vma beforehand. Fix userfaultfd functions which should lock the vma earlier. Link: https://lkml.kernel.org/r/20230804152724.3090321-5-surenb@google.com Suggested-by: Linus Torvalds Signed-off-by: Suren Baghdasaryan Cc: Jann Horn Cc: Liam R. 
Howlett Signed-off-by: Andrew Morton --- arch/powerpc/kvm/book3s_hv_uvmem.c | 1 + fs/userfaultfd.c | 6 ++++++ include/linux/mm.h | 10 +++++++--- mm/madvise.c | 5 ++--- mm/mlock.c | 3 ++- mm/mprotect.c | 1 + 6 files changed, 19 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 709ebd578394b6..e2d6f9327f778e 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -410,6 +410,7 @@ static int kvmppc_memslot_page_merge(struct kvm *kvm, ret = H_STATE; break; } + vma_start_write(vma); /* Copy vm_flags to avoid partial modifications in ksm_madvise */ vm_flags = vma->vm_flags; ret = ksm_madvise(vma, vma->vm_start, vma->vm_end, diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 9854d44ae18ecc..70bd2951b68d62 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -667,6 +667,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, mmap_write_lock(mm); for_each_vma(vmi, vma) { if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { + vma_start_write(vma); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); @@ -702,6 +703,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) octx = vma->vm_userfaultfd_ctx.ctx; if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) { + vma_start_write(vma); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); return 0; @@ -783,6 +785,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, atomic_inc(&ctx->mmap_changing); } else { /* Drop uffd context if remap feature not enabled */ + vma_start_write(vma); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS); } @@ -940,6 +943,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) prev = vma; } + vma_start_write(vma); userfaultfd_set_vm_flags(vma, 
new_flags); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; } @@ -1511,6 +1515,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, * the next vma was merged into the current one and * the current one has not been updated yet. */ + vma_start_write(vma); userfaultfd_set_vm_flags(vma, new_flags); vma->vm_userfaultfd_ctx.ctx = ctx; @@ -1694,6 +1699,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, * the next vma was merged into the current one and * the current one has not been updated yet. */ + vma_start_write(vma); userfaultfd_set_vm_flags(vma, new_flags); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; diff --git a/include/linux/mm.h b/include/linux/mm.h index 49eafc62b4e65f..5a6ff914009004 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -774,18 +774,22 @@ static inline void vm_flags_init(struct vm_area_struct *vma, ACCESS_PRIVATE(vma, __vm_flags) = flags; } -/* Use when VMA is part of the VMA tree and modifications need coordination */ +/* + * Use when VMA is part of the VMA tree and modifications need coordination + * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma and + * it should be locked explicitly beforehand. + */ static inline void vm_flags_reset(struct vm_area_struct *vma, vm_flags_t flags) { - vma_start_write(vma); + vma_assert_write_locked(vma); vm_flags_init(vma, flags); } static inline void vm_flags_reset_once(struct vm_area_struct *vma, vm_flags_t flags) { - vma_start_write(vma); + vma_assert_write_locked(vma); WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags); } diff --git a/mm/madvise.c b/mm/madvise.c index da65f8bd9ac33b..8498f700c284f0 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -173,9 +173,8 @@ static int madvise_update_vma(struct vm_area_struct *vma, } success: - /* - * vm_flags is protected by the mmap_lock held in write mode. - */ + /* vm_flags is protected by the mmap_lock held in write mode. 
*/ + vma_start_write(vma); vm_flags_reset(vma, new_flags); if (!vma->vm_file || vma_is_anon_shmem(vma)) { error = replace_anon_vma_name(vma, anon_name); diff --git a/mm/mlock.c b/mm/mlock.c index 0a0c996c5c2140..1746600a2e1470 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -386,6 +386,7 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma, */ if (newflags & VM_LOCKED) newflags |= VM_IO; + vma_start_write(vma); vm_flags_reset_once(vma, newflags); lru_add_drain(); @@ -460,9 +461,9 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, * It's okay if try_to_unmap_one unmaps a page just after we * set VM_LOCKED, populate_vma_page_range will bring it back. */ - if ((newflags & VM_LOCKED) && (oldflags & VM_LOCKED)) { /* No work to do, and mlocking twice would be wrong */ + vma_start_write(vma); vm_flags_reset(vma, newflags); } else { mlock_vma_pages_range(vma, start, end, newflags); diff --git a/mm/mprotect.c b/mm/mprotect.c index 3f36c88a238e97..7cd7f644da800d 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -656,6 +656,7 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, * vm_flags and vm_page_prot are protected by the mmap_lock * held in write mode. */ + vma_start_write(vma); vm_flags_reset(vma, newflags); if (vma_wants_manual_pte_write_upgrade(vma)) mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE; From ad9f006351c3368171458ae7ab14d72f031b239f Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 4 Aug 2023 08:27:23 -0700 Subject: [PATCH 329/489] mm: always lock new vma before inserting into vma tree While it's not strictly necessary to lock a newly created vma before adding it into the vma tree (as long as no further changes are performed to it), it seems like a good policy to lock it and prevent accidental changes after it becomes visible to the page faults. Lock the vma before adding it into the vma tree. 
[akpm@linux-foundation.org: fix reject fixing in vma_link(), per Jann] Link: https://lkml.kernel.org/r/20230804152724.3090321-6-surenb@google.com Suggested-by: Jann Horn Signed-off-by: Suren Baghdasaryan Reviewed-by: Liam R. Howlett Cc: Linus Torvalds Cc: Jann Horn Signed-off-by: Andrew Morton --- mm/mmap.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 35b6bc9c7c9510..ef584aca1cd32d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -401,6 +401,8 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) if (vma_iter_prealloc(&vmi, vma)) return -ENOMEM; + vma_start_write(vma); + vma_iter_store(&vmi, vma); if (vma->vm_file) { @@ -463,7 +465,8 @@ static inline void vma_prepare(struct vma_prepare *vp) vma_start_write(vp->vma); if (vp->adj_next) vma_start_write(vp->adj_next); - /* vp->insert is always a newly created VMA, no need for locking */ + if (vp->insert) + vma_start_write(vp->insert); if (vp->remove) vma_start_write(vp->remove); if (vp->remove2) @@ -3093,6 +3096,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, vma->vm_pgoff = addr >> PAGE_SHIFT; vm_flags_init(vma, flags); vma->vm_page_prot = vm_get_page_prot(flags); + vma_start_write(vma); if (vma_iter_store_gfp(vmi, vma, GFP_KERNEL)) goto mas_store_fail; @@ -3341,7 +3345,6 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, get_file(new_vma->vm_file); if (new_vma->vm_ops && new_vma->vm_ops->open) new_vma->vm_ops->open(new_vma); - vma_start_write(new_vma); if (vma_link(mm, new_vma)) goto out_vma_link; *need_rmap_locks = false; From c9d6e982c3f8703c24f488d3de15e0ee97f4655e Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 4 Aug 2023 08:27:24 -0700 Subject: [PATCH 330/489] mm: move vma locking out of vma_prepare and dup_anon_vma vma_prepare() is currently the central place where vmas are being locked before vma_complete() applies changes to them. 
While this is convenient, it also obscures vma locking and makes it harder to follow the locking rules. Move vma locking out of vma_prepare() and take vma locks explicitly at the locations where vmas are being modified. Move vma locking and replace it with an assertion inside dup_anon_vma() to further clarify the locking pattern inside vma_merge(). Link: https://lkml.kernel.org/r/20230804152724.3090321-7-surenb@google.com Suggested-by: Linus Torvalds Suggested-by: Liam R. Howlett Signed-off-by: Suren Baghdasaryan Cc: Jann Horn Signed-off-by: Andrew Morton --- mm/mmap.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index ef584aca1cd32d..514ced13c65c7c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -462,16 +462,6 @@ static inline void init_vma_prep(struct vma_prepare *vp, */ static inline void vma_prepare(struct vma_prepare *vp) { - vma_start_write(vp->vma); - if (vp->adj_next) - vma_start_write(vp->adj_next); - if (vp->insert) - vma_start_write(vp->insert); - if (vp->remove) - vma_start_write(vp->remove); - if (vp->remove2) - vma_start_write(vp->remove2); - if (vp->file) { uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); @@ -605,7 +595,7 @@ static inline int dup_anon_vma(struct vm_area_struct *dst, * anon pages imported. 
*/ if (src->anon_vma && !dst->anon_vma) { - vma_start_write(dst); + vma_assert_write_locked(dst); dst->anon_vma = src->anon_vma; return anon_vma_clone(dst, src); } @@ -637,10 +627,12 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, bool remove_next = false; struct vma_prepare vp; + vma_start_write(vma); if (next && (vma != next) && (end == next->vm_end)) { int ret; remove_next = true; + vma_start_write(next); ret = dup_anon_vma(vma, next); if (ret) return ret; @@ -696,6 +688,8 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, if (vma_iter_prealloc(vmi, NULL)) return -ENOMEM; + vma_start_write(vma); + init_vma_prep(&vp, vma); vma_prepare(&vp); vma_adjust_trans_huge(vma, start, end, 0); @@ -921,16 +915,21 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, if (!merge_prev && !merge_next) return NULL; /* Not mergeable. */ + if (merge_prev) + vma_start_write(prev); + res = vma = prev; remove = remove2 = adjust = NULL; /* Can we merge both the predecessor and the successor? 
*/ if (merge_prev && merge_next && is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { + vma_start_write(next); remove = next; /* case 1 */ vma_end = next->vm_end; err = dup_anon_vma(prev, next); if (curr) { /* case 6 */ + vma_start_write(curr); remove = curr; remove2 = next; if (!next->anon_vma) @@ -938,6 +937,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, } } else if (merge_prev) { /* case 2 */ if (curr) { + vma_start_write(curr); err = dup_anon_vma(prev, curr); if (end == curr->vm_end) { /* case 7 */ remove = curr; @@ -947,8 +947,10 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, } } } else { /* merge_next */ + vma_start_write(next); res = next; if (prev && addr < prev->vm_end) { /* case 4 */ + vma_start_write(prev); vma_end = addr; adjust = next; adj_start = -(prev->vm_end - addr); @@ -964,6 +966,7 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, vma_pgoff = next->vm_pgoff - pglen; if (curr) { /* case 8 */ vma_pgoff = curr->vm_pgoff; + vma_start_write(curr); remove = curr; err = dup_anon_vma(next, curr); } @@ -2366,6 +2369,9 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new); + vma_start_write(vma); + vma_start_write(new); + init_vma_prep(&vp, vma); vp.insert = new; vma_prepare(&vp); @@ -3071,6 +3077,8 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, if (vma_iter_prealloc(vmi, vma)) goto unacct_fail; + vma_start_write(vma); + init_vma_prep(&vp, vma); vma_prepare(&vp); vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); From 9a9d0b829901125553c36b9512b2a5da4505be31 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Mon, 7 Aug 2023 01:16:11 +0200 Subject: [PATCH 331/489] mm: move dummy_vm_ops out of a header Otherwise the kernel ends up with multiple copies: $ nm vmlinux | grep dummy_vm_ops ffffffff81e4ea00 d dummy_vm_ops.2 
ffffffff81e11760 d dummy_vm_ops.254 ffffffff81e406e0 d dummy_vm_ops.4 ffffffff81e3c780 d dummy_vm_ops.7 While here prefix it with vma_. Link: https://lkml.kernel.org/r/20230806231611.1395735-1-mjguzik@gmail.com Signed-off-by: Mateusz Guzik Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 +++--- mm/init-mm.c | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 5a6ff914009004..c63ec57a54dc4d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -751,17 +751,17 @@ static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, #endif /* CONFIG_PER_VMA_LOCK */ +extern const struct vm_operations_struct vma_dummy_vm_ops; + /* * WARNING: vma_init does not initialize vma->vm_lock. * Use vm_area_alloc()/vm_area_free() if vma needs locking. */ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) { - static const struct vm_operations_struct dummy_vm_ops = {}; - memset(vma, 0, sizeof(*vma)); vma->vm_mm = mm; - vma->vm_ops = &dummy_vm_ops; + vma->vm_ops = &vma_dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); vma_mark_detached(vma, false); vma_numab_state_init(vma); diff --git a/mm/init-mm.c b/mm/init-mm.c index efa97b57acfd88..cfd367822cdd2e 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -17,6 +17,8 @@ #define INIT_MM_CONTEXT(name) #endif +const struct vm_operations_struct vma_dummy_vm_ops; + /* * For dynamically allocated mm_structs, there is a dynamically sized cpumask * at the end of the structure, the size of which depends on the maximum CPU From 6379693e3c2683a7c86f395e878534731ac7ed06 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 7 Aug 2023 19:41:25 +0800 Subject: [PATCH 332/489] mm: memory-failure: use helper macro llist_for_each_entry_safe() It's more convenient to use helper macro llist_for_each_entry_safe(). No functional change intended. 
Link: https://lkml.kernel.org/r/20230807114125.3440802-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index adb0dacbc74e5c..976747d28ce7f2 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1868,13 +1868,12 @@ bool is_raw_hwpoison_page_in_hugepage(struct page *page) static unsigned long __folio_free_raw_hwp(struct folio *folio, bool move_flag) { - struct llist_node *t, *tnode, *head; + struct llist_node *head; + struct raw_hwp_page *p, *next; unsigned long count = 0; head = llist_del_all(raw_hwp_list_head(folio)); - llist_for_each_safe(tnode, t, head) { - struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node); - + llist_for_each_entry_safe(p, next, head, node) { if (move_flag) SetPageHWPoison(p->page); else @@ -1889,7 +1888,7 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page) { struct llist_head *head; struct raw_hwp_page *raw_hwp; - struct llist_node *t, *tnode; + struct raw_hwp_page *p, *next; int ret = folio_test_set_hwpoison(folio) ? -EHWPOISON : 0; /* @@ -1900,9 +1899,7 @@ static int folio_set_hugetlb_hwpoison(struct folio *folio, struct page *page) if (folio_test_hugetlb_raw_hwp_unreliable(folio)) return -EHWPOISON; head = raw_hwp_list_head(folio); - llist_for_each_safe(tnode, t, head->first) { - struct raw_hwp_page *p = container_of(tnode, struct raw_hwp_page, node); - + llist_for_each_entry_safe(p, next, head->first, node) { if (p->page == page) return -EHWPOISON; } From daee07bfba3340b07edcf9ae92044398e8a964db Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Mon, 7 Aug 2023 10:35:28 +0800 Subject: [PATCH 333/489] mm/mm_init: use helper macro BITS_PER_LONG and BITS_PER_BYTE It's more readable to use helper macro BITS_PER_LONG and BITS_PER_BYTE. No functional change intended. 
Link: https://lkml.kernel.org/r/20230807023528.325191-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- mm/mm_init.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/mm_init.c b/mm/mm_init.c index 2a19f3151661a0..50f2f34745afa9 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -79,7 +79,7 @@ void __init mminit_verify_pageflags_layout(void) int shift, width; unsigned long or_mask, add_mask; - shift = 8 * sizeof(unsigned long); + shift = BITS_PER_LONG; width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH; mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", @@ -1426,9 +1426,9 @@ static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned l usemapsize = roundup(zonesize, pageblock_nr_pages); usemapsize = usemapsize >> pageblock_order; usemapsize *= NR_PAGEBLOCK_BITS; - usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long)); + usemapsize = roundup(usemapsize, BITS_PER_LONG); - return usemapsize / 8; + return usemapsize / BITS_PER_BYTE; } static void __ref setup_usemap(struct zone *zone) From 3f32c49ed6f15c8412a8abc93a92c4b37e6c4592 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 8 Aug 2023 11:33:59 +0800 Subject: [PATCH 334/489] mm: memtest: convert to memtest_report_meminfo() It is better to not expose too many internal variables of memtest, add a helper memtest_report_meminfo() to show memtest results. 
Link: https://lkml.kernel.org/r/20230808033359.174986-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Mike Rapoport (IBM) Cc: Matthew Wilcox Cc: Tomas Mudrunka Signed-off-by: Andrew Morton --- fs/proc/meminfo.c | 12 +----------- include/linux/memblock.h | 10 ++++------ mm/memtest.c | 22 ++++++++++++++++++++-- 3 files changed, 25 insertions(+), 19 deletions(-) diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 74e3c3815696a6..45af9a989d4040 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -133,17 +133,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "VmallocChunk: ", 0ul); show_val_kb(m, "Percpu: ", pcpu_nr_pages()); -#ifdef CONFIG_MEMTEST - if (early_memtest_done) { - unsigned long early_memtest_bad_size_kb; - - early_memtest_bad_size_kb = early_memtest_bad_size>>10; - if (early_memtest_bad_size && !early_memtest_bad_size_kb) - early_memtest_bad_size_kb = 1; - /* When 0 is reported, it means there actually was a successful test */ - seq_printf(m, "EarlyMemtestBad: %5lu kB\n", early_memtest_bad_size_kb); - } -#endif + memtest_report_meminfo(m); #ifdef CONFIG_MEMORY_FAILURE seq_printf(m, "HardwareCorrupted: %5lu kB\n", diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 0d031fbfea2537..1c1072e3ca0635 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -594,13 +594,11 @@ extern int hashdist; /* Distribute hashes across NUMA nodes? */ #endif #ifdef CONFIG_MEMTEST -extern phys_addr_t early_memtest_bad_size; /* Size of faulty ram found by memtest */ -extern bool early_memtest_done; /* Was early memtest done? 
*/ -extern void early_memtest(phys_addr_t start, phys_addr_t end); +void early_memtest(phys_addr_t start, phys_addr_t end); +void memtest_report_meminfo(struct seq_file *m); #else -static inline void early_memtest(phys_addr_t start, phys_addr_t end) -{ -} +static inline void early_memtest(phys_addr_t start, phys_addr_t end) { } +static inline void memtest_report_meminfo(struct seq_file *m) { } #endif diff --git a/mm/memtest.c b/mm/memtest.c index 57149dfee43855..32f3e9dda8370f 100644 --- a/mm/memtest.c +++ b/mm/memtest.c @@ -3,9 +3,10 @@ #include #include #include +#include -bool early_memtest_done; -phys_addr_t early_memtest_bad_size; +static bool early_memtest_done; +static phys_addr_t early_memtest_bad_size; static u64 patterns[] __initdata = { /* The first entry has to be 0 to leave memtest with zeroed memory */ @@ -117,3 +118,20 @@ void __init early_memtest(phys_addr_t start, phys_addr_t end) do_one_pass(patterns[idx], start, end); } } + +void memtest_report_meminfo(struct seq_file *m) +{ + unsigned long early_memtest_bad_size_kb; + + if (!IS_ENABLED(CONFIG_PROC_FS)) + return; + + if (!early_memtest_done) + return; + + early_memtest_bad_size_kb = early_memtest_bad_size >> 10; + if (early_memtest_bad_size && !early_memtest_bad_size_kb) + early_memtest_bad_size_kb = 1; + /* When 0 is reported, it means there actually was a successful test */ + seq_printf(m, "EarlyMemtestBad: %5lu kB\n", early_memtest_bad_size_kb); +} From 97157d8908bc10dec22cc4479f4c5cc7db58a12c Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Tue, 8 Aug 2023 06:20:56 +0000 Subject: [PATCH 335/489] mm: zswap: update comment for struct zswap_entry Since commit 0bb488498c98 ("mm: zswap: remove zswap_header"), the 'offset' has been replaced by swpentry, update the comment for it, and also add comment for 'objcg'. 
Link: https://lkml.kernel.org/r/20230808062056.292950-1-xiujianfeng@huaweicloud.com Signed-off-by: Xiu Jianfeng Reviewed-by: Yosry Ahmed Acked-by: Nhat Pham Signed-off-by: Andrew Morton --- mm/zswap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/zswap.c b/mm/zswap.c index 8b6b1bc8a5f2f2..7300b98d4a03bd 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -182,7 +182,7 @@ struct zswap_pool { * page within zswap. * * rbnode - links the entry into red-black tree for the appropriate swap type - * offset - the swap offset for the entry. Index into the red-black tree. + * swpentry - associated swap entry, the offset indexes into the red-black tree * refcount - the number of outstanding reference to the entry. This is needed * to protect against premature freeing of the entry by code * concurrent calls to load, invalidate, and writeback. The lock @@ -195,6 +195,7 @@ struct zswap_pool { * pool - the zswap_pool the entry's data is in * handle - zpool allocation handle that stores the compressed page data * value - value of the same-value filled pages which have same content + * objcg - the obj_cgroup that the compressed memory is charged to * lru - handle to the pool's lru used to evict pages. */ struct zswap_entry { From 9af7c7426c2e49bad77cf7494fea85a773d1ded6 Mon Sep 17 00:00:00 2001 From: Jinliang Zheng Date: Tue, 8 Aug 2023 16:44:32 +0800 Subject: [PATCH 336/489] writeback: remove redundant checks for root memcg The check for root memcg will be done in wb_get_lookup(), so remove the redundant one to simplify the code. 
Link: https://lkml.kernel.org/r/20230808084431.1632934-1-alexjlzheng@tencent.com Signed-off-by: Jinliang Zheng Signed-off-by: Andrew Morton --- mm/backing-dev.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index fc44bfbf785ed2..1e3447bccdb14d 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -732,9 +732,6 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, might_alloc(gfp); - if (!memcg_css->parent) - return &bdi->wb; - do { wb = wb_get_lookup(bdi, memcg_css); } while (!wb && !cgwb_create(bdi, memcg_css, gfp)); From 04d5ea46a15149a12f79c686b6a1ffc9c3233272 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 8 Aug 2023 14:44:56 +0530 Subject: [PATCH 337/489] mm/memory_hotplug: simplify ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE kconfig Patch series "Add support for memmap on memory feature on ppc64", v8. This patch series updates the memmap on memory feature to fall back to memmap allocation outside the memory block if the alignment rules are not met. This makes the feature more useful on architectures like ppc64 where alignment rules are different with 64K page size. This patch (of 6): Instead of adding a menu entry with all supported architectures, add an mm/Kconfig variable and select it from the supported architectures. No functional change in this patch.
Link: https://lkml.kernel.org/r/20230808091501.287660-1-aneesh.kumar@linux.ibm.com Link: https://lkml.kernel.org/r/20230808091501.287660-2-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Acked-by: Michal Hocko Acked-by: David Hildenbrand Cc: Christophe Leroy Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Vishal Verma Signed-off-by: Andrew Morton --- arch/arm64/Kconfig | 4 +--- arch/x86/Kconfig | 4 +--- mm/Kconfig | 3 +++ 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 751d8c8821dbe0..a7a88322bf46af 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -78,6 +78,7 @@ config ARM64 select ARCH_INLINE_SPIN_UNLOCK_IRQ if !PREEMPTION select ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE if !PREEMPTION select ARCH_KEEP_MEMBLOCK + select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE select ARCH_USE_CMPXCHG_LOCKREF select ARCH_USE_GNU_PROPERTY select ARCH_USE_MEMTEST @@ -349,9 +350,6 @@ config GENERIC_CSUM config GENERIC_CALIBRATE_DELAY def_bool y -config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE - def_bool y - config SMP def_bool y diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 78224aa7640986..d0258e92a8afd8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -102,6 +102,7 @@ config X86 select ARCH_HAS_DEBUG_WX select ARCH_HAS_ZONE_DMA_SET if EXPERT select ARCH_HAVE_NMI_SAFE_CMPXCHG + select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO @@ -2610,9 +2611,6 @@ config ARCH_HAS_ADD_PAGES def_bool y depends on ARCH_ENABLE_MEMORY_HOTPLUG -config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE - def_bool y - menu "Power management and ACPI options" config ARCH_HIBERNATION_HEADER diff --git a/mm/Kconfig b/mm/Kconfig index 5fe49c030961ec..721dc88423c7e0 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -571,6 +571,9 @@ config MHP_MEMMAP_ON_MEMORY endif # MEMORY_HOTPLUG +config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE + bool + # Heavily threaded 
applications may benefit from splitting the mm-wide # page_table_lock, so that faults on different parts of the user address # space can be handled with less contention: split it at this NR_CPUS. From e3c2bfdd33a30b34674fb8839f5476ab2702c1c1 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 8 Aug 2023 14:44:57 +0530 Subject: [PATCH 338/489] mm/memory_hotplug: allow memmap on memory hotplug request to fallback If not supported, fall back to not using memmap on memory. This avoids the need for callers to do the fallback. Link: https://lkml.kernel.org/r/20230808091501.287660-3-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Acked-by: Michal Hocko Acked-by: David Hildenbrand Cc: Christophe Leroy Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Vishal Verma Signed-off-by: Andrew Morton --- drivers/acpi/acpi_memhotplug.c | 3 +-- include/linux/memory_hotplug.h | 3 ++- mm/memory_hotplug.c | 13 ++++++------- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c index 24f662d8bd39a6..d0c1a71007d0a3 100644 --- a/drivers/acpi/acpi_memhotplug.c +++ b/drivers/acpi/acpi_memhotplug.c @@ -211,8 +211,7 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) if (!info->length) continue; - if (mhp_supports_memmap_on_memory(info->length)) - mhp_flags |= MHP_MEMMAP_ON_MEMORY; + mhp_flags |= MHP_MEMMAP_ON_MEMORY; result = __add_memory(mgid, info->start_addr, info->length, mhp_flags); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 013c69753c9173..7d207658349416 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -97,6 +97,8 @@ typedef int __bitwise mhp_t; * To do so, we will use the beginning of the hot-added range to build * the page tables for the memmap array that describes the entire range. * Only selected architectures support it with SPARSE_VMEMMAP.
+ * This is only a hint, the core kernel can decide to not do this based on + * different alignment checks. */ #define MHP_MEMMAP_ON_MEMORY ((__force mhp_t)BIT(1)) /* @@ -354,7 +356,6 @@ extern struct zone *zone_for_pfn_range(int online_type, int nid, extern int arch_create_linear_mapping(int nid, u64 start, u64 size, struct mhp_params *params); void arch_remove_linear_mapping(u64 start, u64 size); -extern bool mhp_supports_memmap_on_memory(unsigned long size); #endif /* CONFIG_MEMORY_HOTPLUG */ #endif /* __LINUX_MEMORY_HOTPLUG_H */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 7cfd13c91568a2..eca32ccd45ccdf 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1247,7 +1247,7 @@ static int online_memory_block(struct memory_block *mem, void *arg) return device_online(&mem->dev); } -bool mhp_supports_memmap_on_memory(unsigned long size) +static bool mhp_supports_memmap_on_memory(unsigned long size) { unsigned long nr_vmemmap_pages = size / PAGE_SIZE; unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page); @@ -1339,13 +1339,12 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) * Self hosted memmap array */ if (mhp_flags & MHP_MEMMAP_ON_MEMORY) { - if (!mhp_supports_memmap_on_memory(size)) { - ret = -EINVAL; - goto error; + if (mhp_supports_memmap_on_memory(size)) { + mhp_altmap.free = PHYS_PFN(size); + mhp_altmap.base_pfn = PHYS_PFN(start); + params.altmap = &mhp_altmap; } - mhp_altmap.free = PHYS_PFN(size); - mhp_altmap.base_pfn = PHYS_PFN(start); - params.altmap = &mhp_altmap; + /* fallback to not using altmap */ } /* call arch's memory hotadd */ From 85a2b4b08f202d67be81e2453064e01572ec19c8 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 8 Aug 2023 14:44:58 +0530 Subject: [PATCH 339/489] mm/memory_hotplug: allow architecture to override memmap on memory support check Some architectures would want different restrictions. Hence add an architecture-specific override. 
The PMD_SIZE check is moved there. Link: https://lkml.kernel.org/r/20230808091501.287660-4-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Acked-by: Michal Hocko Acked-by: David Hildenbrand Cc: Christophe Leroy Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Vishal Verma Signed-off-by: Andrew Morton --- mm/memory_hotplug.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index eca32ccd45ccdf..746cb7c08c64f7 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1247,10 +1247,26 @@ static int online_memory_block(struct memory_block *mem, void *arg) return device_online(&mem->dev); } +static inline unsigned long memory_block_memmap_size(void) +{ + return PHYS_PFN(memory_block_size_bytes()) * sizeof(struct page); +} + +#ifndef arch_supports_memmap_on_memory +static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size) +{ + /* + * As default, we want the vmemmap to span a complete PMD such that we + * can map the vmemmap using a single PMD if supported by the + * architecture. 
+ */ + return IS_ALIGNED(vmemmap_size, PMD_SIZE); +} +#endif + static bool mhp_supports_memmap_on_memory(unsigned long size) { - unsigned long nr_vmemmap_pages = size / PAGE_SIZE; - unsigned long vmemmap_size = nr_vmemmap_pages * sizeof(struct page); + unsigned long vmemmap_size = memory_block_memmap_size(); unsigned long remaining_size = size - vmemmap_size; /* @@ -1281,8 +1297,8 @@ static bool mhp_supports_memmap_on_memory(unsigned long size) */ return mhp_memmap_on_memory() && size == memory_block_size_bytes() && - IS_ALIGNED(vmemmap_size, PMD_SIZE) && - IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)); + IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)) && + arch_supports_memmap_on_memory(vmemmap_size); } /* From 2d1f649c7c0855751c7ff43f4e34784061bc72f7 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 8 Aug 2023 14:44:59 +0530 Subject: [PATCH 340/489] mm/memory_hotplug: support memmap_on_memory when memmap is not aligned to pageblocks Currently, memmap_on_memory feature is only supported with memory block sizes that result in vmemmap pages covering full page blocks. This is because memory onlining/offlining code requires applicable ranges to be pageblock-aligned, for example, to set the migratetypes properly. This patch helps to lift that restriction by reserving more pages than required for vmemmap space. This helps the start address to be page block aligned with different memory block sizes. Using this facility implies the kernel will be reserving some pages for every memory block. This allows the memmap on memory feature to be widely useful with different memory block size values. For example, with 64K page size and 256MiB memory block size, we require 4 pages to map vmemmap pages. To align things correctly we end up adding a reserve of 28 pages, i.e., for every 4096 pages 28 pages get reserved.
Link: https://lkml.kernel.org/r/20230808091501.287660-5-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Acked-by: Michal Hocko Acked-by: David Hildenbrand Cc: Christophe Leroy Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Vishal Verma Signed-off-by: Andrew Morton --- .../admin-guide/mm/memory-hotplug.rst | 12 ++ mm/memory_hotplug.c | 120 +++++++++++++++--- 2 files changed, 113 insertions(+), 19 deletions(-) diff --git a/Documentation/admin-guide/mm/memory-hotplug.rst b/Documentation/admin-guide/mm/memory-hotplug.rst index bd77841041af04..2994958c7ce854 100644 --- a/Documentation/admin-guide/mm/memory-hotplug.rst +++ b/Documentation/admin-guide/mm/memory-hotplug.rst @@ -433,6 +433,18 @@ The following module parameters are currently defined: memory in a way that huge pages in bigger granularity cannot be formed on hotplugged memory. + + With value "force" it could result in memory + wastage due to memmap size limitations. For + example, if the memmap for a memory block + requires 1 MiB, but the pageblock size is 2 + MiB, 1 MiB of hotplugged memory will be wasted. + Note that there are still cases where the + feature cannot be enforced: for example, if the + memmap is smaller than a single page, or if the + architecture does not support the forced mode + in all configurations. + ``online_policy`` read-write: Set the basic policy used for automatic zone selection when onlining memory blocks without specifying a target zone. 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 746cb7c08c64f7..76b813991bdc4b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -41,17 +41,83 @@ #include "internal.h" #include "shuffle.h" +enum { + MEMMAP_ON_MEMORY_DISABLE = 0, + MEMMAP_ON_MEMORY_ENABLE, + MEMMAP_ON_MEMORY_FORCE, +}; + +static int memmap_mode __read_mostly = MEMMAP_ON_MEMORY_DISABLE; + +static inline unsigned long memory_block_memmap_size(void) +{ + return PHYS_PFN(memory_block_size_bytes()) * sizeof(struct page); +} + +static inline unsigned long memory_block_memmap_on_memory_pages(void) +{ + unsigned long nr_pages = PFN_UP(memory_block_memmap_size()); + + /* + * In "forced" memmap_on_memory mode, we add extra pages to align the + * vmemmap size to cover full pageblocks. That way, we can add memory + * even if the vmemmap size is not properly aligned, however, we might waste + * memory. + */ + if (memmap_mode == MEMMAP_ON_MEMORY_FORCE) + return pageblock_align(nr_pages); + return nr_pages; +} + #ifdef CONFIG_MHP_MEMMAP_ON_MEMORY /* * memory_hotplug.memmap_on_memory parameter */ -static bool memmap_on_memory __ro_after_init; -module_param(memmap_on_memory, bool, 0444); -MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug"); +static int set_memmap_mode(const char *val, const struct kernel_param *kp) +{ + int ret, mode; + bool enabled; + + if (sysfs_streq(val, "force") || sysfs_streq(val, "FORCE")) { + mode = MEMMAP_ON_MEMORY_FORCE; + } else { + ret = kstrtobool(val, &enabled); + if (ret < 0) + return ret; + if (enabled) + mode = MEMMAP_ON_MEMORY_ENABLE; + else + mode = MEMMAP_ON_MEMORY_DISABLE; + } + *((int *)kp->arg) = mode; + if (mode == MEMMAP_ON_MEMORY_FORCE) { + unsigned long memmap_pages = memory_block_memmap_on_memory_pages(); + + pr_info_once("Memory hotplug will waste %ld pages in each memory block\n", + memmap_pages - PFN_UP(memory_block_memmap_size())); + } + return 0; +} + +static int get_memmap_mode(char *buffer, const struct 
kernel_param *kp) +{ + if (*((int *)kp->arg) == MEMMAP_ON_MEMORY_FORCE) + return sprintf(buffer, "force\n"); + return param_get_bool(buffer, kp); +} + +static const struct kernel_param_ops memmap_mode_ops = { + .set = set_memmap_mode, + .get = get_memmap_mode, +}; +module_param_cb(memmap_on_memory, &memmap_mode_ops, &memmap_mode, 0444); +MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug\n" + "With value \"force\" it could result in memory wastage due " + "to memmap size limitations (Y/N/force)"); static inline bool mhp_memmap_on_memory(void) { - return memmap_on_memory; + return memmap_mode != MEMMAP_ON_MEMORY_DISABLE; } #else static inline bool mhp_memmap_on_memory(void) @@ -1247,11 +1313,6 @@ static int online_memory_block(struct memory_block *mem, void *arg) return device_online(&mem->dev); } -static inline unsigned long memory_block_memmap_size(void) -{ - return PHYS_PFN(memory_block_size_bytes()) * sizeof(struct page); -} - #ifndef arch_supports_memmap_on_memory static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size) { @@ -1267,7 +1328,7 @@ static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size) static bool mhp_supports_memmap_on_memory(unsigned long size) { unsigned long vmemmap_size = memory_block_memmap_size(); - unsigned long remaining_size = size - vmemmap_size; + unsigned long memmap_pages = memory_block_memmap_on_memory_pages(); /* * Besides having arch support and the feature enabled at runtime, we @@ -1295,10 +1356,28 @@ static bool mhp_supports_memmap_on_memory(unsigned long size) * altmap as an alternative source of memory, and we do not exactly * populate a single PMD. 
*/ - return mhp_memmap_on_memory() && - size == memory_block_size_bytes() && - IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)) && - arch_supports_memmap_on_memory(vmemmap_size); + if (!mhp_memmap_on_memory() || size != memory_block_size_bytes()) + return false; + + /* + * Make sure the vmemmap allocation is fully contained + * so that we always allocate vmemmap memory from altmap area. + */ + if (!IS_ALIGNED(vmemmap_size, PAGE_SIZE)) + return false; + + /* + * start pfn should be pageblock_nr_pages aligned for correctly + * setting migrate types + */ + if (!pageblock_aligned(memmap_pages)) + return false; + + if (memmap_pages == PHYS_PFN(memory_block_size_bytes())) + /* No effective hotplugged memory doesn't make sense. */ + return false; + + return arch_supports_memmap_on_memory(vmemmap_size); } /* @@ -1311,7 +1390,10 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) { struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) }; enum memblock_flags memblock_flags = MEMBLOCK_NONE; - struct vmem_altmap mhp_altmap = {}; + struct vmem_altmap mhp_altmap = { + .base_pfn = PHYS_PFN(res->start), + .end_pfn = PHYS_PFN(res->end), + }; struct memory_group *group = NULL; u64 start, size; bool new_node = false; @@ -1356,8 +1438,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) */ if (mhp_flags & MHP_MEMMAP_ON_MEMORY) { if (mhp_supports_memmap_on_memory(size)) { - mhp_altmap.free = PHYS_PFN(size); - mhp_altmap.base_pfn = PHYS_PFN(start); + mhp_altmap.free = memory_block_memmap_on_memory_pages(); params.altmap = &mhp_altmap; } /* fallback to not using altmap */ @@ -1369,8 +1450,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) goto error; /* create memory block devices after memory was added */ - ret = create_memory_block_devices(start, size, mhp_altmap.alloc, - group); + ret = create_memory_block_devices(start, size, mhp_altmap.free, group); if (ret) { 
arch_remove_memory(start, size, NULL); goto error; @@ -2096,6 +2176,8 @@ static int __ref try_remove_memory(u64 start, u64 size) * right thing if we used vmem_altmap when hot-adding * the range. */ + mhp_altmap.base_pfn = PHYS_PFN(start); + mhp_altmap.free = nr_vmemmap_pages; mhp_altmap.alloc = nr_vmemmap_pages; altmap = &mhp_altmap; } From 603fd64dfa45d1e9df996911e4010f2b00731387 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 8 Aug 2023 14:45:00 +0530 Subject: [PATCH 341/489] powerpc/book3s64/memhotplug: enable memmap on memory for radix Radix vmemmap mapping can map things correctly at the PMD level or PTE level based on different device boundary checks. Hence we skip the restrictions w.r.t vmemmap size to be multiple of PMD_SIZE. This also makes the feature widely useful because to use PMD_SIZE vmemmap area we require a memory block size of 2GiB. We can also use MHP_RESERVE_PAGES_MEMMAP_ON_MEMORY so that the feature can work with a memory block size of 256MB. Using altmap.reserve feature to align things correctly at pageblock granularity. We can end up losing some pages in memory with this. For ex: with a 256MiB memory block size, we require 4 pages to map vmemmap pages, in order to align things correctly we end up adding a reserve of 28 pages. i.e., for every 4096 pages 28 pages get reserved.
Link: https://lkml.kernel.org/r/20230808091501.287660-6-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Reviewed-by: David Hildenbrand Cc: Christophe Leroy Cc: Michael Ellerman Cc: Michal Hocko Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Vishal Verma Signed-off-by: Andrew Morton --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/pgtable.h | 21 +++++++++++++++++++ .../platforms/pseries/hotplug-memory.c | 2 +- 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index d0497d13f5b494..938294c996dca3 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -157,6 +157,7 @@ config PPC select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_KEEP_MEMBLOCK + select ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE if PPC_RADIX_MMU select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index a4893b17705a26..33464e6d64315a 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -161,6 +161,27 @@ static inline pgtable_t pmd_pgtable(pmd_t pmd) int __meminit vmemmap_populated(unsigned long vmemmap_addr, int vmemmap_map_size); bool altmap_cross_boundary(struct vmem_altmap *altmap, unsigned long start, unsigned long page_size); +/* + * mm/memory_hotplug.c:mhp_supports_memmap_on_memory goes into details + * some of the restrictions. We don't check for PMD_SIZE because our + * vmemmap allocation code can fallback correctly. The pageblock + * alignment requirement is met using altmap->reserve blocks. 
+ */ +#define arch_supports_memmap_on_memory arch_supports_memmap_on_memory +static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size) +{ + if (!radix_enabled()) + return false; + /* + * With 4K page size and 2M PMD_SIZE, we can align + * things better with memory block size value + * starting from 128MB. Hence align things with PMD_SIZE. + */ + if (IS_ENABLED(CONFIG_PPC_4K_PAGES)) + return IS_ALIGNED(vmemmap_size, PMD_SIZE); + return true; +} + #endif /* CONFIG_PPC64 */ #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 9c62c2c3b3d0bb..4f3d6a2f9065b0 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -637,7 +637,7 @@ static int dlpar_add_lmb(struct drmem_lmb *lmb) nid = first_online_node; /* Add the memory */ - rc = __add_memory(nid, lmb->base_addr, block_sz, MHP_NONE); + rc = __add_memory(nid, lmb->base_addr, block_sz, MHP_MEMMAP_ON_MEMORY); if (rc) { invalidate_lmb_associativity_index(lmb); return rc; From 1a8c64e110435e44e71bcd50a75663174b575f22 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 8 Aug 2023 14:45:01 +0530 Subject: [PATCH 342/489] mm/memory_hotplug: embed vmem_altmap details in memory block With memmap on memory, some architecture needs more details w.r.t altmap such as base_pfn, end_pfn, etc to unmap vmemmap memory. Instead of computing them again when we remove a memory block, embed vmem_altmap details in struct memory_block if we are using memmap on memory block feature. 
[yangyingliang@huawei.com: fix error return code in add_memory_resource()] Link: https://lkml.kernel.org/r/20230809081552.1351184-1-yangyingliang@huawei.com Link: https://lkml.kernel.org/r/20230808091501.287660-7-aneesh.kumar@linux.ibm.com Signed-off-by: Aneesh Kumar K.V Signed-off-by: Yang Yingliang Acked-by: Michal Hocko Acked-by: David Hildenbrand Cc: Christophe Leroy Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Oscar Salvador Cc: Vishal Verma Signed-off-by: Andrew Morton --- drivers/base/memory.c | 27 ++++++++++++-------- include/linux/memory.h | 8 ++---- mm/memory_hotplug.c | 56 ++++++++++++++++++++++++++---------------- 3 files changed, 54 insertions(+), 37 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index b456ac21361058..8191709c9ad283 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -105,7 +105,8 @@ EXPORT_SYMBOL(unregister_memory_notifier); static void memory_block_release(struct device *dev) { struct memory_block *mem = to_memory_block(dev); - + /* Verify that the altmap is freed */ + WARN_ON(mem->altmap); kfree(mem); } @@ -183,7 +184,7 @@ static int memory_block_online(struct memory_block *mem) { unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; - unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages; + unsigned long nr_vmemmap_pages = 0; struct zone *zone; int ret; @@ -200,6 +201,9 @@ static int memory_block_online(struct memory_block *mem) * stage helps to keep accounting easier to follow - e.g vmemmaps * belong to the same zone as the memory they backed. 
*/ + if (mem->altmap) + nr_vmemmap_pages = mem->altmap->free; + if (nr_vmemmap_pages) { ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages, zone); if (ret) @@ -230,7 +234,7 @@ static int memory_block_offline(struct memory_block *mem) { unsigned long start_pfn = section_nr_to_pfn(mem->start_section_nr); unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; - unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages; + unsigned long nr_vmemmap_pages = 0; int ret; if (!mem->zone) @@ -240,6 +244,9 @@ static int memory_block_offline(struct memory_block *mem) * Unaccount before offlining, such that unpopulated zone and kthreads * can properly be torn down in offline_pages(). */ + if (mem->altmap) + nr_vmemmap_pages = mem->altmap->free; + if (nr_vmemmap_pages) adjust_present_page_count(pfn_to_page(start_pfn), mem->group, -nr_vmemmap_pages); @@ -726,7 +733,7 @@ void memory_block_add_nid(struct memory_block *mem, int nid, #endif static int add_memory_block(unsigned long block_id, unsigned long state, - unsigned long nr_vmemmap_pages, + struct vmem_altmap *altmap, struct memory_group *group) { struct memory_block *mem; @@ -744,7 +751,7 @@ static int add_memory_block(unsigned long block_id, unsigned long state, mem->start_section_nr = block_id * sections_per_block; mem->state = state; mem->nid = NUMA_NO_NODE; - mem->nr_vmemmap_pages = nr_vmemmap_pages; + mem->altmap = altmap; INIT_LIST_HEAD(&mem->group_next); #ifndef CONFIG_NUMA @@ -783,14 +790,14 @@ static int __init add_boot_memory_block(unsigned long base_section_nr) if (section_count == 0) return 0; return add_memory_block(memory_block_id(base_section_nr), - MEM_ONLINE, 0, NULL); + MEM_ONLINE, NULL, NULL); } static int add_hotplug_memory_block(unsigned long block_id, - unsigned long nr_vmemmap_pages, + struct vmem_altmap *altmap, struct memory_group *group) { - return add_memory_block(block_id, MEM_OFFLINE, nr_vmemmap_pages, group); + return add_memory_block(block_id, MEM_OFFLINE, altmap, group); } static 
void remove_memory_block(struct memory_block *memory) @@ -818,7 +825,7 @@ static void remove_memory_block(struct memory_block *memory) * Called under device_hotplug_lock. */ int create_memory_block_devices(unsigned long start, unsigned long size, - unsigned long vmemmap_pages, + struct vmem_altmap *altmap, struct memory_group *group) { const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start)); @@ -832,7 +839,7 @@ int create_memory_block_devices(unsigned long start, unsigned long size, return -EINVAL; for (block_id = start_block_id; block_id != end_block_id; block_id++) { - ret = add_hotplug_memory_block(block_id, vmemmap_pages, group); + ret = add_hotplug_memory_block(block_id, altmap, group); if (ret) break; } diff --git a/include/linux/memory.h b/include/linux/memory.h index 31343566c22126..f53cfdaaaa4166 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -77,11 +77,7 @@ struct memory_block { */ struct zone *zone; struct device dev; - /* - * Number of vmemmap pages. These pages - * lay at the beginning of the memory block. 
- */ - unsigned long nr_vmemmap_pages; + struct vmem_altmap *altmap; struct memory_group *group; /* group (if any) for this block */ struct list_head group_next; /* next block inside memory group */ #if defined(CONFIG_MEMORY_FAILURE) && defined(CONFIG_MEMORY_HOTPLUG) @@ -147,7 +143,7 @@ static inline int hotplug_memory_notifier(notifier_fn_t fn, int pri) extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); int create_memory_block_devices(unsigned long start, unsigned long size, - unsigned long vmemmap_pages, + struct vmem_altmap *altmap, struct memory_group *group); void remove_memory_block_devices(unsigned long start, unsigned long size); extern void memory_dev_init(void); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 76b813991bdc4b..1b03f4ec6fd21b 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1439,7 +1439,13 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) if (mhp_flags & MHP_MEMMAP_ON_MEMORY) { if (mhp_supports_memmap_on_memory(size)) { mhp_altmap.free = memory_block_memmap_on_memory_pages(); - params.altmap = &mhp_altmap; + params.altmap = kmalloc(sizeof(struct vmem_altmap), GFP_KERNEL); + if (!params.altmap) { + ret = -ENOMEM; + goto error; + } + + memcpy(params.altmap, &mhp_altmap, sizeof(mhp_altmap)); } /* fallback to not using altmap */ } @@ -1447,13 +1453,13 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags) /* call arch's memory hotadd */ ret = arch_add_memory(nid, start, size, &params); if (ret < 0) - goto error; + goto error_free; /* create memory block devices after memory was added */ - ret = create_memory_block_devices(start, size, mhp_altmap.free, group); + ret = create_memory_block_devices(start, size, params.altmap, group); if (ret) { arch_remove_memory(start, size, NULL); - goto error; + goto error_free; } if (new_node) { @@ -1490,6 +1496,8 @@ int __ref add_memory_resource(int nid,
struct resource *res, mhp_t mhp_flags) walk_memory_blocks(start, size, NULL, online_memory_block); return ret; +error_free: + kfree(params.altmap); error: if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) memblock_remove(start, size); @@ -2056,12 +2064,18 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) return 0; } -static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg) +static int test_has_altmap_cb(struct memory_block *mem, void *arg) { + struct memory_block **mem_ptr = (struct memory_block **)arg; /* - * If not set, continue with the next block. + * return the memblock if we have altmap + * and break callback. */ - return mem->nr_vmemmap_pages; + if (mem->altmap) { + *mem_ptr = mem; + return 1; + } + return 0; } static int check_cpu_on_node(int nid) @@ -2136,10 +2150,9 @@ EXPORT_SYMBOL(try_offline_node); static int __ref try_remove_memory(u64 start, u64 size) { - struct vmem_altmap mhp_altmap = {}; - struct vmem_altmap *altmap = NULL; - unsigned long nr_vmemmap_pages; + struct memory_block *mem; int rc = 0, nid = NUMA_NO_NODE; + struct vmem_altmap *altmap = NULL; BUG_ON(check_hotplug_memory_range(start, size)); @@ -2161,25 +2174,20 @@ static int __ref try_remove_memory(u64 start, u64 size) * the same granularity it was added - a single memory block. */ if (mhp_memmap_on_memory()) { - nr_vmemmap_pages = walk_memory_blocks(start, size, NULL, - get_nr_vmemmap_pages_cb); - if (nr_vmemmap_pages) { + rc = walk_memory_blocks(start, size, &mem, test_has_altmap_cb); + if (rc) { if (size != memory_block_size_bytes()) { pr_warn("Refuse to remove %#llx - %#llx," "wrong granularity\n", start, start + size); return -EINVAL; } - + altmap = mem->altmap; /* - * Let remove_pmd_table->free_hugepage_table do the - * right thing if we used vmem_altmap when hot-adding - * the range. + * Mark altmap NULL so that we can add a debug + * check on memblock free. 
*/ - mhp_altmap.base_pfn = PHYS_PFN(start); - mhp_altmap.free = nr_vmemmap_pages; - mhp_altmap.alloc = nr_vmemmap_pages; - altmap = &mhp_altmap; + mem->altmap = NULL; } } @@ -2196,6 +2204,12 @@ static int __ref try_remove_memory(u64 start, u64 size) arch_remove_memory(start, size, altmap); + /* Verify that all vmemmap pages have actually been freed. */ + if (altmap) { + WARN(altmap->alloc, "Altmap not fully unmapped"); + kfree(altmap); + } + if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) { memblock_phys_free(start, size); memblock_remove(start, size); From f142b2c2530c1383a45e1ada1d641974b9723a35 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 9 Aug 2023 18:07:53 +0800 Subject: [PATCH 343/489] mm/page_alloc: remove track of active PCP lists range in bulk free Patch series "Two minor cleanups for pcp list in page_alloc". There are two minor cleanups for pcp list in page_alloc. More details can be found in respective patches. This patch (of 2): After commit fd56eef258a17 ("mm/page_alloc: simplify how many pages are selected per pcp list during bulk free"), we will drain all pages in selected pcp list. And we ensured passed count is < pcp->count. Then, the search will finish before wrap-around and track of active PCP lists range intended for wrap-around case is no longer needed. 
Link: https://lkml.kernel.org/r/20230809100754.3094517-1-shikemeng@huaweicloud.com Link: https://lkml.kernel.org/r/20230809100754.3094517-2-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Baolin Wang Cc: David Hildenbrand Cc: Matthew Wilcox Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/page_alloc.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 94f9e159a18df7..bc782bffaf0268 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1207,8 +1207,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, int pindex) { unsigned long flags; - int min_pindex = 0; - int max_pindex = NR_PCP_LISTS - 1; unsigned int order; bool isolated_pageblocks; struct page *page; @@ -1231,17 +1229,10 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* Remove pages from lists in a round-robin fashion. */ do { - if (++pindex > max_pindex) - pindex = min_pindex; + if (++pindex > NR_PCP_LISTS - 1) + pindex = 0; list = &pcp->lists[pindex]; - if (!list_empty(list)) - break; - - if (pindex == max_pindex) - max_pindex--; - if (pindex == min_pindex) - min_pindex++; - } while (1); + } while (list_empty(list)); order = pindex_to_order(pindex); nr_pages = 1 << order; From 1305870529d9e16170bb744148aab6dffb19bb23 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 9 Aug 2023 18:07:54 +0800 Subject: [PATCH 344/489] mm/page_alloc: remove unnecessary parameter batch of nr_pcp_free We get batch from pcp and just pass it to nr_pcp_free immediately. Get batch from pcp inside nr_pcp_free to remove unnecessary parameter batch. 
Link: https://lkml.kernel.org/r/20230809100754.3094517-3-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Baolin Wang Cc: David Hildenbrand Cc: Matthew Wilcox Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/page_alloc.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bc782bffaf0268..cfadde2fe3a2a3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2340,10 +2340,10 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn, return true; } -static int nr_pcp_free(struct per_cpu_pages *pcp, int high, int batch, - bool free_high) +static int nr_pcp_free(struct per_cpu_pages *pcp, int high, bool free_high) { int min_nr_free, max_nr_free; + int batch = READ_ONCE(pcp->batch); /* Free everything if batch freeing high-order pages. */ if (unlikely(free_high)) @@ -2410,9 +2410,7 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, high = nr_pcp_high(pcp, zone, free_high); if (pcp->count >= high) { - int batch = READ_ONCE(pcp->batch); - - free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch, free_high), pcp, pindex); + free_pcppages_bulk(zone, nr_pcp_free(pcp, high, free_high), pcp, pindex); } } From 8fbb92bd10be26d0feec6bc35332159145c27cc0 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Wed, 9 Aug 2023 17:49:10 +0800 Subject: [PATCH 345/489] mm/compaction: remove unused parameter pgdata of fragmentation_score_wmark Parameter pgdat is not used in fragmentation_score_wmark. Just remove it. 
Link: https://lkml.kernel.org/r/20230809094910.3092446-1-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Reviewed-by: David Hildenbrand Acked-by: Mel Gorman Reviewed-by: Baolin Wang Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/compaction.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index fe7b4e7c5d2401..216081ab325aaa 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2140,7 +2140,7 @@ static unsigned int fragmentation_score_node(pg_data_t *pgdat) return score; } -static unsigned int fragmentation_score_wmark(pg_data_t *pgdat, bool low) +static unsigned int fragmentation_score_wmark(bool low) { unsigned int wmark_low; @@ -2160,7 +2160,7 @@ static bool should_proactive_compact_node(pg_data_t *pgdat) if (!sysctl_compaction_proactiveness || kswapd_is_running(pgdat)) return false; - wmark_high = fragmentation_score_wmark(pgdat, false); + wmark_high = fragmentation_score_wmark(false); return fragmentation_score_node(pgdat) > wmark_high; } @@ -2199,7 +2199,7 @@ static enum compact_result __compact_finished(struct compact_control *cc) return COMPACT_PARTIAL_SKIPPED; score = fragmentation_score_zone(cc->zone); - wmark_low = fragmentation_score_wmark(pgdat, true); + wmark_low = fragmentation_score_wmark(true); if (score > wmark_low) ret = COMPACT_CONTINUE; From b7108d66318abf3e060c7839eabcba52e9461568 Mon Sep 17 00:00:00 2001 From: Charan Teja Kalla Date: Wed, 9 Aug 2023 13:35:44 +0530 Subject: [PATCH 346/489] Multi-gen LRU: skip CMA pages when they are not eligible This patch is based on the commit 5da226dbfce3("mm: skip CMA pages when they are not available") which skips cma pages reclaim when they are not eligible for the current allocation context. 
In mglru, such pages are added to the tail of the immediate generation to maintain better LRU order, which is unlike the case of conventional LRU where such pages are directly added to the head of the LRU list (akin to adding to head of the youngest generation in mglru). No observable issue without this patch on MGLRU, but logically it makes sense to skip the CMA page reclaim when those pages can't be satisfied for the current allocation context. Link: https://lkml.kernel.org/r/1691568344-13475-1-git-send-email-quic_charante@quicinc.com Fixes: ac35a4902374 ("mm: multi-gen LRU: minimal implementation") Signed-off-by: Charan Teja Kalla Reviewed-by: Kalesh Singh Cc: David Hildenbrand Cc: Suren Baghdasaryan Cc: Yu Zhao Cc: Zhaoyang Huang Signed-off-by: Andrew Morton --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index b4329f93a682e3..6cbe921ef66297 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4943,7 +4943,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c } /* ineligible */ - if (zone > sc->reclaim_idx) { + if (zone > sc->reclaim_idx || skip_cma(folio, sc)) { gen = folio_inc_gen(lruvec, folio, false); list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); return true; From 368d983b985572e33422432d849f5956268bce21 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Wed, 9 Aug 2023 15:33:23 +0800 Subject: [PATCH 347/489] mm: page_alloc: remove unused parameter from reserve_highatomic_pageblock() Just remove the redundant parameter alloc_order from reserve_highatomic_pageblock(). No functional modification involved.
Link: https://lkml.kernel.org/r/20230809073323.1065286-1-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Cc: Kefeng Wang Cc: Mel Gorman Cc: Nanyong Sun Signed-off-by: Andrew Morton --- mm/page_alloc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cfadde2fe3a2a3..a5d14ca13946dd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1899,8 +1899,7 @@ int find_suitable_fallback(struct free_area *area, unsigned int order, * Reserve a pageblock for exclusive use of high-order atomic allocations if * there are no empty page blocks that contain a page with a suitable order */ -static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, - unsigned int alloc_order) +static void reserve_highatomic_pageblock(struct page *page, struct zone *zone) { int mt; unsigned long max_managed, flags; @@ -3210,7 +3209,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, * if the pageblock should be reserved for the future */ if (unlikely(alloc_flags & ALLOC_HIGHATOMIC)) - reserve_highatomic_pageblock(page, zone, order); + reserve_highatomic_pageblock(page, zone); return page; } else { From a04d12c2481fbf2752b5686d8a8049dd59e61e37 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 11 Aug 2023 19:59:44 +0800 Subject: [PATCH 348/489] mm/page_alloc: remove unnecessary inner __get_pfnblock_flags_mask Patch series "Two minor cleanups for get pageblock migratetype". This series contains two minor cleanups for get pageblock migratetype. More details can be found in respective patches. This patch (of 2): get_pfnblock_flags_mask() just calls inline inner __get_pfnblock_flags_mask without any extra work. Just opencode __get_pfnblock_flags_mask in get_pfnblock_flags_mask and replace call to __get_pfnblock_flags_mask with call to get_pfnblock_flags_mask to remove unnecessary __get_pfnblock_flags_mask. 
Link: https://lkml.kernel.org/r/20230811115945.3423894-1-shikemeng@huaweicloud.com Link: https://lkml.kernel.org/r/20230811115945.3423894-2-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Acked-by: Mel Gorman Reviewed-by: Matthew Wilcox (Oracle) Cc: Baolin Wang Cc: David Hildenbrand Signed-off-by: Andrew Morton --- mm/page_alloc.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a5d14ca13946dd..5e67fe937e1952 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -371,10 +371,16 @@ static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn) return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; } -static __always_inline -unsigned long __get_pfnblock_flags_mask(const struct page *page, - unsigned long pfn, - unsigned long mask) +/** + * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages + * @page: The page within the block of interest + * @pfn: The target page frame number + * @mask: mask of bits that the caller is interested in + * + * Return: pageblock_bits flags + */ +unsigned long get_pfnblock_flags_mask(const struct page *page, + unsigned long pfn, unsigned long mask) { unsigned long *bitmap; unsigned long bitidx, word_bitidx; @@ -393,24 +399,10 @@ unsigned long __get_pfnblock_flags_mask(const struct page *page, return (word >> bitidx) & mask; } -/** - * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages - * @page: The page within the block of interest - * @pfn: The target page frame number - * @mask: mask of bits that the caller is interested in - * - * Return: pageblock_bits flags - */ -unsigned long get_pfnblock_flags_mask(const struct page *page, - unsigned long pfn, unsigned long mask) -{ - return __get_pfnblock_flags_mask(page, pfn, mask); -} - static __always_inline int get_pfnblock_migratetype(const struct page *page, unsigned long pfn) { - 
return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK); + return get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK); } /** From b5ffd2973365298c4d829802653133038837a6f9 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Fri, 11 Aug 2023 19:59:45 +0800 Subject: [PATCH 349/489] mm/page_alloc: use get_pfnblock_migratetype to avoid extra page_to_pfn We have get_pageblock_migratetype and get_pfnblock_migratetype to get migratetype of page. get_pfnblock_migratetype accepts both page and pfn from caller while get_pageblock_migratetype only accept page and get pfn with page_to_pfn from page. In case we already record pfn of page, we can simply call get_pfnblock_migratetype to avoid a page_to_pfn. Link: https://lkml.kernel.org/r/20230811115945.3423894-3-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Acked-by: Mel Gorman Cc: Baolin Wang Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5e67fe937e1952..986b56db96b5bf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -813,7 +813,7 @@ static inline void __free_one_page(struct page *page, * pageblock isolation could cause incorrect freepage or CMA * accounting or HIGHATOMIC accounting. 
*/ - int buddy_mt = get_pageblock_migratetype(buddy); + int buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn); if (migratetype != buddy_mt && (!migratetype_is_mergeable(migratetype) || @@ -889,7 +889,7 @@ int split_free_page(struct page *free_page, goto out; } - mt = get_pageblock_migratetype(free_page); + mt = get_pfnblock_migratetype(free_page, free_page_pfn); if (likely(!is_migrate_isolate(mt))) __mod_zone_freepage_state(zone, -(1UL << order), mt); From e1dea6d3c68113ac5d15a762e0c93e811e569739 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Mon, 14 Aug 2023 22:11:42 +0000 Subject: [PATCH 350/489] mm/z3fold: remove obsolete comment for struct z3fold_pool Since commit e774a7bc7f0a ("mm: zswap: remove page reclaim logic from z3fold"), zpool and zpool_ops have been removed, so also remove the corresponding comments. Link: https://lkml.kernel.org/r/20230814221142.486548-1-xiujianfeng@huaweicloud.com Signed-off-by: Xiu Jianfeng Reviewed-by: Miaohe Lin Cc: Vitaly Wool Signed-off-by: Andrew Morton --- mm/z3fold.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index 7952adf9bede6f..7c76b396b74cfd 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -133,8 +133,6 @@ struct z3fold_header { * @stale: list of pages marked for freeing * @pages_nr: number of z3fold pages in the pool. * @c_handle: cache for z3fold_buddy_slots allocation - * @zpool: zpool driver - * @zpool_ops: zpool operations structure with an evict callback * @compact_wq: workqueue for page layout background optimization * @release_wq: workqueue for safe page release * @work: work_struct for safe page release From f7bda0d85dd7733143b5ea66987cb9b102bd0189 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:43 -0700 Subject: [PATCH 351/489] mm: add PAGE_TYPE_OP folio functions Patch series "Split ptdesc from struct page", v9. The MM subsystem is trying to shrink struct page. 
This patchset introduces a memory descriptor for page table tracking - struct ptdesc. This patchset introduces ptdesc, splits ptdesc from struct page, and converts many callers of page table constructor/destructors to use ptdescs. Ptdesc is a foundation to further standardize page tables, and eventually allow for dynamic allocation of page tables independent of struct page. However, the use of pages for page table tracking is quite deeply ingrained and varied across architectures, so there is still a lot of work to be done before that can happen. This patch (of 31): No folio equivalents for page type operations have been defined, so define them for later folio conversions. Also changes the Page##uname macros to take in const struct page* since we only read the memory here. Link: https://lkml.kernel.org/r/20230807230513.102486-1-vishal.moola@gmail.com Link: https://lkml.kernel.org/r/20230807230513.102486-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. 
Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Huacai Chen Cc: Hugh Dickins Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Cc: Geert Uytterhoeven Cc: Guo Ren Cc: John Paul Adrian Glaubitz Cc: Palmer Dabbelt Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 92a2063a0a2323..9218028caf337e 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -908,6 +908,8 @@ static inline bool is_page_hwpoison(struct page *page) #define PageType(page, flag) \ ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) +#define folio_test_type(folio, flag) \ + ((folio->page.page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) static inline int page_type_has_type(unsigned int page_type) { @@ -919,27 +921,41 @@ static inline int page_has_type(struct page *page) return page_type_has_type(page->page_type); } -#define PAGE_TYPE_OPS(uname, lname) \ -static __always_inline int Page##uname(struct page *page) \ +#define PAGE_TYPE_OPS(uname, lname, fname) \ +static __always_inline int Page##uname(const struct page *page) \ { \ return PageType(page, PG_##lname); \ } \ +static __always_inline int folio_test_##fname(const struct folio *folio)\ +{ \ + return folio_test_type(folio, PG_##lname); \ +} \ static __always_inline void __SetPage##uname(struct page *page) \ { \ VM_BUG_ON_PAGE(!PageType(page, 0), page); \ page->page_type &= ~PG_##lname; \ } \ +static __always_inline void __folio_set_##fname(struct folio *folio) \ +{ \ + VM_BUG_ON_FOLIO(!folio_test_type(folio, 0), folio); \ + folio->page.page_type &= ~PG_##lname; \ +} \ static __always_inline void __ClearPage##uname(struct page *page) \ { \ VM_BUG_ON_PAGE(!Page##uname(page), page); \ page->page_type |= PG_##lname; \ -} +} \ +static __always_inline void 
__folio_clear_##fname(struct folio *folio) \ +{ \ + VM_BUG_ON_FOLIO(!folio_test_##fname(folio), folio); \ + folio->page.page_type |= PG_##lname; \ +} \ /* * PageBuddy() indicates that the page is free and in the buddy system * (see mm/page_alloc.c). */ -PAGE_TYPE_OPS(Buddy, buddy) +PAGE_TYPE_OPS(Buddy, buddy, buddy) /* * PageOffline() indicates that the page is logically offline although the @@ -963,7 +979,7 @@ PAGE_TYPE_OPS(Buddy, buddy) * pages should check PageOffline() and synchronize with such drivers using * page_offline_freeze()/page_offline_thaw(). */ -PAGE_TYPE_OPS(Offline, offline) +PAGE_TYPE_OPS(Offline, offline, offline) extern void page_offline_freeze(void); extern void page_offline_thaw(void); @@ -973,12 +989,12 @@ extern void page_offline_end(void); /* * Marks pages in use as page tables. */ -PAGE_TYPE_OPS(Table, table) +PAGE_TYPE_OPS(Table, table, pgtable) /* * Marks guardpages used with debug_pagealloc. */ -PAGE_TYPE_OPS(Guard, guard) +PAGE_TYPE_OPS(Guard, guard, guard) extern bool is_free_buddy_page(struct page *page); From 9a35de4ffc209bd7956c4811ad17c4883791db43 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:44 -0700 Subject: [PATCH 352/489] pgtable: create struct ptdesc Currently, page table information is stored within struct page. As part of simplifying struct page, create struct ptdesc for page table information. Link: https://lkml.kernel.org/r/20230807230513.102486-3-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. 
Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 70 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 1fc4b9c2c8a68c..d4ebcefe482e4d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -397,6 +397,76 @@ FOLIO_MATCH(flags, _flags_2); FOLIO_MATCH(compound_head, _head_2); #undef FOLIO_MATCH +/** + * struct ptdesc - Memory descriptor for page tables. + * @__page_flags: Same as page flags. Unused for page tables. + * @pt_rcu_head: For freeing page table pages. + * @pt_list: List of used page tables. Used for s390 and x86. + * @_pt_pad_1: Padding that aliases with page's compound head. + * @pmd_huge_pte: Protected by ptdesc->ptl, used for THPs. + * @__page_mapping: Aliases with page->mapping. Unused for page tables. + * @pt_mm: Used for x86 pgds. + * @pt_frag_refcount: For fragmented page table tracking. Powerpc and s390 only. + * @_pt_pad_2: Padding to ensure proper alignment. + * @ptl: Lock for the page table. + * @__page_type: Same as page->page_type. Unused for page tables. + * @_refcount: Same as page refcount. Used for s390 page tables. + * @pt_memcg_data: Memcg data. Tracked for page tables here. + * + * This struct overlays struct page for now. Do not modify without a good + * understanding of the issues. 
+ */ +struct ptdesc { + unsigned long __page_flags; + + union { + struct rcu_head pt_rcu_head; + struct list_head pt_list; + struct { + unsigned long _pt_pad_1; + pgtable_t pmd_huge_pte; + }; + }; + unsigned long __page_mapping; + + union { + struct mm_struct *pt_mm; + atomic_t pt_frag_refcount; + }; + + union { + unsigned long _pt_pad_2; +#if ALLOC_SPLIT_PTLOCKS + spinlock_t *ptl; +#else + spinlock_t ptl; +#endif + }; + unsigned int __page_type; + atomic_t _refcount; +#ifdef CONFIG_MEMCG + unsigned long pt_memcg_data; +#endif +}; + +#define TABLE_MATCH(pg, pt) \ + static_assert(offsetof(struct page, pg) == offsetof(struct ptdesc, pt)) +TABLE_MATCH(flags, __page_flags); +TABLE_MATCH(compound_head, pt_list); +TABLE_MATCH(compound_head, _pt_pad_1); +TABLE_MATCH(pmd_huge_pte, pmd_huge_pte); +TABLE_MATCH(mapping, __page_mapping); +TABLE_MATCH(pt_mm, pt_mm); +TABLE_MATCH(ptl, ptl); +TABLE_MATCH(rcu_head, pt_rcu_head); +TABLE_MATCH(page_type, __page_type); +TABLE_MATCH(_refcount, _refcount); +#ifdef CONFIG_MEMCG +TABLE_MATCH(memcg_data, pt_memcg_data); +#endif +#undef TABLE_MATCH +static_assert(sizeof(struct ptdesc) <= sizeof(struct page)); + /* * Used for sizing the vmemmap region on some architectures */ From bf2d4334f72e4e033166c5a3bf1331a7238eab9d Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:45 -0700 Subject: [PATCH 353/489] mm: add utility functions for ptdesc Introduce utility functions setting the foundation for ptdescs. These will also assist in the splitting out of ptdesc from struct page. Functions that focus on the descriptor are prefixed with ptdesc_* while functions that focus on the pagetable are prefixed with pagetable_*. pagetable_alloc() is defined to allocate new ptdesc pages as compound pages. This is to standardize ptdescs by allowing for one allocation and one free function, in contrast to 2 allocation and 2 free functions. 
Link: https://lkml.kernel.org/r/20230807230513.102486-4-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Mike Rapoport (IBM) Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/asm-generic/tlb.h | 11 +++++++ include/linux/mm.h | 61 +++++++++++++++++++++++++++++++++++++++ include/linux/mm_types.h | 12 ++++++++ 3 files changed, 84 insertions(+) diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index bc32a2284c5649..129a3a75997659 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -480,6 +480,17 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) return tlb_remove_page_size(tlb, page, PAGE_SIZE); } +static inline void tlb_remove_ptdesc(struct mmu_gather *tlb, void *pt) +{ + tlb_remove_table(tlb, pt); +} + +/* Like tlb_remove_ptdesc, but for page-like page directories. 
*/ +static inline void tlb_remove_page_ptdesc(struct mmu_gather *tlb, struct ptdesc *pt) +{ + tlb_remove_page(tlb, ptdesc_page(pt)); +} + static inline void tlb_change_page_size(struct mmu_gather *tlb, unsigned int page_size) { diff --git a/include/linux/mm.h b/include/linux/mm.h index c63ec57a54dc4d..f750587833597b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2772,6 +2772,57 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a } #endif /* CONFIG_MMU */ +static inline struct ptdesc *virt_to_ptdesc(const void *x) +{ + return page_ptdesc(virt_to_page(x)); +} + +static inline void *ptdesc_to_virt(const struct ptdesc *pt) +{ + return page_to_virt(ptdesc_page(pt)); +} + +static inline void *ptdesc_address(const struct ptdesc *pt) +{ + return folio_address(ptdesc_folio(pt)); +} + +static inline bool pagetable_is_reserved(struct ptdesc *pt) +{ + return folio_test_reserved(ptdesc_folio(pt)); +} + +/** + * pagetable_alloc - Allocate pagetables + * @gfp: GFP flags + * @order: desired pagetable order + * + * pagetable_alloc allocates memory for page tables as well as a page table + * descriptor to describe that memory. + * + * Return: The ptdesc describing the allocated page tables. + */ +static inline struct ptdesc *pagetable_alloc(gfp_t gfp, unsigned int order) +{ + struct page *page = alloc_pages(gfp | __GFP_COMP, order); + + return page_ptdesc(page); +} + +/** + * pagetable_free - Free pagetables + * @pt: The page table descriptor + * + * pagetable_free frees the memory of all page tables described by a page + * table descriptor and the memory for the descriptor itself. 
+ */ +static inline void pagetable_free(struct ptdesc *pt) +{ + struct page *page = ptdesc_page(pt); + + __free_pages(page, compound_order(page)); +} + #if USE_SPLIT_PTE_PTLOCKS #if ALLOC_SPLIT_PTLOCKS void __init ptlock_cache_init(void); @@ -2898,6 +2949,11 @@ static inline struct page *pmd_pgtable_page(pmd_t *pmd) return virt_to_page((void *)((unsigned long) pmd & mask)); } +static inline struct ptdesc *pmd_ptdesc(pmd_t *pmd) +{ + return page_ptdesc(pmd_pgtable_page(pmd)); +} + static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) { return ptlock_ptr(pmd_pgtable_page(pmd)); @@ -3010,6 +3066,11 @@ static inline void mark_page_reserved(struct page *page) adjust_managed_page_count(page, -1); } +static inline void free_reserved_ptdesc(struct ptdesc *pt) +{ + free_reserved_page(ptdesc_page(pt)); +} + /* * Default method to free all the __init memory into the buddy system. * The freed pages will be poisoned with pattern "poison" if it's within diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d4ebcefe482e4d..c57e940e60d010 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -467,6 +467,18 @@ TABLE_MATCH(memcg_data, pt_memcg_data); #undef TABLE_MATCH static_assert(sizeof(struct ptdesc) <= sizeof(struct page)); +#define ptdesc_page(pt) (_Generic((pt), \ + const struct ptdesc *: (const struct page *)(pt), \ + struct ptdesc *: (struct page *)(pt))) + +#define ptdesc_folio(pt) (_Generic((pt), \ + const struct ptdesc *: (const struct folio *)(pt), \ + struct ptdesc *: (struct folio *)(pt))) + +#define page_ptdesc(p) (_Generic((p), \ + const struct page *: (const struct ptdesc *)(p), \ + struct page *: (struct ptdesc *)(p))) + /* * Used for sizing the vmemmap region on some architectures */ From f8546d8494ca22fe530c8ba79beaf1509b47f6d1 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:46 -0700 Subject: [PATCH 354/489] mm: convert pmd_pgtable_page() callers to use pmd_ptdesc() Converts 
internal pmd_pgtable_page() callers to use pmd_ptdesc(). This removes some direct accesses to struct page, working towards splitting out struct ptdesc from struct page. Link: https://lkml.kernel.org/r/20230807230513.102486-5-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index f750587833597b..6fee233dfccc52 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2956,7 +2956,7 @@ static inline struct ptdesc *pmd_ptdesc(pmd_t *pmd) static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) { - return ptlock_ptr(pmd_pgtable_page(pmd)); + return ptlock_ptr(ptdesc_page(pmd_ptdesc(pmd))); } static inline bool pmd_ptlock_init(struct page *page) @@ -2975,7 +2975,7 @@ static inline void pmd_ptlock_free(struct page *page) ptlock_free(page); } -#define pmd_huge_pte(mm, pmd) (pmd_pgtable_page(pmd)->pmd_huge_pte) +#define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte) #else From f5ecca06b3a5d0371ee27ee08aa06c686407a8af Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:47 -0700 Subject: [PATCH 355/489] mm: convert ptlock_alloc() to use ptdescs This removes some direct accesses to struct page, working towards splitting out struct ptdesc from struct page. 
Link: https://lkml.kernel.org/r/20230807230513.102486-6-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/mm.h | 6 +++--- mm/memory.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 6fee233dfccc52..ccea0665247ca2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2826,7 +2826,7 @@ static inline void pagetable_free(struct ptdesc *pt) #if USE_SPLIT_PTE_PTLOCKS #if ALLOC_SPLIT_PTLOCKS void __init ptlock_cache_init(void); -extern bool ptlock_alloc(struct page *page); +bool ptlock_alloc(struct ptdesc *ptdesc); extern void ptlock_free(struct page *page); static inline spinlock_t *ptlock_ptr(struct page *page) @@ -2838,7 +2838,7 @@ static inline void ptlock_cache_init(void) { } -static inline bool ptlock_alloc(struct page *page) +static inline bool ptlock_alloc(struct ptdesc *ptdesc) { return true; } @@ -2868,7 +2868,7 @@ static inline bool ptlock_init(struct page *page) * slab code uses page->slab_cache, which share storage with page->ptl. 
*/ VM_BUG_ON_PAGE(*(unsigned long *)&page->ptl, page); - if (!ptlock_alloc(page)) + if (!ptlock_alloc(page_ptdesc(page))) return false; spin_lock_init(ptlock_ptr(page)); return true; diff --git a/mm/memory.c b/mm/memory.c index 039dcbbcc7d2ea..b9ba7e99534dde 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6114,14 +6114,14 @@ void __init ptlock_cache_init(void) SLAB_PANIC, NULL); } -bool ptlock_alloc(struct page *page) +bool ptlock_alloc(struct ptdesc *ptdesc) { spinlock_t *ptl; ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL); if (!ptl) return false; - page->ptl = ptl; + ptdesc->ptl = ptl; return true; } From 1865484af6b2ced2f367c1042b5f3816c11db148 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:48 -0700 Subject: [PATCH 356/489] mm: convert ptlock_ptr() to use ptdescs This removes some direct accesses to struct page, working towards splitting out struct ptdesc from struct page. Link: https://lkml.kernel.org/r/20230807230513.102486-7-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. 
Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/x86/xen/mmu_pv.c | 2 +- include/linux/mm.h | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index e0a975165de786..8796ec310483cf 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -667,7 +667,7 @@ static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm) spinlock_t *ptl = NULL; #if USE_SPLIT_PTE_PTLOCKS - ptl = ptlock_ptr(page); + ptl = ptlock_ptr(page_ptdesc(page)); spin_lock_nest_lock(ptl, &mm->page_table_lock); #endif diff --git a/include/linux/mm.h b/include/linux/mm.h index ccea0665247ca2..7860529737d419 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2829,9 +2829,9 @@ void __init ptlock_cache_init(void); bool ptlock_alloc(struct ptdesc *ptdesc); extern void ptlock_free(struct page *page); -static inline spinlock_t *ptlock_ptr(struct page *page) +static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc) { - return page->ptl; + return ptdesc->ptl; } #else /* ALLOC_SPLIT_PTLOCKS */ static inline void ptlock_cache_init(void) @@ -2847,15 +2847,15 @@ static inline void ptlock_free(struct page *page) { } -static inline spinlock_t *ptlock_ptr(struct page *page) +static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc) { - return &page->ptl; + return &ptdesc->ptl; } #endif /* ALLOC_SPLIT_PTLOCKS */ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) { - return ptlock_ptr(pmd_page(*pmd)); + return ptlock_ptr(page_ptdesc(pmd_page(*pmd))); } static inline bool ptlock_init(struct page *page) @@ -2870,7 +2870,7 @@ static inline bool ptlock_init(struct page *page) VM_BUG_ON_PAGE(*(unsigned long *)&page->ptl, page); 
if (!ptlock_alloc(page_ptdesc(page))) return false; - spin_lock_init(ptlock_ptr(page)); + spin_lock_init(ptlock_ptr(page_ptdesc(page))); return true; } @@ -2956,7 +2956,7 @@ static inline struct ptdesc *pmd_ptdesc(pmd_t *pmd) static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) { - return ptlock_ptr(ptdesc_page(pmd_ptdesc(pmd))); + return ptlock_ptr(pmd_ptdesc(pmd)); } static inline bool pmd_ptlock_init(struct page *page) From edbaefe53c6418ea11372e6b3ce952ab8caa1f78 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:49 -0700 Subject: [PATCH 357/489] mm: convert pmd_ptlock_init() to use ptdescs This removes some direct accesses to struct page, working towards splitting out struct ptdesc from struct page. Link: https://lkml.kernel.org/r/20230807230513.102486-8-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. 
Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/mm.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 7860529737d419..d70366169c3d62 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2959,12 +2959,12 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) return ptlock_ptr(pmd_ptdesc(pmd)); } -static inline bool pmd_ptlock_init(struct page *page) +static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - page->pmd_huge_pte = NULL; + ptdesc->pmd_huge_pte = NULL; #endif - return ptlock_init(page); + return ptlock_init(ptdesc_page(ptdesc)); } static inline void pmd_ptlock_free(struct page *page) @@ -2984,7 +2984,7 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) return &mm->page_table_lock; } -static inline bool pmd_ptlock_init(struct page *page) { return true; } +static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void pmd_ptlock_free(struct page *page) {} #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte) @@ -3000,7 +3000,7 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd) static inline bool pgtable_pmd_page_ctor(struct page *page) { - if (!pmd_ptlock_init(page)) + if (!pmd_ptlock_init(page_ptdesc(page))) return false; __SetPageTable(page); inc_lruvec_page_state(page, NR_PAGETABLE); From 75b25d49ca6638f9a4eac47cff508b174743d907 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:50 -0700 Subject: [PATCH 358/489] mm: convert ptlock_init() to use ptdescs This removes some direct accesses to struct page, working towards splitting out 
struct ptdesc from struct page. Link: https://lkml.kernel.org/r/20230807230513.102486-9-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/mm.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index d70366169c3d62..e53ef2eb95bbca 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2858,7 +2858,7 @@ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) return ptlock_ptr(page_ptdesc(pmd_page(*pmd))); } -static inline bool ptlock_init(struct page *page) +static inline bool ptlock_init(struct ptdesc *ptdesc) { /* * prep_new_page() initialize page->private (and therefore page->ptl) @@ -2867,10 +2867,10 @@ static inline bool ptlock_init(struct page *page) * It can happen if arch try to use slab for page table allocation: * slab code uses page->slab_cache, which share storage with page->ptl. 
*/ - VM_BUG_ON_PAGE(*(unsigned long *)&page->ptl, page); - if (!ptlock_alloc(page_ptdesc(page))) + VM_BUG_ON_PAGE(*(unsigned long *)&ptdesc->ptl, ptdesc_page(ptdesc)); + if (!ptlock_alloc(ptdesc)) return false; - spin_lock_init(ptlock_ptr(page_ptdesc(page))); + spin_lock_init(ptlock_ptr(ptdesc)); return true; } @@ -2883,13 +2883,13 @@ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) return &mm->page_table_lock; } static inline void ptlock_cache_init(void) {} -static inline bool ptlock_init(struct page *page) { return true; } +static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct page *page) {} #endif /* USE_SPLIT_PTE_PTLOCKS */ static inline bool pgtable_pte_page_ctor(struct page *page) { - if (!ptlock_init(page)) + if (!ptlock_init(page_ptdesc(page))) return false; __SetPageTable(page); inc_lruvec_page_state(page, NR_PAGETABLE); @@ -2964,7 +2964,7 @@ static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) #ifdef CONFIG_TRANSPARENT_HUGEPAGE ptdesc->pmd_huge_pte = NULL; #endif - return ptlock_init(ptdesc_page(ptdesc)); + return ptlock_init(ptdesc); } static inline void pmd_ptlock_free(struct page *page) From 7e5f42ae3413785c68c383acb787f9ce8f243096 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:51 -0700 Subject: [PATCH 359/489] mm: convert pmd_ptlock_free() to use ptdescs This removes some direct accesses to struct page, working towards splitting out struct ptdesc from struct page. Link: https://lkml.kernel.org/r/20230807230513.102486-10-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. 
Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/mm.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index e53ef2eb95bbca..8884c700dfc62f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2967,12 +2967,12 @@ static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) return ptlock_init(ptdesc); } -static inline void pmd_ptlock_free(struct page *page) +static inline void pmd_ptlock_free(struct ptdesc *ptdesc) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE - VM_BUG_ON_PAGE(page->pmd_huge_pte, page); + VM_BUG_ON_PAGE(ptdesc->pmd_huge_pte, ptdesc_page(ptdesc)); #endif - ptlock_free(page); + ptlock_free(ptdesc_page(ptdesc)); } #define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte) @@ -2985,7 +2985,7 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) } static inline bool pmd_ptlock_init(struct ptdesc *ptdesc) { return true; } -static inline void pmd_ptlock_free(struct page *page) {} +static inline void pmd_ptlock_free(struct ptdesc *ptdesc) {} #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte) @@ -3009,7 +3009,7 @@ static inline bool pgtable_pmd_page_ctor(struct page *page) static inline void pgtable_pmd_page_dtor(struct page *page) { - pmd_ptlock_free(page); + pmd_ptlock_free(page_ptdesc(page)); __ClearPageTable(page); dec_lruvec_page_state(page, NR_PAGETABLE); } From 6ed1b8a09deb0b99fd3b54e11535c80284689555 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:52 -0700 Subject: [PATCH 360/489] mm: convert ptlock_free() to use ptdescs This removes some direct accesses to struct page, working towards splitting out struct ptdesc from struct page. 
Link: https://lkml.kernel.org/r/20230807230513.102486-11-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/mm.h | 10 +++++----- mm/memory.c | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 8884c700dfc62f..d0fb31bcd48265 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2827,7 +2827,7 @@ static inline void pagetable_free(struct ptdesc *pt) #if ALLOC_SPLIT_PTLOCKS void __init ptlock_cache_init(void); bool ptlock_alloc(struct ptdesc *ptdesc); -extern void ptlock_free(struct page *page); +void ptlock_free(struct ptdesc *ptdesc); static inline spinlock_t *ptlock_ptr(struct ptdesc *ptdesc) { @@ -2843,7 +2843,7 @@ static inline bool ptlock_alloc(struct ptdesc *ptdesc) return true; } -static inline void ptlock_free(struct page *page) +static inline void ptlock_free(struct ptdesc *ptdesc) { } @@ -2884,7 +2884,7 @@ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) } static inline void ptlock_cache_init(void) {} static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } -static inline void ptlock_free(struct page *page) {} +static inline void ptlock_free(struct ptdesc *ptdesc) {} #endif /* USE_SPLIT_PTE_PTLOCKS */ static inline bool pgtable_pte_page_ctor(struct page *page) @@ -2898,7 +2898,7 @@ static inline bool pgtable_pte_page_ctor(struct page *page) static inline void pgtable_pte_page_dtor(struct page *page) { - ptlock_free(page); + ptlock_free(page_ptdesc(page)); 
__ClearPageTable(page); dec_lruvec_page_state(page, NR_PAGETABLE); } @@ -2972,7 +2972,7 @@ static inline void pmd_ptlock_free(struct ptdesc *ptdesc) #ifdef CONFIG_TRANSPARENT_HUGEPAGE VM_BUG_ON_PAGE(ptdesc->pmd_huge_pte, ptdesc_page(ptdesc)); #endif - ptlock_free(ptdesc_page(ptdesc)); + ptlock_free(ptdesc); } #define pmd_huge_pte(mm, pmd) (pmd_ptdesc(pmd)->pmd_huge_pte) diff --git a/mm/memory.c b/mm/memory.c index b9ba7e99534dde..4a7c8be9fe71f2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6125,8 +6125,8 @@ bool ptlock_alloc(struct ptdesc *ptdesc) return true; } -void ptlock_free(struct page *page) +void ptlock_free(struct ptdesc *ptdesc) { - kmem_cache_free(page_ptl_cachep, page->ptl); + kmem_cache_free(page_ptl_cachep, ptdesc->ptl); } #endif From 7e11dca14b27e11596a0c49c7d20bc7816cb0508 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:53 -0700 Subject: [PATCH 361/489] mm: create ptdesc equivalents for pgtable_{pte,pmd}_page_{ctor,dtor} Create pagetable_pte_ctor(), pagetable_pmd_ctor(), pagetable_pte_dtor(), and pagetable_pmd_dtor() and make the original pgtable constructor/destructors wrappers. Link: https://lkml.kernel.org/r/20230807230513.102486-12-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. 
Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/mm.h | 56 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 42 insertions(+), 14 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index d0fb31bcd48265..6fdc294ada0d8a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2887,20 +2887,34 @@ static inline bool ptlock_init(struct ptdesc *ptdesc) { return true; } static inline void ptlock_free(struct ptdesc *ptdesc) {} #endif /* USE_SPLIT_PTE_PTLOCKS */ -static inline bool pgtable_pte_page_ctor(struct page *page) +static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc) { - if (!ptlock_init(page_ptdesc(page))) + struct folio *folio = ptdesc_folio(ptdesc); + + if (!ptlock_init(ptdesc)) return false; - __SetPageTable(page); - inc_lruvec_page_state(page, NR_PAGETABLE); + __folio_set_pgtable(folio); + lruvec_stat_add_folio(folio, NR_PAGETABLE); return true; } +static inline bool pgtable_pte_page_ctor(struct page *page) +{ + return pagetable_pte_ctor(page_ptdesc(page)); +} + +static inline void pagetable_pte_dtor(struct ptdesc *ptdesc) +{ + struct folio *folio = ptdesc_folio(ptdesc); + + ptlock_free(ptdesc); + __folio_clear_pgtable(folio); + lruvec_stat_sub_folio(folio, NR_PAGETABLE); +} + static inline void pgtable_pte_page_dtor(struct page *page) { - ptlock_free(page_ptdesc(page)); - __ClearPageTable(page); - dec_lruvec_page_state(page, NR_PAGETABLE); + pagetable_pte_dtor(page_ptdesc(page)); } pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp); @@ -2998,20 +3012,34 @@ static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd) return ptl; } -static inline bool pgtable_pmd_page_ctor(struct page *page) +static inline bool 
pagetable_pmd_ctor(struct ptdesc *ptdesc) { - if (!pmd_ptlock_init(page_ptdesc(page))) + struct folio *folio = ptdesc_folio(ptdesc); + + if (!pmd_ptlock_init(ptdesc)) return false; - __SetPageTable(page); - inc_lruvec_page_state(page, NR_PAGETABLE); + __folio_set_pgtable(folio); + lruvec_stat_add_folio(folio, NR_PAGETABLE); return true; } +static inline bool pgtable_pmd_page_ctor(struct page *page) +{ + return pagetable_pmd_ctor(page_ptdesc(page)); +} + +static inline void pagetable_pmd_dtor(struct ptdesc *ptdesc) +{ + struct folio *folio = ptdesc_folio(ptdesc); + + pmd_ptlock_free(ptdesc); + __folio_clear_pgtable(folio); + lruvec_stat_sub_folio(folio, NR_PAGETABLE); +} + static inline void pgtable_pmd_page_dtor(struct page *page) { - pmd_ptlock_free(page_ptdesc(page)); - __ClearPageTable(page); - dec_lruvec_page_state(page, NR_PAGETABLE); + pagetable_pmd_dtor(page_ptdesc(page)); } /* From 4eaca96140b33e2d1fad761d1468c8293859da11 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:54 -0700 Subject: [PATCH 362/489] powerpc: convert various functions to use ptdescs In order to split struct ptdesc from struct page, convert various functions to use ptdescs. Link: https://lkml.kernel.org/r/20230807230513.102486-13-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. 
Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/powerpc/mm/book3s64/mmu_context.c | 10 ++--- arch/powerpc/mm/book3s64/pgtable.c | 32 +++++++------- arch/powerpc/mm/pgtable-frag.c | 58 +++++++++++++------------- 3 files changed, 50 insertions(+), 50 deletions(-) diff --git a/arch/powerpc/mm/book3s64/mmu_context.c b/arch/powerpc/mm/book3s64/mmu_context.c index c766e4c26e42dd..1715b07c630c98 100644 --- a/arch/powerpc/mm/book3s64/mmu_context.c +++ b/arch/powerpc/mm/book3s64/mmu_context.c @@ -246,15 +246,15 @@ static void destroy_contexts(mm_context_t *ctx) static void pmd_frag_destroy(void *pmd_frag) { int count; - struct page *page; + struct ptdesc *ptdesc; - page = virt_to_page(pmd_frag); + ptdesc = virt_to_ptdesc(pmd_frag); /* drop all the pending references */ count = ((unsigned long)pmd_frag & ~PAGE_MASK) >> PMD_FRAG_SIZE_SHIFT; /* We allow PTE_FRAG_NR fragments from a PTE page */ - if (atomic_sub_and_test(PMD_FRAG_NR - count, &page->pt_frag_refcount)) { - pgtable_pmd_page_dtor(page); - __free_page(page); + if (atomic_sub_and_test(PMD_FRAG_NR - count, &ptdesc->pt_frag_refcount)) { + pagetable_pmd_dtor(ptdesc); + pagetable_free(ptdesc); } } diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 75b938268b0409..1498ccd08367dc 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -384,22 +384,22 @@ static pmd_t *get_pmd_from_cache(struct mm_struct *mm) static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm) { void *ret = NULL; - struct page *page; + struct ptdesc *ptdesc; gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO; if (mm == &init_mm) gfp &= ~__GFP_ACCOUNT; - page = alloc_page(gfp); - if (!page) + ptdesc = 
pagetable_alloc(gfp, 0); + if (!ptdesc) return NULL; - if (!pgtable_pmd_page_ctor(page)) { - __free_pages(page, 0); + if (!pagetable_pmd_ctor(ptdesc)) { + pagetable_free(ptdesc); return NULL; } - atomic_set(&page->pt_frag_refcount, 1); + atomic_set(&ptdesc->pt_frag_refcount, 1); - ret = page_address(page); + ret = ptdesc_address(ptdesc); /* * if we support only one fragment just return the * allocated page. @@ -409,12 +409,12 @@ static pmd_t *__alloc_for_pmdcache(struct mm_struct *mm) spin_lock(&mm->page_table_lock); /* - * If we find pgtable_page set, we return + * If we find ptdesc_page set, we return * the allocated page with single fragment * count. */ if (likely(!mm->context.pmd_frag)) { - atomic_set(&page->pt_frag_refcount, PMD_FRAG_NR); + atomic_set(&ptdesc->pt_frag_refcount, PMD_FRAG_NR); mm->context.pmd_frag = ret + PMD_FRAG_SIZE; } spin_unlock(&mm->page_table_lock); @@ -435,15 +435,15 @@ pmd_t *pmd_fragment_alloc(struct mm_struct *mm, unsigned long vmaddr) void pmd_fragment_free(unsigned long *pmd) { - struct page *page = virt_to_page(pmd); + struct ptdesc *ptdesc = virt_to_ptdesc(pmd); - if (PageReserved(page)) - return free_reserved_page(page); + if (pagetable_is_reserved(ptdesc)) + return free_reserved_ptdesc(ptdesc); - BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0); - if (atomic_dec_and_test(&page->pt_frag_refcount)) { - pgtable_pmd_page_dtor(page); - __free_page(page); + BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0); + if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) { + pagetable_pmd_dtor(ptdesc); + pagetable_free(ptdesc); } } diff --git a/arch/powerpc/mm/pgtable-frag.c b/arch/powerpc/mm/pgtable-frag.c index 0c6b681300256c..8c31802f97e82e 100644 --- a/arch/powerpc/mm/pgtable-frag.c +++ b/arch/powerpc/mm/pgtable-frag.c @@ -18,15 +18,15 @@ void pte_frag_destroy(void *pte_frag) { int count; - struct page *page; + struct ptdesc *ptdesc; - page = virt_to_page(pte_frag); + ptdesc = virt_to_ptdesc(pte_frag); /* drop all the pending references 
*/ count = ((unsigned long)pte_frag & ~PAGE_MASK) >> PTE_FRAG_SIZE_SHIFT; /* We allow PTE_FRAG_NR fragments from a PTE page */ - if (atomic_sub_and_test(PTE_FRAG_NR - count, &page->pt_frag_refcount)) { - pgtable_pte_page_dtor(page); - __free_page(page); + if (atomic_sub_and_test(PTE_FRAG_NR - count, &ptdesc->pt_frag_refcount)) { + pagetable_pte_dtor(ptdesc); + pagetable_free(ptdesc); } } @@ -55,25 +55,25 @@ static pte_t *get_pte_from_cache(struct mm_struct *mm) static pte_t *__alloc_for_ptecache(struct mm_struct *mm, int kernel) { void *ret = NULL; - struct page *page; + struct ptdesc *ptdesc; if (!kernel) { - page = alloc_page(PGALLOC_GFP | __GFP_ACCOUNT); - if (!page) + ptdesc = pagetable_alloc(PGALLOC_GFP | __GFP_ACCOUNT, 0); + if (!ptdesc) return NULL; - if (!pgtable_pte_page_ctor(page)) { - __free_page(page); + if (!pagetable_pte_ctor(ptdesc)) { + pagetable_free(ptdesc); return NULL; } } else { - page = alloc_page(PGALLOC_GFP); - if (!page) + ptdesc = pagetable_alloc(PGALLOC_GFP, 0); + if (!ptdesc) return NULL; } - atomic_set(&page->pt_frag_refcount, 1); + atomic_set(&ptdesc->pt_frag_refcount, 1); - ret = page_address(page); + ret = ptdesc_address(ptdesc); /* * if we support only one fragment just return the * allocated page. @@ -82,12 +82,12 @@ static pte_t *__alloc_for_ptecache(struct mm_struct *mm, int kernel) return ret; spin_lock(&mm->page_table_lock); /* - * If we find pgtable_page set, we return + * If we find ptdesc_page set, we return * the allocated page with single fragment * count. 
*/ if (likely(!pte_frag_get(&mm->context))) { - atomic_set(&page->pt_frag_refcount, PTE_FRAG_NR); + atomic_set(&ptdesc->pt_frag_refcount, PTE_FRAG_NR); pte_frag_set(&mm->context, ret + PTE_FRAG_SIZE); } spin_unlock(&mm->page_table_lock); @@ -108,28 +108,28 @@ pte_t *pte_fragment_alloc(struct mm_struct *mm, int kernel) static void pte_free_now(struct rcu_head *head) { - struct page *page; + struct ptdesc *ptdesc; - page = container_of(head, struct page, rcu_head); - pgtable_pte_page_dtor(page); - __free_page(page); + ptdesc = container_of(head, struct ptdesc, pt_rcu_head); + pagetable_pte_dtor(ptdesc); + pagetable_free(ptdesc); } void pte_fragment_free(unsigned long *table, int kernel) { - struct page *page = virt_to_page(table); + struct ptdesc *ptdesc = virt_to_ptdesc(table); - if (PageReserved(page)) - return free_reserved_page(page); + if (pagetable_is_reserved(ptdesc)) + return free_reserved_ptdesc(ptdesc); - BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0); - if (atomic_dec_and_test(&page->pt_frag_refcount)) { + BUG_ON(atomic_read(&ptdesc->pt_frag_refcount) <= 0); + if (atomic_dec_and_test(&ptdesc->pt_frag_refcount)) { if (kernel) - __free_page(page); - else if (TestClearPageActive(page)) - call_rcu(&page->rcu_head, pte_free_now); + pagetable_free(ptdesc); + else if (folio_test_clear_active(ptdesc_folio(ptdesc))) + call_rcu(&ptdesc->pt_rcu_head, pte_free_now); else - pte_free_now(&page->rcu_head); + pte_free_now(&ptdesc->pt_rcu_head); } } From f92c494f420a9fdb253861089f615e6e977aa62d Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:55 -0700 Subject: [PATCH 363/489] x86: convert various functions to use ptdescs In order to split struct ptdesc from struct page, convert various functions to use ptdescs. Some of the functions use the *get*page*() helper functions. Convert these to use pagetable_alloc() and ptdesc_address() instead to help standardize page tables further. 
Link: https://lkml.kernel.org/r/20230807230513.102486-14-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Mike Rapoport (IBM) Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/x86/mm/pgtable.c | 47 ++++++++++++++++++++++++++----------------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 15a8009a4480a9..d3a93e8766eeae 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -52,7 +52,7 @@ early_param("userpte", setup_userpte); void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { - pgtable_pte_page_dtor(pte); + pagetable_pte_dtor(page_ptdesc(pte)); paravirt_release_pte(page_to_pfn(pte)); paravirt_tlb_remove_table(tlb, pte); } @@ -60,7 +60,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) #if CONFIG_PGTABLE_LEVELS > 2 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) { - struct page *page = virt_to_page(pmd); + struct ptdesc *ptdesc = virt_to_ptdesc(pmd); paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); /* * NOTE! 
For PAE, any changes to the top page-directory-pointer-table @@ -69,8 +69,8 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) #ifdef CONFIG_X86_PAE tlb->need_flush_all = 1; #endif - pgtable_pmd_page_dtor(page); - paravirt_tlb_remove_table(tlb, page); + pagetable_pmd_dtor(ptdesc); + paravirt_tlb_remove_table(tlb, ptdesc_page(ptdesc)); } #if CONFIG_PGTABLE_LEVELS > 3 @@ -92,16 +92,16 @@ void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) static inline void pgd_list_add(pgd_t *pgd) { - struct page *page = virt_to_page(pgd); + struct ptdesc *ptdesc = virt_to_ptdesc(pgd); - list_add(&page->lru, &pgd_list); + list_add(&ptdesc->pt_list, &pgd_list); } static inline void pgd_list_del(pgd_t *pgd) { - struct page *page = virt_to_page(pgd); + struct ptdesc *ptdesc = virt_to_ptdesc(pgd); - list_del(&page->lru); + list_del(&ptdesc->pt_list); } #define UNSHARED_PTRS_PER_PGD \ @@ -112,12 +112,12 @@ static inline void pgd_list_del(pgd_t *pgd) static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) { - virt_to_page(pgd)->pt_mm = mm; + virt_to_ptdesc(pgd)->pt_mm = mm; } struct mm_struct *pgd_page_get_mm(struct page *page) { - return page->pt_mm; + return page_ptdesc(page)->pt_mm; } static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) @@ -213,11 +213,14 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) static void free_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) { int i; + struct ptdesc *ptdesc; for (i = 0; i < count; i++) if (pmds[i]) { - pgtable_pmd_page_dtor(virt_to_page(pmds[i])); - free_page((unsigned long)pmds[i]); + ptdesc = virt_to_ptdesc(pmds[i]); + + pagetable_pmd_dtor(ptdesc); + pagetable_free(ptdesc); mm_dec_nr_pmds(mm); } } @@ -230,18 +233,24 @@ static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[], int count) if (mm == &init_mm) gfp &= ~__GFP_ACCOUNT; + gfp &= ~__GFP_HIGHMEM; for (i = 0; i < count; i++) { - pmd_t *pmd = (pmd_t *)__get_free_page(gfp); - if (!pmd) + pmd_t *pmd = NULL; + struct ptdesc *ptdesc = 
pagetable_alloc(gfp, 0); + + if (!ptdesc) failed = true; - if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) { - free_page((unsigned long)pmd); - pmd = NULL; + if (ptdesc && !pagetable_pmd_ctor(ptdesc)) { + pagetable_free(ptdesc); + ptdesc = NULL; failed = true; } - if (pmd) + if (ptdesc) { mm_inc_nr_pmds(mm); + pmd = ptdesc_address(ptdesc); + } + pmds[i] = pmd; } @@ -830,7 +839,7 @@ int pud_free_pmd_page(pud_t *pud, unsigned long addr) free_page((unsigned long)pmd_sv); - pgtable_pmd_page_dtor(virt_to_page(pmd)); + pagetable_pmd_dtor(virt_to_ptdesc(pmd)); free_page((unsigned long)pmd); return 1; From 6326c26c1514757242829b292b26eac589013200 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:56 -0700 Subject: [PATCH 364/489] s390: convert various pgalloc functions to use ptdescs As part of the conversions to replace pgtable constructor/destructors with ptdesc equivalents, convert various page table functions to use ptdescs. Some of the functions use the *get*page*() helper functions. Convert these to use pagetable_alloc() and ptdesc_address() instead to help standardize page tables further. Link: https://lkml.kernel.org/r/20230807230513.102486-15-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. 
Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/s390/include/asm/pgalloc.h | 4 +- arch/s390/include/asm/tlb.h | 4 +- arch/s390/mm/pgalloc.c | 128 ++++++++++++++++---------------- 3 files changed, 69 insertions(+), 67 deletions(-) diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index 89a9d5ef94f866..376b4b23bdaa34 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -86,7 +86,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long vmaddr) if (!table) return NULL; crst_table_init(table, _SEGMENT_ENTRY_EMPTY); - if (!pgtable_pmd_page_ctor(virt_to_page(table))) { + if (!pagetable_pmd_ctor(virt_to_ptdesc(table))) { crst_table_free(mm, table); return NULL; } @@ -97,7 +97,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { if (mm_pmd_folded(mm)) return; - pgtable_pmd_page_dtor(virt_to_page(pmd)); + pagetable_pmd_dtor(virt_to_ptdesc(pmd)); crst_table_free(mm, (unsigned long *) pmd); } diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index b91f4a9b044cd9..383b1f91442c99 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -89,12 +89,12 @@ static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, { if (mm_pmd_folded(tlb->mm)) return; - pgtable_pmd_page_dtor(virt_to_page(pmd)); + pagetable_pmd_dtor(virt_to_ptdesc(pmd)); __tlb_adjust_range(tlb, address, PAGE_SIZE); tlb->mm->context.flush_mm = 1; tlb->freed_tables = 1; tlb->cleared_puds = 1; - tlb_remove_table(tlb, pmd); + tlb_remove_ptdesc(tlb, pmd); } /* diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index d7374add78209e..07fc660a24aa2f 100644 --- a/arch/s390/mm/pgalloc.c +++ 
b/arch/s390/mm/pgalloc.c @@ -43,17 +43,17 @@ __initcall(page_table_register_sysctl); unsigned long *crst_table_alloc(struct mm_struct *mm) { - struct page *page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER); + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL, CRST_ALLOC_ORDER); - if (!page) + if (!ptdesc) return NULL; - arch_set_page_dat(page, CRST_ALLOC_ORDER); - return (unsigned long *) page_to_virt(page); + arch_set_page_dat(ptdesc_page(ptdesc), CRST_ALLOC_ORDER); + return (unsigned long *) ptdesc_to_virt(ptdesc); } void crst_table_free(struct mm_struct *mm, unsigned long *table) { - free_pages((unsigned long)table, CRST_ALLOC_ORDER); + pagetable_free(virt_to_ptdesc(table)); } static void __crst_table_upgrade(void *arg) @@ -140,21 +140,21 @@ static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits) struct page *page_table_alloc_pgste(struct mm_struct *mm) { - struct page *page; + struct ptdesc *ptdesc; u64 *table; - page = alloc_page(GFP_KERNEL); - if (page) { - table = (u64 *)page_to_virt(page); + ptdesc = pagetable_alloc(GFP_KERNEL, 0); + if (ptdesc) { + table = (u64 *)ptdesc_to_virt(ptdesc); memset64(table, _PAGE_INVALID, PTRS_PER_PTE); memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE); } - return page; + return ptdesc_page(ptdesc); } void page_table_free_pgste(struct page *page) { - __free_page(page); + pagetable_free(page_ptdesc(page)); } #endif /* CONFIG_PGSTE */ @@ -242,7 +242,7 @@ void page_table_free_pgste(struct page *page) unsigned long *page_table_alloc(struct mm_struct *mm) { unsigned long *table; - struct page *page; + struct ptdesc *ptdesc; unsigned int mask, bit; /* Try to get a fragment of a 4K page as a 2K page table */ @@ -250,9 +250,9 @@ unsigned long *page_table_alloc(struct mm_struct *mm) table = NULL; spin_lock_bh(&mm->context.lock); if (!list_empty(&mm->context.pgtable_list)) { - page = list_first_entry(&mm->context.pgtable_list, - struct page, lru); - mask = atomic_read(&page->_refcount) >> 24; + ptdesc = 
list_first_entry(&mm->context.pgtable_list, + struct ptdesc, pt_list); + mask = atomic_read(&ptdesc->_refcount) >> 24; /* * The pending removal bits must also be checked. * Failure to do so might lead to an impossible @@ -264,13 +264,13 @@ unsigned long *page_table_alloc(struct mm_struct *mm) */ mask = (mask | (mask >> 4)) & 0x03U; if (mask != 0x03U) { - table = (unsigned long *) page_to_virt(page); + table = (unsigned long *) ptdesc_to_virt(ptdesc); bit = mask & 1; /* =1 -> second 2K */ if (bit) table += PTRS_PER_PTE; - atomic_xor_bits(&page->_refcount, + atomic_xor_bits(&ptdesc->_refcount, 0x01U << (bit + 24)); - list_del_init(&page->lru); + list_del_init(&ptdesc->pt_list); } } spin_unlock_bh(&mm->context.lock); @@ -278,28 +278,28 @@ unsigned long *page_table_alloc(struct mm_struct *mm) return table; } /* Allocate a fresh page */ - page = alloc_page(GFP_KERNEL); - if (!page) + ptdesc = pagetable_alloc(GFP_KERNEL, 0); + if (!ptdesc) return NULL; - if (!pgtable_pte_page_ctor(page)) { - __free_page(page); + if (!pagetable_pte_ctor(ptdesc)) { + pagetable_free(ptdesc); return NULL; } - arch_set_page_dat(page, 0); + arch_set_page_dat(ptdesc_page(ptdesc), 0); /* Initialize page table */ - table = (unsigned long *) page_to_virt(page); + table = (unsigned long *) ptdesc_to_virt(ptdesc); if (mm_alloc_pgste(mm)) { /* Return 4K page table with PGSTEs */ - INIT_LIST_HEAD(&page->lru); - atomic_xor_bits(&page->_refcount, 0x03U << 24); + INIT_LIST_HEAD(&ptdesc->pt_list); + atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24); memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); } else { /* Return the first 2K fragment of the page */ - atomic_xor_bits(&page->_refcount, 0x01U << 24); + atomic_xor_bits(&ptdesc->_refcount, 0x01U << 24); memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE); spin_lock_bh(&mm->context.lock); - list_add(&page->lru, &mm->context.pgtable_list); + list_add(&ptdesc->pt_list, &mm->context.pgtable_list); 
spin_unlock_bh(&mm->context.lock); } return table; @@ -322,19 +322,18 @@ static void page_table_release_check(struct page *page, void *table, static void pte_free_now(struct rcu_head *head) { - struct page *page; + struct ptdesc *ptdesc; - page = container_of(head, struct page, rcu_head); - pgtable_pte_page_dtor(page); - __free_page(page); + ptdesc = container_of(head, struct ptdesc, pt_rcu_head); + pagetable_pte_dtor(ptdesc); + pagetable_free(ptdesc); } void page_table_free(struct mm_struct *mm, unsigned long *table) { unsigned int mask, bit, half; - struct page *page; + struct ptdesc *ptdesc = virt_to_ptdesc(table); - page = virt_to_page(table); if (!mm_alloc_pgste(mm)) { /* Free 2K page table fragment of a 4K page */ bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); @@ -344,51 +343,50 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) * will happen outside of the critical section from this * function or from __tlb_remove_table() */ - mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); + mask = atomic_xor_bits(&ptdesc->_refcount, 0x11U << (bit + 24)); mask >>= 24; - if ((mask & 0x03U) && !PageActive(page)) { + if ((mask & 0x03U) && !folio_test_active(ptdesc_folio(ptdesc))) { /* * Other half is allocated, and neither half has had * its free deferred: add page to head of list, to make * this freed half available for immediate reuse. */ - list_add(&page->lru, &mm->context.pgtable_list); + list_add(&ptdesc->pt_list, &mm->context.pgtable_list); } else { /* If page is on list, now remove it. 
*/ - list_del_init(&page->lru); + list_del_init(&ptdesc->pt_list); } spin_unlock_bh(&mm->context.lock); - mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24)); + mask = atomic_xor_bits(&ptdesc->_refcount, 0x10U << (bit + 24)); mask >>= 24; if (mask != 0x00U) return; half = 0x01U << bit; } else { half = 0x03U; - mask = atomic_xor_bits(&page->_refcount, 0x03U << 24); + mask = atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24); mask >>= 24; } - page_table_release_check(page, table, half, mask); - if (TestClearPageActive(page)) - call_rcu(&page->rcu_head, pte_free_now); + page_table_release_check(ptdesc_page(ptdesc), table, half, mask); + if (folio_test_clear_active(ptdesc_folio(ptdesc))) + call_rcu(&ptdesc->pt_rcu_head, pte_free_now); else - pte_free_now(&page->rcu_head); + pte_free_now(&ptdesc->pt_rcu_head); } void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, unsigned long vmaddr) { struct mm_struct *mm; - struct page *page; unsigned int bit, mask; + struct ptdesc *ptdesc = virt_to_ptdesc(table); mm = tlb->mm; - page = virt_to_page(table); if (mm_alloc_pgste(mm)) { gmap_unlink(mm, table, vmaddr); table = (unsigned long *) ((unsigned long)table | 0x03U); - tlb_remove_table(tlb, table); + tlb_remove_ptdesc(tlb, table); return; } bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)); @@ -398,19 +396,19 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, * outside of the critical section from __tlb_remove_table() or from * page_table_free() */ - mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); + mask = atomic_xor_bits(&ptdesc->_refcount, 0x11U << (bit + 24)); mask >>= 24; - if ((mask & 0x03U) && !PageActive(page)) { + if ((mask & 0x03U) && !folio_test_active(ptdesc_folio(ptdesc))) { /* * Other half is allocated, and neither half has had * its free deferred: add page to end of list, to make * this freed half available for reuse once its pending * bit has been cleared by 
__tlb_remove_table(). */ - list_add_tail(&page->lru, &mm->context.pgtable_list); + list_add_tail(&ptdesc->pt_list, &mm->context.pgtable_list); } else { /* If page is on list, now remove it. */ - list_del_init(&page->lru); + list_del_init(&ptdesc->pt_list); } spin_unlock_bh(&mm->context.lock); table = (unsigned long *) ((unsigned long) table | (0x01U << bit)); @@ -421,30 +419,30 @@ void __tlb_remove_table(void *_table) { unsigned int mask = (unsigned long) _table & 0x03U, half = mask; void *table = (void *)((unsigned long) _table ^ mask); - struct page *page = virt_to_page(table); + struct ptdesc *ptdesc = virt_to_ptdesc(table); switch (half) { case 0x00U: /* pmd, pud, or p4d */ - free_pages((unsigned long)table, CRST_ALLOC_ORDER); + pagetable_free(ptdesc); return; case 0x01U: /* lower 2K of a 4K page table */ case 0x02U: /* higher 2K of a 4K page table */ - mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24)); + mask = atomic_xor_bits(&ptdesc->_refcount, mask << (4 + 24)); mask >>= 24; if (mask != 0x00U) return; break; case 0x03U: /* 4K page table with pgstes */ - mask = atomic_xor_bits(&page->_refcount, 0x03U << 24); + mask = atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24); mask >>= 24; break; } - page_table_release_check(page, table, half, mask); - if (TestClearPageActive(page)) - call_rcu(&page->rcu_head, pte_free_now); + page_table_release_check(ptdesc_page(ptdesc), table, half, mask); + if (folio_test_clear_active(ptdesc_folio(ptdesc))) + call_rcu(&ptdesc->pt_rcu_head, pte_free_now); else - pte_free_now(&page->rcu_head); + pte_free_now(&ptdesc->pt_rcu_head); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -488,16 +486,20 @@ static void base_pgt_free(unsigned long *table) static unsigned long *base_crst_alloc(unsigned long val) { unsigned long *table; + struct ptdesc *ptdesc; - table = (unsigned long *)__get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER); - if (table) - crst_table_init(table, val); + ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 
CRST_ALLOC_ORDER); + if (!ptdesc) + return NULL; + table = ptdesc_address(ptdesc); + + crst_table_init(table, val); return table; } static void base_crst_free(unsigned long *table) { - free_pages((unsigned long)table, CRST_ALLOC_ORDER); + pagetable_free(virt_to_ptdesc(table)); } #define BASE_ADDR_END_FUNC(NAME, SIZE) \ From 4f054c28f425b2f1623c9fdc2c2f69719a22190b Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:57 -0700 Subject: [PATCH 365/489] mm: remove page table members from struct page The page table members are now split out into their own ptdesc struct. Remove them from struct page. Link: https://lkml.kernel.org/r/20230807230513.102486-16-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c57e940e60d010..369b7fd35d03f4 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -141,24 +141,6 @@ struct page { struct { /* Tail pages of compound page */ unsigned long compound_head; /* Bit zero is set */ }; - struct { /* Page table pages */ - unsigned long _pt_pad_1; /* compound_head */ - pgtable_t pmd_huge_pte; /* protected by page->ptl */ - /* - * A PTE page table page might be freed by use of - * rcu_head: which overlays those two fields above. 
- */ - unsigned long _pt_pad_2; /* mapping */ - union { - struct mm_struct *pt_mm; /* x86 pgds only */ - atomic_t pt_frag_refcount; /* powerpc */ - }; -#if ALLOC_SPLIT_PTLOCKS - spinlock_t *ptl; -#else - spinlock_t ptl; -#endif - }; struct { /* ZONE_DEVICE pages */ /** @pgmap: Points to the hosting device page map. */ struct dev_pagemap *pgmap; @@ -454,10 +436,7 @@ struct ptdesc { TABLE_MATCH(flags, __page_flags); TABLE_MATCH(compound_head, pt_list); TABLE_MATCH(compound_head, _pt_pad_1); -TABLE_MATCH(pmd_huge_pte, pmd_huge_pte); TABLE_MATCH(mapping, __page_mapping); -TABLE_MATCH(pt_mm, pt_mm); -TABLE_MATCH(ptl, ptl); TABLE_MATCH(rcu_head, pt_rcu_head); TABLE_MATCH(page_type, __page_type); TABLE_MATCH(_refcount, _refcount); From c787ae5b391496f4f63bc942c18eb9fdee05741f Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:58 -0700 Subject: [PATCH 366/489] pgalloc: convert various functions to use ptdescs As part of the conversions to replace pgtable constructor/destructors with ptdesc equivalents, convert various page table functions to use ptdescs. Some of the functions use the *get*page*() helper functions. Convert these to use pagetable_alloc() and ptdesc_address() instead to help standardize page tables further. Link: https://lkml.kernel.org/r/20230807230513.102486-17-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. 
Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Mike Rapoport (IBM) Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- include/asm-generic/pgalloc.h | 88 +++++++++++++++++++++-------------- 1 file changed, 52 insertions(+), 36 deletions(-) diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index a7cf825befaef0..c75d4a75384939 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -8,7 +8,7 @@ #define GFP_PGTABLE_USER (GFP_PGTABLE_KERNEL | __GFP_ACCOUNT) /** - * __pte_alloc_one_kernel - allocate a page for PTE-level kernel page table + * __pte_alloc_one_kernel - allocate memory for a PTE-level kernel page table * @mm: the mm_struct of the current context * * This function is intended for architectures that need @@ -18,12 +18,17 @@ */ static inline pte_t *__pte_alloc_one_kernel(struct mm_struct *mm) { - return (pte_t *)__get_free_page(GFP_PGTABLE_KERNEL); + struct ptdesc *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL & + ~__GFP_HIGHMEM, 0); + + if (!ptdesc) + return NULL; + return ptdesc_address(ptdesc); } #ifndef __HAVE_ARCH_PTE_ALLOC_ONE_KERNEL /** - * pte_alloc_one_kernel - allocate a page for PTE-level kernel page table + * pte_alloc_one_kernel - allocate memory for a PTE-level kernel page table * @mm: the mm_struct of the current context * * Return: pointer to the allocated memory or %NULL on error @@ -35,40 +40,40 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) #endif /** - * pte_free_kernel - free PTE-level kernel page table page + * pte_free_kernel - free PTE-level kernel page table memory * @mm: the mm_struct of the current context * @pte: pointer to the memory containing the page table */ static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { - 
free_page((unsigned long)pte); + pagetable_free(virt_to_ptdesc(pte)); } /** - * __pte_alloc_one - allocate a page for PTE-level user page table + * __pte_alloc_one - allocate memory for a PTE-level user page table * @mm: the mm_struct of the current context * @gfp: GFP flags to use for the allocation * - * Allocates a page and runs the pgtable_pte_page_ctor(). + * Allocate memory for a page table and ptdesc and runs pagetable_pte_ctor(). * * This function is intended for architectures that need * anything beyond simple page allocation or must have custom GFP flags. * - * Return: `struct page` initialized as page table or %NULL on error + * Return: `struct page` referencing the ptdesc or %NULL on error */ static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp) { - struct page *pte; + struct ptdesc *ptdesc; - pte = alloc_page(gfp); - if (!pte) + ptdesc = pagetable_alloc(gfp, 0); + if (!ptdesc) return NULL; - if (!pgtable_pte_page_ctor(pte)) { - __free_page(pte); + if (!pagetable_pte_ctor(ptdesc)) { + pagetable_free(ptdesc); return NULL; } - return pte; + return ptdesc_page(ptdesc); } #ifndef __HAVE_ARCH_PTE_ALLOC_ONE @@ -76,9 +81,9 @@ static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp) * pte_alloc_one - allocate a page for PTE-level user page table * @mm: the mm_struct of the current context * - * Allocates a page and runs the pgtable_pte_page_ctor(). + * Allocate memory for a page table and ptdesc and runs pagetable_pte_ctor(). 
* - * Return: `struct page` initialized as page table or %NULL on error + * Return: `struct page` referencing the ptdesc or %NULL on error */ static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { @@ -92,14 +97,16 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm) */ /** - * pte_free - free PTE-level user page table page + * pte_free - free PTE-level user page table memory * @mm: the mm_struct of the current context - * @pte_page: the `struct page` representing the page table + * @pte_page: the `struct page` referencing the ptdesc */ static inline void pte_free(struct mm_struct *mm, struct page *pte_page) { - pgtable_pte_page_dtor(pte_page); - __free_page(pte_page); + struct ptdesc *ptdesc = page_ptdesc(pte_page); + + pagetable_pte_dtor(ptdesc); + pagetable_free(ptdesc); } @@ -107,10 +114,11 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte_page) #ifndef __HAVE_ARCH_PMD_ALLOC_ONE /** - * pmd_alloc_one - allocate a page for PMD-level page table + * pmd_alloc_one - allocate memory for a PMD-level page table * @mm: the mm_struct of the current context * - * Allocates a page and runs the pgtable_pmd_page_ctor(). + * Allocate memory for a page table and ptdesc and runs pagetable_pmd_ctor(). + * * Allocations use %GFP_PGTABLE_USER in user context and * %GFP_PGTABLE_KERNEL in kernel context. 
* @@ -118,28 +126,30 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte_page) */ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - struct page *page; + struct ptdesc *ptdesc; gfp_t gfp = GFP_PGTABLE_USER; if (mm == &init_mm) gfp = GFP_PGTABLE_KERNEL; - page = alloc_page(gfp); - if (!page) + ptdesc = pagetable_alloc(gfp, 0); + if (!ptdesc) return NULL; - if (!pgtable_pmd_page_ctor(page)) { - __free_page(page); + if (!pagetable_pmd_ctor(ptdesc)) { + pagetable_free(ptdesc); return NULL; } - return (pmd_t *)page_address(page); + return ptdesc_address(ptdesc); } #endif #ifndef __HAVE_ARCH_PMD_FREE static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { + struct ptdesc *ptdesc = virt_to_ptdesc(pmd); + BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); - pgtable_pmd_page_dtor(virt_to_page(pmd)); - free_page((unsigned long)pmd); + pagetable_pmd_dtor(ptdesc); + pagetable_free(ptdesc); } #endif @@ -150,19 +160,25 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) static inline pud_t *__pud_alloc_one(struct mm_struct *mm, unsigned long addr) { gfp_t gfp = GFP_PGTABLE_USER; + struct ptdesc *ptdesc; if (mm == &init_mm) gfp = GFP_PGTABLE_KERNEL; - return (pud_t *)get_zeroed_page(gfp); + gfp &= ~__GFP_HIGHMEM; + + ptdesc = pagetable_alloc(gfp, 0); + if (!ptdesc) + return NULL; + return ptdesc_address(ptdesc); } #ifndef __HAVE_ARCH_PUD_ALLOC_ONE /** - * pud_alloc_one - allocate a page for PUD-level page table + * pud_alloc_one - allocate memory for a PUD-level page table * @mm: the mm_struct of the current context * - * Allocates a page using %GFP_PGTABLE_USER for user context and - * %GFP_PGTABLE_KERNEL for kernel context. + * Allocate memory for a page table using %GFP_PGTABLE_USER for user context + * and %GFP_PGTABLE_KERNEL for kernel context. 
* * Return: pointer to the allocated memory or %NULL on error */ @@ -175,7 +191,7 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) static inline void __pud_free(struct mm_struct *mm, pud_t *pud) { BUG_ON((unsigned long)pud & (PAGE_SIZE-1)); - free_page((unsigned long)pud); + pagetable_free(virt_to_ptdesc(pud)); } #ifndef __HAVE_ARCH_PUD_FREE @@ -190,7 +206,7 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) #ifndef __HAVE_ARCH_PGD_FREE static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { - free_page((unsigned long)pgd); + pagetable_free(virt_to_ptdesc(pgd)); } #endif From 358d1c39c82afaed58778633f6ed76c8fe9dbf9c Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:04:59 -0700 Subject: [PATCH 367/489] arm: convert various functions to use ptdescs As part of the conversions to replace pgtable constructor/destructors with ptdesc equivalents, convert various page table functions to use ptdescs. late_alloc() also uses the __get_free_pages() helper function. Convert this to use pagetable_alloc() and ptdesc_address() instead to help standardize page tables further. Link: https://lkml.kernel.org/r/20230807230513.102486-18-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. 
Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/arm/include/asm/tlb.h | 12 +++++++----- arch/arm/mm/mmu.c | 7 ++++--- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h index b8cbe03ad26034..f40d06ad5d2a34 100644 --- a/arch/arm/include/asm/tlb.h +++ b/arch/arm/include/asm/tlb.h @@ -39,7 +39,9 @@ static inline void __tlb_remove_table(void *_table) static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr) { - pgtable_pte_page_dtor(pte); + struct ptdesc *ptdesc = page_ptdesc(pte); + + pagetable_pte_dtor(ptdesc); #ifndef CONFIG_ARM_LPAE /* @@ -50,17 +52,17 @@ __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr) __tlb_adjust_range(tlb, addr - PAGE_SIZE, 2 * PAGE_SIZE); #endif - tlb_remove_table(tlb, pte); + tlb_remove_ptdesc(tlb, ptdesc); } static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr) { #ifdef CONFIG_ARM_LPAE - struct page *page = virt_to_page(pmdp); + struct ptdesc *ptdesc = virt_to_ptdesc(pmdp); - pgtable_pmd_page_dtor(page); - tlb_remove_table(tlb, page); + pagetable_pmd_dtor(ptdesc); + tlb_remove_ptdesc(tlb, ptdesc); #endif } diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 13fc4bb5f7924b..fdeaee30d167bf 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -737,11 +737,12 @@ static void __init *early_alloc(unsigned long sz) static void *__init late_alloc(unsigned long sz) { - void *ptr = (void *)__get_free_pages(GFP_PGTABLE_KERNEL, get_order(sz)); + void *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_HIGHMEM, + get_order(sz)); - if (!ptr || !pgtable_pte_page_ctor(virt_to_page(ptr))) + if (!ptdesc || 
!pagetable_pte_ctor(ptdesc)) BUG(); - return ptr; + return ptdesc_to_virt(ptdesc); } static pte_t * __init arm_pte_alloc(pmd_t *pmd, unsigned long addr, From 11b4fa8b2a56c0e7b9db4fe21c134a4cba414657 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:05:00 -0700 Subject: [PATCH 368/489] arm64: convert various functions to use ptdescs As part of the conversions to replace pgtable constructor/destructors with ptdesc equivalents, convert various page table functions to use ptdescs. Link: https://lkml.kernel.org/r/20230807230513.102486-19-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Acked-by: Catalin Marinas Cc: Arnd Bergmann Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/arm64/include/asm/tlb.h | 14 ++++++++------ arch/arm64/mm/mmu.c | 7 ++++--- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h index c995d1f4594f66..2c29239d05c3e1 100644 --- a/arch/arm64/include/asm/tlb.h +++ b/arch/arm64/include/asm/tlb.h @@ -75,18 +75,20 @@ static inline void tlb_flush(struct mmu_gather *tlb) static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr) { - pgtable_pte_page_dtor(pte); - tlb_remove_table(tlb, pte); + struct ptdesc *ptdesc = page_ptdesc(pte); + + pagetable_pte_dtor(ptdesc); + tlb_remove_ptdesc(tlb, ptdesc); } #if CONFIG_PGTABLE_LEVELS > 2 static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr) { - struct page *page = virt_to_page(pmdp); + struct ptdesc *ptdesc = virt_to_ptdesc(pmdp); - 
pgtable_pmd_page_dtor(page); - tlb_remove_table(tlb, page); + pagetable_pmd_dtor(ptdesc); + tlb_remove_ptdesc(tlb, ptdesc); } #endif @@ -94,7 +96,7 @@ static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pudp, unsigned long addr) { - tlb_remove_table(tlb, virt_to_page(pudp)); + tlb_remove_ptdesc(tlb, virt_to_ptdesc(pudp)); } #endif diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 95d360805f8aeb..47781bec61719d 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -426,6 +426,7 @@ static phys_addr_t __pgd_pgtable_alloc(int shift) static phys_addr_t pgd_pgtable_alloc(int shift) { phys_addr_t pa = __pgd_pgtable_alloc(shift); + struct ptdesc *ptdesc = page_ptdesc(phys_to_page(pa)); /* * Call proper page table ctor in case later we need to @@ -433,12 +434,12 @@ static phys_addr_t pgd_pgtable_alloc(int shift) * this pre-allocated page table. * * We don't select ARCH_ENABLE_SPLIT_PMD_PTLOCK if pmd is - * folded, and if so pgtable_pmd_page_ctor() becomes nop. + * folded, and if so pagetable_pmd_ctor() becomes nop. */ if (shift == PAGE_SHIFT) - BUG_ON(!pgtable_pte_page_ctor(phys_to_page(pa))); + BUG_ON(!pagetable_pte_ctor(ptdesc)); else if (shift == PMD_SHIFT) - BUG_ON(!pgtable_pmd_page_ctor(phys_to_page(pa))); + BUG_ON(!pagetable_pmd_ctor(ptdesc)); return pa; } From e647333995dde9c0b89369a4c23b9e410a080825 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:05:01 -0700 Subject: [PATCH 369/489] csky: convert __pte_free_tlb() to use ptdescs Part of the conversions to replace pgtable constructor/destructors with ptdesc equivalents. Link: https://lkml.kernel.org/r/20230807230513.102486-20-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Guo Ren Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S.
Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/csky/include/asm/pgalloc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/csky/include/asm/pgalloc.h b/arch/csky/include/asm/pgalloc.h index 7d57e5da091464..9c84c9012e5342 100644 --- a/arch/csky/include/asm/pgalloc.h +++ b/arch/csky/include/asm/pgalloc.h @@ -63,8 +63,8 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) #define __pte_free_tlb(tlb, pte, address) \ do { \ - pgtable_pte_page_dtor(pte); \ - tlb_remove_page(tlb, pte); \ + pagetable_pte_dtor(page_ptdesc(pte)); \ + tlb_remove_page_ptdesc(tlb, page_ptdesc(pte)); \ } while (0) extern void pagetable_init(void); From b45a12c0070a1d3e7666921f321e390df064f372 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:05:02 -0700 Subject: [PATCH 370/489] hexagon: convert __pte_free_tlb() to use ptdescs Part of the conversions to replace pgtable constructor/destructors with ptdesc equivalents. Link: https://lkml.kernel.org/r/20230807230513.102486-21-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. 
Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/hexagon/include/asm/pgalloc.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/hexagon/include/asm/pgalloc.h b/arch/hexagon/include/asm/pgalloc.h index f0c47e6a7427d4..55988625e6fbc9 100644 --- a/arch/hexagon/include/asm/pgalloc.h +++ b/arch/hexagon/include/asm/pgalloc.h @@ -87,10 +87,10 @@ static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, max_kernel_seg = pmdindex; } -#define __pte_free_tlb(tlb, pte, addr) \ -do { \ - pgtable_pte_page_dtor((pte)); \ - tlb_remove_page((tlb), (pte)); \ +#define __pte_free_tlb(tlb, pte, addr) \ +do { \ + pagetable_pte_dtor((page_ptdesc(pte))); \ + tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \ } while (0) #endif From 382739797f79ec2a0ca28d3f00b2559b4905f94e Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:05:03 -0700 Subject: [PATCH 371/489] loongarch: convert various functions to use ptdescs As part of the conversions to replace pgtable constructor/destructors with ptdesc equivalents, convert various page table functions to use ptdescs. Some of the functions use the *get*page*() helper functions. Convert these to use pagetable_alloc() and ptdesc_address() instead to help standardize page tables further. Link: https://lkml.kernel.org/r/20230807230513.102486-22-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. 
Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/loongarch/include/asm/pgalloc.h | 27 +++++++++++++++------------ arch/loongarch/mm/pgtable.c | 7 ++++--- 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/arch/loongarch/include/asm/pgalloc.h b/arch/loongarch/include/asm/pgalloc.h index af1d1e4a696595..23f5b1107246a3 100644 --- a/arch/loongarch/include/asm/pgalloc.h +++ b/arch/loongarch/include/asm/pgalloc.h @@ -45,9 +45,9 @@ extern void pagetable_init(void); extern pgd_t *pgd_alloc(struct mm_struct *mm); #define __pte_free_tlb(tlb, pte, address) \ -do { \ - pgtable_pte_page_dtor(pte); \ - tlb_remove_page((tlb), pte); \ +do { \ + pagetable_pte_dtor(page_ptdesc(pte)); \ + tlb_remove_page_ptdesc((tlb), page_ptdesc(pte)); \ } while (0) #ifndef __PAGETABLE_PMD_FOLDED @@ -55,18 +55,18 @@ do { \ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) { pmd_t *pmd; - struct page *pg; + struct ptdesc *ptdesc; - pg = alloc_page(GFP_KERNEL_ACCOUNT); - if (!pg) + ptdesc = pagetable_alloc(GFP_KERNEL_ACCOUNT, 0); + if (!ptdesc) return NULL; - if (!pgtable_pmd_page_ctor(pg)) { - __free_page(pg); + if (!pagetable_pmd_ctor(ptdesc)) { + pagetable_free(ptdesc); return NULL; } - pmd = (pmd_t *)page_address(pg); + pmd = ptdesc_address(ptdesc); pmd_init(pmd); return pmd; } @@ -80,10 +80,13 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address) { pud_t *pud; + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0); - pud = (pud_t *) __get_free_page(GFP_KERNEL); - if (pud) - pud_init(pud); + if (!ptdesc) + return NULL; + pud = ptdesc_address(ptdesc); + + 
pud_init(pud); return pud; } diff --git a/arch/loongarch/mm/pgtable.c b/arch/loongarch/mm/pgtable.c index 36a6dc0148aef2..5bd102b51f7c81 100644 --- a/arch/loongarch/mm/pgtable.c +++ b/arch/loongarch/mm/pgtable.c @@ -11,10 +11,11 @@ pgd_t *pgd_alloc(struct mm_struct *mm) { - pgd_t *ret, *init; + pgd_t *init, *ret = NULL; + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0); - ret = (pgd_t *) __get_free_page(GFP_KERNEL); - if (ret) { + if (ptdesc) { + ret = (pgd_t *)ptdesc_address(ptdesc); init = pgd_offset(&init_mm, 0UL); pgd_init(ret); memcpy(ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD, From bff28e6bd08e65ac5540de571e9dfd33f7481148 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:05:04 -0700 Subject: [PATCH 372/489] m68k: convert various functions to use ptdescs As part of the conversions to replace pgtable constructor/destructors with ptdesc equivalents, convert various page table functions to use ptdescs. Some of the functions use the *get*page*() helper functions. Convert these to use pagetable_alloc() and ptdesc_address() instead to help standardize page tables further. Link: https://lkml.kernel.org/r/20230807230513.102486-23-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Acked-by: Geert Uytterhoeven Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. 
Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/m68k/include/asm/mcf_pgalloc.h | 47 ++++++++++++++-------------- arch/m68k/include/asm/sun3_pgalloc.h | 8 ++--- arch/m68k/mm/motorola.c | 4 +-- 3 files changed, 30 insertions(+), 29 deletions(-) diff --git a/arch/m68k/include/asm/mcf_pgalloc.h b/arch/m68k/include/asm/mcf_pgalloc.h index 5c2c0a864524cc..302c5bf67179e1 100644 --- a/arch/m68k/include/asm/mcf_pgalloc.h +++ b/arch/m68k/include/asm/mcf_pgalloc.h @@ -5,22 +5,22 @@ #include #include -extern inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) { - free_page((unsigned long) pte); + pagetable_free(virt_to_ptdesc(pte)); } extern const char bad_pmd_string[]; -extern inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) +static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm) { - unsigned long page = __get_free_page(GFP_DMA); + struct ptdesc *ptdesc = pagetable_alloc((GFP_DMA | __GFP_ZERO) & + ~__GFP_HIGHMEM, 0); - if (!page) + if (!ptdesc) return NULL; - memset((void *)page, 0, PAGE_SIZE); - return (pte_t *) (page); + return ptdesc_address(ptdesc); } extern inline pmd_t *pmd_alloc_kernel(pgd_t *pgd, unsigned long address) @@ -35,36 +35,34 @@ extern inline pmd_t *pmd_alloc_kernel(pgd_t *pgd, unsigned long address) static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pgtable, unsigned long address) { - struct page *page = virt_to_page(pgtable); + struct ptdesc *ptdesc = virt_to_ptdesc(pgtable); - pgtable_pte_page_dtor(page); - __free_page(page); + pagetable_pte_dtor(ptdesc); + pagetable_free(ptdesc); } static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { - struct page *page = alloc_pages(GFP_DMA, 0); + struct 
ptdesc *ptdesc = pagetable_alloc(GFP_DMA | __GFP_ZERO, 0); pte_t *pte; - if (!page) + if (!ptdesc) return NULL; - if (!pgtable_pte_page_ctor(page)) { - __free_page(page); + if (!pagetable_pte_ctor(ptdesc)) { + pagetable_free(ptdesc); return NULL; } - pte = page_address(page); - clear_page(pte); - + pte = ptdesc_address(ptdesc); return pte; } static inline void pte_free(struct mm_struct *mm, pgtable_t pgtable) { - struct page *page = virt_to_page(pgtable); + struct ptdesc *ptdesc = virt_to_ptdesc(pgtable); - pgtable_pte_page_dtor(page); - __free_page(page); + pagetable_pte_dtor(ptdesc); + pagetable_free(ptdesc); } /* @@ -75,16 +73,19 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t pgtable) static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { - free_page((unsigned long) pgd); + pagetable_free(virt_to_ptdesc(pgd)); } static inline pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *new_pgd; + struct ptdesc *ptdesc = pagetable_alloc((GFP_DMA | __GFP_NOWARN) & + ~__GFP_HIGHMEM, 0); - new_pgd = (pgd_t *)__get_free_page(GFP_DMA | __GFP_NOWARN); - if (!new_pgd) + if (!ptdesc) return NULL; + new_pgd = ptdesc_address(ptdesc); + memcpy(new_pgd, swapper_pg_dir, PTRS_PER_PGD * sizeof(pgd_t)); memset(new_pgd, 0, PAGE_OFFSET >> PGDIR_SHIFT); return new_pgd; diff --git a/arch/m68k/include/asm/sun3_pgalloc.h b/arch/m68k/include/asm/sun3_pgalloc.h index 198036aff51934..ff48573db2c04c 100644 --- a/arch/m68k/include/asm/sun3_pgalloc.h +++ b/arch/m68k/include/asm/sun3_pgalloc.h @@ -17,10 +17,10 @@ extern const char bad_pmd_string[]; -#define __pte_free_tlb(tlb,pte,addr) \ -do { \ - pgtable_pte_page_dtor(pte); \ - tlb_remove_page((tlb), pte); \ +#define __pte_free_tlb(tlb, pte, addr) \ +do { \ + pagetable_pte_dtor(page_ptdesc(pte)); \ + tlb_remove_page_ptdesc((tlb), page_ptdesc(pte)); \ } while (0) static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c index 
c75984e2d86b9f..594575a0780c12 100644 --- a/arch/m68k/mm/motorola.c +++ b/arch/m68k/mm/motorola.c @@ -161,7 +161,7 @@ void *get_pointer_table(int type) * m68k doesn't have SPLIT_PTE_PTLOCKS for not having * SMP. */ - pgtable_pte_page_ctor(virt_to_page(page)); + pagetable_pte_ctor(virt_to_ptdesc(page)); } mmu_page_ctor(page); @@ -201,7 +201,7 @@ int free_pointer_table(void *table, int type) list_del(dp); mmu_page_dtor((void *)page); if (type == TABLE_PTE) - pgtable_pte_page_dtor(virt_to_page((void *)page)); + pagetable_pte_dtor(virt_to_ptdesc((void *)page)); free_page (page); return 1; } else if (ptable_list[type].next != dp) { From 3e14fb19ad7cef7a6e998caabed3de4232a3f257 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:05:05 -0700 Subject: [PATCH 373/489] mips: convert various functions to use ptdescs As part of the conversions to replace pgtable constructor/destructors with ptdesc equivalents, convert various page table functions to use ptdescs. Some of the functions use the *get*page*() helper functions. Convert these to use pagetable_alloc() and ptdesc_address() instead to help standardize page tables further. Link: https://lkml.kernel.org/r/20230807230513.102486-24-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. 
Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/mips/include/asm/pgalloc.h | 32 ++++++++++++++++++-------------- arch/mips/mm/pgtable.c | 8 +++++--- 2 files changed, 23 insertions(+), 17 deletions(-) diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h index f72e737dda214f..40e40a7eb94af5 100644 --- a/arch/mips/include/asm/pgalloc.h +++ b/arch/mips/include/asm/pgalloc.h @@ -51,13 +51,13 @@ extern pgd_t *pgd_alloc(struct mm_struct *mm); static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { - free_pages((unsigned long)pgd, PGD_TABLE_ORDER); + pagetable_free(virt_to_ptdesc(pgd)); } -#define __pte_free_tlb(tlb,pte,address) \ -do { \ - pgtable_pte_page_dtor(pte); \ - tlb_remove_page((tlb), pte); \ +#define __pte_free_tlb(tlb, pte, address) \ +do { \ + pagetable_pte_dtor(page_ptdesc(pte)); \ + tlb_remove_page_ptdesc((tlb), page_ptdesc(pte)); \ } while (0) #ifndef __PAGETABLE_PMD_FOLDED @@ -65,18 +65,18 @@ do { \ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) { pmd_t *pmd; - struct page *pg; + struct ptdesc *ptdesc; - pg = alloc_pages(GFP_KERNEL_ACCOUNT, PMD_TABLE_ORDER); - if (!pg) + ptdesc = pagetable_alloc(GFP_KERNEL_ACCOUNT, PMD_TABLE_ORDER); + if (!ptdesc) return NULL; - if (!pgtable_pmd_page_ctor(pg)) { - __free_pages(pg, PMD_TABLE_ORDER); + if (!pagetable_pmd_ctor(ptdesc)) { + pagetable_free(ptdesc); return NULL; } - pmd = (pmd_t *)page_address(pg); + pmd = ptdesc_address(ptdesc); pmd_init(pmd); return pmd; } @@ -90,10 +90,14 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long address) { pud_t *pud; + struct ptdesc *ptdesc = 
pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, + PUD_TABLE_ORDER); - pud = (pud_t *) __get_free_pages(GFP_KERNEL, PUD_TABLE_ORDER); - if (pud) - pud_init(pud); + if (!ptdesc) + return NULL; + pud = ptdesc_address(ptdesc); + + pud_init(pud); return pud; } diff --git a/arch/mips/mm/pgtable.c b/arch/mips/mm/pgtable.c index b13314be5d0e57..1506e458040d48 100644 --- a/arch/mips/mm/pgtable.c +++ b/arch/mips/mm/pgtable.c @@ -10,10 +10,12 @@ pgd_t *pgd_alloc(struct mm_struct *mm) { - pgd_t *ret, *init; + pgd_t *init, *ret = NULL; + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, + PGD_TABLE_ORDER); - ret = (pgd_t *) __get_free_pages(GFP_KERNEL, PGD_TABLE_ORDER); - if (ret) { + if (ptdesc) { + ret = ptdesc_address(ptdesc); init = pgd_offset(&init_mm, 0UL); pgd_init(ret); memcpy(ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD, From 61139e9a7592edc431e7586aab475a77c957e65c Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:05:06 -0700 Subject: [PATCH 374/489] nios2: convert __pte_free_tlb() to use ptdescs Part of the conversions to replace pgtable constructor/destructors with ptdesc equivalents. Link: https://lkml.kernel.org/r/20230807230513.102486-25-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Acked-by: Dinh Nguyen Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. 
Miller" Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/nios2/include/asm/pgalloc.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/nios2/include/asm/pgalloc.h b/arch/nios2/include/asm/pgalloc.h index ecd1657bb2cede..ce6bb8e74271f1 100644 --- a/arch/nios2/include/asm/pgalloc.h +++ b/arch/nios2/include/asm/pgalloc.h @@ -28,10 +28,10 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, extern pgd_t *pgd_alloc(struct mm_struct *mm); -#define __pte_free_tlb(tlb, pte, addr) \ - do { \ - pgtable_pte_page_dtor(pte); \ - tlb_remove_page((tlb), (pte)); \ +#define __pte_free_tlb(tlb, pte, addr) \ + do { \ + pagetable_pte_dtor(page_ptdesc(pte)); \ + tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \ } while (0) #endif /* _ASM_NIOS2_PGALLOC_H */ From 5823b9fe0451869b4c67a920533d47c5bf1e7628 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:05:07 -0700 Subject: [PATCH 375/489] openrisc: convert __pte_free_tlb() to use ptdescs Part of the conversions to replace pgtable constructor/destructors with ptdesc equivalents. Link: https://lkml.kernel.org/r/20230807230513.102486-26-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. 
Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/openrisc/include/asm/pgalloc.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/openrisc/include/asm/pgalloc.h b/arch/openrisc/include/asm/pgalloc.h index b7b2b8d16fad54..c6a73772a54663 100644 --- a/arch/openrisc/include/asm/pgalloc.h +++ b/arch/openrisc/include/asm/pgalloc.h @@ -66,10 +66,10 @@ extern inline pgd_t *pgd_alloc(struct mm_struct *mm) extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm); -#define __pte_free_tlb(tlb, pte, addr) \ -do { \ - pgtable_pte_page_dtor(pte); \ - tlb_remove_page((tlb), (pte)); \ +#define __pte_free_tlb(tlb, pte, addr) \ +do { \ + pagetable_pte_dtor(page_ptdesc(pte)); \ + tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \ } while (0) #endif From 380f2c1ae9d45a1aa19a2b05dbe57371feebb394 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:05:08 -0700 Subject: [PATCH 376/489] riscv: convert alloc_{pmd, pte}_late() to use ptdescs As part of the conversions to replace pgtable constructor/destructors with ptdesc equivalents, convert various page table functions to use ptdescs. Some of the functions use the *get*page*() helper functions. Convert these to use pagetable_alloc() and ptdesc_address() instead to help standardize page tables further. Link: https://lkml.kernel.org/r/20230807230513.102486-27-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Palmer Dabbelt Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. 
Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/riscv/include/asm/pgalloc.h | 8 ++++---- arch/riscv/mm/init.c | 16 ++++++---------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h index 59dc12b5b7e8fd..d169a4f41a2e72 100644 --- a/arch/riscv/include/asm/pgalloc.h +++ b/arch/riscv/include/asm/pgalloc.h @@ -153,10 +153,10 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm) #endif /* __PAGETABLE_PMD_FOLDED */ -#define __pte_free_tlb(tlb, pte, buf) \ -do { \ - pgtable_pte_page_dtor(pte); \ - tlb_remove_page((tlb), pte); \ +#define __pte_free_tlb(tlb, pte, buf) \ +do { \ + pagetable_pte_dtor(page_ptdesc(pte)); \ + tlb_remove_page_ptdesc((tlb), page_ptdesc(pte));\ } while (0) #endif /* CONFIG_MMU */ diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 9ce504737d1858..430a3d05a841f9 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -353,12 +353,10 @@ static inline phys_addr_t __init alloc_pte_fixmap(uintptr_t va) static phys_addr_t __init alloc_pte_late(uintptr_t va) { - unsigned long vaddr; - - vaddr = __get_free_page(GFP_KERNEL); - BUG_ON(!vaddr || !pgtable_pte_page_ctor(virt_to_page((void *)vaddr))); + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0); - return __pa(vaddr); + BUG_ON(!ptdesc || !pagetable_pte_ctor(ptdesc)); + return __pa((pte_t *)ptdesc_address(ptdesc)); } static void __init create_pte_mapping(pte_t *ptep, @@ -436,12 +434,10 @@ static phys_addr_t __init alloc_pmd_fixmap(uintptr_t va) static phys_addr_t __init alloc_pmd_late(uintptr_t va) { - unsigned long vaddr; - - vaddr = __get_free_page(GFP_KERNEL); - BUG_ON(!vaddr || !pgtable_pmd_page_ctor(virt_to_page((void *)vaddr))); 
+ struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL & ~__GFP_HIGHMEM, 0); - return __pa(vaddr); + BUG_ON(!ptdesc || !pagetable_pmd_ctor(ptdesc)); + return __pa((pmd_t *)ptdesc_address(ptdesc)); } static void __init create_pmd_mapping(pmd_t *pmdp, From bb3be388537b85b567dd614b492d66f383bc8273 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:05:09 -0700 Subject: [PATCH 377/489] sh: convert pte_free_tlb() to use ptdescs Part of the conversions to replace pgtable constructor/destructors with ptdesc equivalents. Also cleans up some spacing issues. Link: https://lkml.kernel.org/r/20230807230513.102486-28-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Geert Uytterhoeven Acked-by: John Paul Adrian Glaubitz Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/sh/include/asm/pgalloc.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/sh/include/asm/pgalloc.h b/arch/sh/include/asm/pgalloc.h index a9e98233c4d498..5d8577ab15911e 100644 --- a/arch/sh/include/asm/pgalloc.h +++ b/arch/sh/include/asm/pgalloc.h @@ -2,6 +2,7 @@ #ifndef __ASM_SH_PGALLOC_H #define __ASM_SH_PGALLOC_H +#include #include #define __HAVE_ARCH_PMD_ALLOC_ONE @@ -31,10 +32,10 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, set_pmd(pmd, __pmd((unsigned long)page_address(pte))); } -#define __pte_free_tlb(tlb,pte,addr) \ -do { \ - pgtable_pte_page_dtor(pte); \ - tlb_remove_page((tlb), (pte)); \ +#define __pte_free_tlb(tlb, pte, addr) \ +do { \ + pagetable_pte_dtor(page_ptdesc(pte)); \ + tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); 
\ } while (0) #endif /* __ASM_SH_PGALLOC_H */ From b3311d707c8f1c1d3b0e45f5c4785e9ca3d723a1 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:05:10 -0700 Subject: [PATCH 378/489] sparc64: convert various functions to use ptdescs As part of the conversions to replace pgtable constructor/destructors with ptdesc equivalents, convert various page table functions to use ptdescs. Link: https://lkml.kernel.org/r/20230807230513.102486-29-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/sparc/mm/init_64.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 0d7fd793924c85..9a63a3e08e40c1 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2893,14 +2893,15 @@ pte_t *pte_alloc_one_kernel(struct mm_struct *mm) pgtable_t pte_alloc_one(struct mm_struct *mm) { - struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) + struct ptdesc *ptdesc = pagetable_alloc(GFP_KERNEL | __GFP_ZERO, 0); + + if (!ptdesc) return NULL; - if (!pgtable_pte_page_ctor(page)) { - __free_page(page); + if (!pagetable_pte_ctor(ptdesc)) { + pagetable_free(ptdesc); return NULL; } - return (pte_t *) page_address(page); + return ptdesc_address(ptdesc); } void pte_free_kernel(struct mm_struct *mm, pte_t *pte) @@ -2910,10 +2911,10 @@ void pte_free_kernel(struct mm_struct *mm, pte_t *pte) static void __pte_free(pgtable_t pte) { - struct page *page = virt_to_page(pte); + struct ptdesc 
*ptdesc = virt_to_ptdesc(pte); - pgtable_pte_page_dtor(page); - __free_page(page); + pagetable_pte_dtor(ptdesc); + pagetable_free(ptdesc); } void pte_free(struct mm_struct *mm, pgtable_t pte) From 222107e1601f6de9c662784929c0f819cc01fa21 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:05:11 -0700 Subject: [PATCH 379/489] sparc: convert pgtable_pte_page_{ctor, dtor}() to ptdesc equivalents Part of the conversions to replace pgtable pte constructor/destructors with ptdesc equivalents. Link: https://lkml.kernel.org/r/20230807230513.102486-30-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/sparc/mm/srmmu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index 13f027afc875c8..8393faa3e596e4 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -355,7 +355,8 @@ pgtable_t pte_alloc_one(struct mm_struct *mm) return NULL; page = pfn_to_page(__nocache_pa((unsigned long)ptep) >> PAGE_SHIFT); spin_lock(&mm->page_table_lock); - if (page_ref_inc_return(page) == 2 && !pgtable_pte_page_ctor(page)) { + if (page_ref_inc_return(page) == 2 && + !pagetable_pte_ctor(page_ptdesc(page))) { page_ref_dec(page); ptep = NULL; } @@ -371,7 +372,7 @@ void pte_free(struct mm_struct *mm, pgtable_t ptep) page = pfn_to_page(__nocache_pa((unsigned long)ptep) >> PAGE_SHIFT); spin_lock(&mm->page_table_lock); if (page_ref_dec_return(page) == 1) - pgtable_pte_page_dtor(page); + 
pagetable_pte_dtor(page_ptdesc(page)); spin_unlock(&mm->page_table_lock); srmmu_free_nocache(ptep, SRMMU_PTE_TABLE_SIZE); From da9aefca789d753071e3f36fa940da329c11f7f8 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:05:12 -0700 Subject: [PATCH 380/489] um: convert {pmd, pte}_free_tlb() to use ptdescs Part of the conversions to replace pgtable constructor/destructors with ptdesc equivalents. Also cleans up some spacing issues. Link: https://lkml.kernel.org/r/20230807230513.102486-31-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- arch/um/include/asm/pgalloc.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h index 8ec7cd46dd9621..de5e31c64793d7 100644 --- a/arch/um/include/asm/pgalloc.h +++ b/arch/um/include/asm/pgalloc.h @@ -25,19 +25,19 @@ */ extern pgd_t *pgd_alloc(struct mm_struct *); -#define __pte_free_tlb(tlb,pte, address) \ -do { \ - pgtable_pte_page_dtor(pte); \ - tlb_remove_page((tlb),(pte)); \ +#define __pte_free_tlb(tlb, pte, address) \ +do { \ + pagetable_pte_dtor(page_ptdesc(pte)); \ + tlb_remove_page_ptdesc((tlb), (page_ptdesc(pte))); \ } while (0) #ifdef CONFIG_3_LEVEL_PGTABLES -#define __pmd_free_tlb(tlb, pmd, address) \ -do { \ - pgtable_pmd_page_dtor(virt_to_page(pmd)); \ - tlb_remove_page((tlb),virt_to_page(pmd)); \ -} while (0) \ +#define __pmd_free_tlb(tlb, pmd, address) \ +do { \ + pagetable_pmd_dtor(virt_to_ptdesc(pmd)); \ + 
tlb_remove_page_ptdesc((tlb), virt_to_ptdesc(pmd)); \ +} while (0) #endif From 9a4bbd8d975e01a777005e00c0e26d72bb6cc15a Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 7 Aug 2023 16:05:13 -0700 Subject: [PATCH 381/489] mm: remove pgtable_{pmd, pte}_page_{ctor, dtor}() wrappers These functions are no longer necessary. Remove them and cleanup Documentation referencing them. Link: https://lkml.kernel.org/r/20230807230513.102486-32-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Christophe Leroy Cc: Claudio Imbrenda Cc: Dave Hansen Cc: David Hildenbrand Cc: "David S. Miller" Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Huacai Chen Cc: Hugh Dickins Cc: John Paul Adrian Glaubitz Cc: Jonas Bonn Cc: Matthew Wilcox Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Weinberger Cc: Thomas Bogendoerfer Cc: Yoshinori Sato Signed-off-by: Andrew Morton --- Documentation/mm/split_page_table_lock.rst | 12 +++++------ .../zh_CN/mm/split_page_table_lock.rst | 14 ++++++------- include/linux/mm.h | 20 ------------------- 3 files changed, 13 insertions(+), 33 deletions(-) diff --git a/Documentation/mm/split_page_table_lock.rst b/Documentation/mm/split_page_table_lock.rst index a834fad9de1209..e4f6972eb6c04b 100644 --- a/Documentation/mm/split_page_table_lock.rst +++ b/Documentation/mm/split_page_table_lock.rst @@ -58,7 +58,7 @@ Support of split page table lock by an architecture =================================================== There's no need in special enabling of PTE split page table lock: everything -required is done by pgtable_pte_page_ctor() and pgtable_pte_page_dtor(), which +required is done by pagetable_pte_ctor() and pagetable_pte_dtor(), which must be called on PTE table allocation / freeing. Make sure the architecture doesn't use slab allocator for page table @@ -68,8 +68,8 @@ This field shares storage with page->ptl. 
PMD split lock only makes sense if you have more than two page table levels. -PMD split lock enabling requires pgtable_pmd_page_ctor() call on PMD table -allocation and pgtable_pmd_page_dtor() on freeing. +PMD split lock enabling requires pagetable_pmd_ctor() call on PMD table +allocation and pagetable_pmd_dtor() on freeing. Allocation usually happens in pmd_alloc_one(), freeing in pmd_free() and pmd_free_tlb(), but make sure you cover all PMD table allocation / freeing @@ -77,7 +77,7 @@ paths: i.e X86_PAE preallocate few PMDs on pgd_alloc(). With everything in place you can set CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK. -NOTE: pgtable_pte_page_ctor() and pgtable_pmd_page_ctor() can fail -- it must +NOTE: pagetable_pte_ctor() and pagetable_pmd_ctor() can fail -- it must be handled properly. page->ptl @@ -97,7 +97,7 @@ trick: split lock with enabled DEBUG_SPINLOCK or DEBUG_LOCK_ALLOC, but costs one more cache line for indirect access; -The spinlock_t allocated in pgtable_pte_page_ctor() for PTE table and in -pgtable_pmd_page_ctor() for PMD table. +The spinlock_t allocated in pagetable_pte_ctor() for PTE table and in +pagetable_pmd_ctor() for PMD table. Please, never access page->ptl directly -- use appropriate helper. 
diff --git a/Documentation/translations/zh_CN/mm/split_page_table_lock.rst b/Documentation/translations/zh_CN/mm/split_page_table_lock.rst index 4fb7aa666037ca..a2c288670a2469 100644 --- a/Documentation/translations/zh_CN/mm/split_page_table_lock.rst +++ b/Documentation/translations/zh_CN/mm/split_page_table_lock.rst @@ -56,16 +56,16 @@ Hugetlb特定的辅助函数: 架构对分页表锁的支持 ==================== -没有必要特别启用PTE分页表锁:所有需要的东西都由pgtable_pte_page_ctor() -和pgtable_pte_page_dtor()完成,它们必须在PTE表分配/释放时被调用。 +没有必要特别启用PTE分页表锁:所有需要的东西都由pagetable_pte_ctor() +和pagetable_pte_dtor()完成,它们必须在PTE表分配/释放时被调用。 确保架构不使用slab分配器来分配页表:slab使用page->slab_cache来分配其页 面。这个区域与page->ptl共享存储。 PMD分页锁只有在你有两个以上的页表级别时才有意义。 -启用PMD分页锁需要在PMD表分配时调用pgtable_pmd_page_ctor(),在释放时调 -用pgtable_pmd_page_dtor()。 +启用PMD分页锁需要在PMD表分配时调用pagetable_pmd_ctor(),在释放时调 +用pagetable_pmd_dtor()。 分配通常发生在pmd_alloc_one()中,释放发生在pmd_free()和pmd_free_tlb() 中,但要确保覆盖所有的PMD表分配/释放路径:即X86_PAE在pgd_alloc()中预先 @@ -73,7 +73,7 @@ PMD分页锁只有在你有两个以上的页表级别时才有意义。 一切就绪后,你可以设置CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK。 -注意:pgtable_pte_page_ctor()和pgtable_pmd_page_ctor()可能失败--必 +注意:pagetable_pte_ctor()和pagetable_pmd_ctor()可能失败--必 须正确处理。 page->ptl @@ -90,7 +90,7 @@ page->ptl用于访问分割页表锁,其中'page'是包含该表的页面struc 的指针并动态分配它。这允许在启用DEBUG_SPINLOCK或DEBUG_LOCK_ALLOC的 情况下使用分页锁,但由于间接访问而多花了一个缓存行。 -PTE表的spinlock_t分配在pgtable_pte_page_ctor()中,PMD表的spinlock_t -分配在pgtable_pmd_page_ctor()中。 +PTE表的spinlock_t分配在pagetable_pte_ctor()中,PMD表的spinlock_t +分配在pagetable_pmd_ctor()中。 请不要直接访问page->ptl - -使用适当的辅助函数。 diff --git a/include/linux/mm.h b/include/linux/mm.h index 6fdc294ada0d8a..88ac4cf9d3ecac 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2898,11 +2898,6 @@ static inline bool pagetable_pte_ctor(struct ptdesc *ptdesc) return true; } -static inline bool pgtable_pte_page_ctor(struct page *page) -{ - return pagetable_pte_ctor(page_ptdesc(page)); -} - static inline void pagetable_pte_dtor(struct ptdesc *ptdesc) { struct folio *folio = ptdesc_folio(ptdesc); @@ -2912,11 +2907,6 @@ static inline void 
pagetable_pte_dtor(struct ptdesc *ptdesc) lruvec_stat_sub_folio(folio, NR_PAGETABLE); } -static inline void pgtable_pte_page_dtor(struct page *page) -{ - pagetable_pte_dtor(page_ptdesc(page)); -} - pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp); static inline pte_t *pte_offset_map(pmd_t *pmd, unsigned long addr) { @@ -3023,11 +3013,6 @@ static inline bool pagetable_pmd_ctor(struct ptdesc *ptdesc) return true; } -static inline bool pgtable_pmd_page_ctor(struct page *page) -{ - return pagetable_pmd_ctor(page_ptdesc(page)); -} - static inline void pagetable_pmd_dtor(struct ptdesc *ptdesc) { struct folio *folio = ptdesc_folio(ptdesc); @@ -3037,11 +3022,6 @@ static inline void pagetable_pmd_dtor(struct ptdesc *ptdesc) lruvec_stat_sub_folio(folio, NR_PAGETABLE); } -static inline void pgtable_pmd_page_dtor(struct page *page) -{ - pagetable_pmd_dtor(page_ptdesc(page)); -} - /* * No scalability reason to split PUD locks yet, but follow the same pattern * as the PMD locks to make it easier if we decide to. The VM should not be From 708879a1b44216f6c12a3d61328c5259078fc1b1 Mon Sep 17 00:00:00 2001 From: Rong Tao Date: Mon, 14 Aug 2023 18:45:50 +0800 Subject: [PATCH 382/489] selftests/mm: fix uffd-stress help information commit 686a8bb72349("selftests/mm: split uffd tests into uffd-stress and uffd-unit-tests") split uffd tests into uffd-stress and uffd-unit-tests, obviously we need to modify the help information synchronously. Also modify code indentation. 
Link: https://lkml.kernel.org/r/tencent_64FC724AC5F05568F41BD1C68058E83CEB05@qq.com Signed-off-by: Rong Tao Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/uffd-stress.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c index 73ebb97c70264a..469e0476af26bb 100644 --- a/tools/testing/selftests/mm/uffd-stress.c +++ b/tools/testing/selftests/mm/uffd-stress.c @@ -53,21 +53,21 @@ pthread_attr_t attr; do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) const char *examples = - "# Run anonymous memory test on 100MiB region with 99999 bounces:\n" - "./userfaultfd anon 100 99999\n\n" - "# Run share memory test on 1GiB region with 99 bounces:\n" - "./userfaultfd shmem 1000 99\n\n" - "# Run hugetlb memory test on 256MiB region with 50 bounces:\n" - "./userfaultfd hugetlb 256 50\n\n" - "# Run the same hugetlb test but using private file:\n" - "./userfaultfd hugetlb-private 256 50\n\n" - "# 10MiB-~6GiB 999 bounces anonymous test, " - "continue forever unless an error triggers\n" - "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n"; + "# Run anonymous memory test on 100MiB region with 99999 bounces:\n" + "./uffd-stress anon 100 99999\n\n" + "# Run share memory test on 1GiB region with 99 bounces:\n" + "./uffd-stress shmem 1000 99\n\n" + "# Run hugetlb memory test on 256MiB region with 50 bounces:\n" + "./uffd-stress hugetlb 256 50\n\n" + "# Run the same hugetlb test but using private file:\n" + "./uffd-stress hugetlb-private 256 50\n\n" + "# 10MiB-~6GiB 999 bounces anonymous test, " + "continue forever unless an error triggers\n" + "while ./uffd-stress anon $[RANDOM % 6000 + 10] 999; do true; done\n\n"; static void usage(void) { - fprintf(stderr, "\nUsage: ./userfaultfd \n\n"); + fprintf(stderr, "\nUsage: ./uffd-stress \n\n"); fprintf(stderr, "Supported : anon, hugetlb, " "hugetlb-private, 
shmem, shmem-private\n\n"); fprintf(stderr, "Examples:\n\n"); From 99f34659e78b9b781a3248e0b080b4dfca4957e2 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Mon, 14 Aug 2023 18:40:57 +1000 Subject: [PATCH 383/489] selftests: memfd: error out test process when child test fails Patch series "memfd: cleanups for vm.memfd_noexec", v2. The most critical issue with vm.memfd_noexec=2 (the fact that passing MFD_EXEC would bypass it entirely[1]) has been fixed in Andrew's tree[2], but there are still some outstanding issues that need to be addressed: * vm.memfd_noexec=2 shouldn't reject old-style memfd_create(2) syscalls because it will make it far too difficult to ever migrate. Instead it should imply MFD_NOEXEC_SEAL. * The dmesg warnings are pr_warn_once(), which on most systems means that they will be used up by systemd or some other boot process and userspace developers will never see it. - For the !(flags & (MFD_EXEC | MFD_NOEXEC_SEAL)) case, outputting a rate-limited message to the kernel log is necessary to tell userspace that they should add the new flags. Arguably the most ideal way to deal with the spam concern[3,4] while still prompting userspace to switch to the new flags would be to only log the warning once per task or something similar. However, adding something to task_struct for tracking this would be needless bloat for a single pr_warn_ratelimited(). So just switch to pr_info_ratelimited() to avoid spamming the log with something that isn't a real warning. There's lots of info-level stuff in dmesg, it seems really unlikely that this should be an actual problem. Most programs are already switching to the new flags anyway. - For the vm.memfd_noexec=2 case, we need to log a warning for every failure because otherwise userspace will have no idea why their previously working program started returning -EACCES (previously -EINVAL) from memfd_create(2). pr_warn_once() is simply wrong here.
* The ratcheting mechanism for vm.memfd_noexec makes it incredibly unappealing for most users to enable the sysctl because enabling it on &init_pid_ns means you need a system reboot to unset it. Given the actual security threat being protected against, CAP_SYS_ADMIN users being restricted in this way makes little sense. The argument for this ratcheting by the original author was that it allows you to have a hierarchical setting that cannot be unset by child pidnses, but this is not accurate -- changing the parent pidns's vm.memfd_noexec setting to be more restrictive didn't affect children. Instead, switch the vm.memfd_noexec sysctl to be properly hierarchical and allow CAP_SYS_ADMIN users (in the pidns's owning userns) to lower the setting as long as it is not lower than the parent's effective setting. This change also makes it so that changing a parent pidns's vm.memfd_noexec will affect all descendants, providing a properly hierarchical setting. The performance impact of this is incredibly minimal since the maximum depth of pidns is 32 and it is only checked during memfd_create(2) and unshare(CLONE_NEWPID). * The memfd selftests would not exit with a non-zero error code when certain tests that ran in a forked process (specifically the ones related to MFD_EXEC and MFD_NOEXEC_SEAL) failed. [1]: https://lore.kernel.org/all/ZJwcsU0vI-nzgOB_@codewreck.org/ [2]: https://lore.kernel.org/all/20230705063315.3680666-1-jeffxu@google.com/ [3]: https://lore.kernel.org/Y5yS8wCnuYGLHMj4@x1n/ [4]: https://lore.kernel.org/f185bb42-b29c-977e-312e-3349eea15383@linuxfoundation.org/ This patch (of 5): Before this change, a test runner using this self test would see a return code of 0 when the tests using a child process (namely the MFD_NOEXEC_SEAL and MFD_EXEC tests) failed, masking test failures.
Link: https://lkml.kernel.org/r/20230814-memfd-vm-noexec-uapi-fixes-v2-0-7ff9e3e10ba6@cyphar.com Link: https://lkml.kernel.org/r/20230814-memfd-vm-noexec-uapi-fixes-v2-1-7ff9e3e10ba6@cyphar.com Fixes: 11f75a01448f ("selftests/memfd: add tests for MFD_NOEXEC_SEAL MFD_EXEC") Signed-off-by: Aleksa Sarai Reviewed-by: Jeff Xu Cc: "Christian Brauner (Microsoft)" Cc: Daniel Verkamp Cc: Dominique Martinet Cc: Kees Cook Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/memfd/memfd_test.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index dbdd9ec5e3973f..8eb49204f9eac8 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -1207,7 +1207,24 @@ static pid_t spawn_newpid_thread(unsigned int flags, int (*fn)(void *)) static void join_newpid_thread(pid_t pid) { - waitpid(pid, NULL, 0); + int wstatus; + + if (waitpid(pid, &wstatus, 0) < 0) { + printf("newpid thread: waitpid() failed: %m\n"); + abort(); + } + + if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) != 0) { + printf("newpid thread: exited with non-zero error code %d\n", + WEXITSTATUS(wstatus)); + abort(); + } + + if (WIFSIGNALED(wstatus)) { + printf("newpid thread: killed by signal %d\n", + WTERMSIG(wstatus)); + abort(); + } } /* From 202e14222fadb246dfdf182e67de1518e86a1e20 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Mon, 14 Aug 2023 18:40:58 +1000 Subject: [PATCH 384/489] memfd: do not -EACCES old memfd_create() users with vm.memfd_noexec=2 Given the difficulty of auditing all of userspace to figure out whether every memfd_create() user has switched to passing MFD_EXEC and MFD_NOEXEC_SEAL flags, it seems far less disruptive to make it possible for older programs that don't make use of executable memfds to run under vm.memfd_noexec=2. Otherwise, a small dependency change can result in spurious errors.
For programs that don't use executable memfds, passing MFD_NOEXEC_SEAL is functionally a no-op and thus having the same In addition, every failure under vm.memfd_noexec=2 needs to print to the kernel log so that userspace can figure out where the error came from. The concerns about pr_warn_ratelimited() spam that caused the switch to pr_warn_once()[1,2] do not apply to the vm.memfd_noexec=2 case. This is a user-visible API change, but as it allows programs to do something that would be blocked before, and the sysctl itself was broken and recently released, it seems unlikely this will cause any issues. [1]: https://lore.kernel.org/Y5yS8wCnuYGLHMj4@x1n/ [2]: https://lore.kernel.org/202212161233.85C9783FB@keescook/ Link: https://lkml.kernel.org/r/20230814-memfd-vm-noexec-uapi-fixes-v2-2-7ff9e3e10ba6@cyphar.com Fixes: 105ff5339f49 ("mm/memfd: add MFD_NOEXEC_SEAL and MFD_EXEC") Signed-off-by: Aleksa Sarai Cc: Dominique Martinet Cc: Christian Brauner Cc: Daniel Verkamp Cc: Jeff Xu Cc: Kees Cook Cc: Shuah Khan Cc: Signed-off-by: Andrew Morton --- include/linux/pid_namespace.h | 16 +++--------- mm/memfd.c | 30 ++++++++-------------- tools/testing/selftests/memfd/memfd_test.c | 22 ++++++++++++---- 3 files changed, 32 insertions(+), 36 deletions(-) diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index c758809d5bcf3f..53974d79d98e8a 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -17,18 +17,10 @@ struct fs_pin; #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) -/* - * sysctl for vm.memfd_noexec - * 0: memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL - * acts like MFD_EXEC was set. - * 1: memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL - * acts like MFD_NOEXEC_SEAL was set. - * 2: memfd_create() without MFD_NOEXEC_SEAL will be - * rejected. 
- */ -#define MEMFD_NOEXEC_SCOPE_EXEC 0 -#define MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL 1 -#define MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED 2 +/* modes for vm.memfd_noexec sysctl */ +#define MEMFD_NOEXEC_SCOPE_EXEC 0 /* MFD_EXEC implied if unset */ +#define MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL 1 /* MFD_NOEXEC_SEAL implied if unset */ +#define MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED 2 /* same as 1, except MFD_EXEC rejected */ #endif struct pid_namespace { diff --git a/mm/memfd.c b/mm/memfd.c index 0bdbd2335af751..d65485c762defa 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -271,30 +271,22 @@ long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg) static int check_sysctl_memfd_noexec(unsigned int *flags) { #ifdef CONFIG_SYSCTL - char comm[TASK_COMM_LEN]; - int sysctl = MEMFD_NOEXEC_SCOPE_EXEC; - struct pid_namespace *ns; - - ns = task_active_pid_ns(current); - if (ns) - sysctl = ns->memfd_noexec_scope; + int sysctl = task_active_pid_ns(current)->memfd_noexec_scope; if (!(*flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) { - if (sysctl == MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL) + if (sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL) *flags |= MFD_NOEXEC_SEAL; else *flags |= MFD_EXEC; } - if (*flags & MFD_EXEC && sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED) { - pr_warn_once( - "memfd_create(): MFD_NOEXEC_SEAL is enforced, pid=%d '%s'\n", - task_pid_nr(current), get_task_comm(comm, current)); - + if (!(*flags & MFD_NOEXEC_SEAL) && sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED) { + pr_err_ratelimited( + "%s[%d]: memfd_create() requires MFD_NOEXEC_SEAL with vm.memfd_noexec=%d\n", + current->comm, task_pid_nr(current), sysctl); return -EACCES; } #endif - return 0; } @@ -302,7 +294,6 @@ SYSCALL_DEFINE2(memfd_create, const char __user *, uname, unsigned int, flags) { - char comm[TASK_COMM_LEN]; unsigned int *file_seals; struct file *file; int fd, error; @@ -325,12 +316,13 @@ SYSCALL_DEFINE2(memfd_create, if (!(flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) { pr_warn_once( - "memfd_create() without MFD_EXEC nor 
MFD_NOEXEC_SEAL, pid=%d '%s'\n", - task_pid_nr(current), get_task_comm(comm, current)); + "%s[%d]: memfd_create() called without MFD_EXEC or MFD_NOEXEC_SEAL set\n", + current->comm, task_pid_nr(current)); } - if (check_sysctl_memfd_noexec(&flags) < 0) - return -EACCES; + error = check_sysctl_memfd_noexec(&flags); + if (error < 0) + return error; /* length includes terminating zero */ len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 8eb49204f9eac8..8b7390ad81d11b 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -1145,11 +1145,23 @@ static void test_sysctl_child(void) printf("%s sysctl 2\n", memfd_str); sysctl_assert_write("2"); - mfd_fail_new("kern_memfd_sysctl_2", - MFD_CLOEXEC | MFD_ALLOW_SEALING); - mfd_fail_new("kern_memfd_sysctl_2_MFD_EXEC", - MFD_CLOEXEC | MFD_EXEC); - fd = mfd_assert_new("", 0, MFD_NOEXEC_SEAL); + mfd_fail_new("kern_memfd_sysctl_2_exec", + MFD_EXEC | MFD_CLOEXEC | MFD_ALLOW_SEALING); + + fd = mfd_assert_new("kern_memfd_sysctl_2_dfl", + mfd_def_size, + MFD_CLOEXEC | MFD_ALLOW_SEALING); + mfd_assert_mode(fd, 0666); + mfd_assert_has_seals(fd, F_SEAL_EXEC); + mfd_fail_chmod(fd, 0777); + close(fd); + + fd = mfd_assert_new("kern_memfd_sysctl_2_noexec_seal", + mfd_def_size, + MFD_NOEXEC_SEAL | MFD_CLOEXEC | MFD_ALLOW_SEALING); + mfd_assert_mode(fd, 0666); + mfd_assert_has_seals(fd, F_SEAL_EXEC); + mfd_fail_chmod(fd, 0777); close(fd); sysctl_fail_write("0"); From 434ed3350f57c03a9654fe0619755cc137a58935 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Mon, 14 Aug 2023 18:40:59 +1000 Subject: [PATCH 385/489] memfd: improve userspace warnings for missing exec-related flags In order to incentivise userspace to switch to passing MFD_EXEC and MFD_NOEXEC_SEAL, we need to provide a warning on each attempt to call memfd_create() without the new flags. 
pr_warn_once() is not useful because on most systems the one warning is burned up during the boot process (on my system, systemd does this within the first second of boot) and thus userspace will in practice never see the warnings to push them to switch to the new flags. The original patchset[1] used pr_warn_ratelimited(), however there were concerns about the degree of spam in the kernel log[2,3]. The resulting inability to detect every case was flagged as an issue at the time[4]. While we could come up with an alternative rate-limiting scheme such as only outputting the message if vm.memfd_noexec has been modified, or only outputting the message once for a given task, these alternatives have downsides that don't make sense given how low-stakes a single kernel warning message is. Switching to pr_info_ratelimited() instead should be fine -- it's possible some monitoring tool will be unhappy with a stream of warning-level messages but there's already plenty of info-level message spam in dmesg. 
[1]: https://lore.kernel.org/20221215001205.51969-4-jeffxu@google.com/ [2]: https://lore.kernel.org/202212161233.85C9783FB@keescook/ [3]: https://lore.kernel.org/Y5yS8wCnuYGLHMj4@x1n/ [4]: https://lore.kernel.org/f185bb42-b29c-977e-312e-3349eea15383@linuxfoundation.org/ Link: https://lkml.kernel.org/r/20230814-memfd-vm-noexec-uapi-fixes-v2-3-7ff9e3e10ba6@cyphar.com Fixes: 105ff5339f49 ("mm/memfd: add MFD_NOEXEC_SEAL and MFD_EXEC") Signed-off-by: Aleksa Sarai Cc: Christian Brauner Cc: Daniel Verkamp Cc: Dominique Martinet Cc: Kees Cook Cc: Shuah Khan Cc: Signed-off-by: Andrew Morton --- mm/memfd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memfd.c b/mm/memfd.c index d65485c762defa..aa46521057ab13 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -315,7 +315,7 @@ SYSCALL_DEFINE2(memfd_create, return -EINVAL; if (!(flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) { - pr_warn_once( + pr_info_ratelimited( "%s[%d]: memfd_create() called without MFD_EXEC or MFD_NOEXEC_SEAL set\n", current->comm, task_pid_nr(current)); } From 9876cfe8ec1cb3c88de31f4d58d57b0e7e22bcc4 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Mon, 14 Aug 2023 18:41:00 +1000 Subject: [PATCH 386/489] memfd: replace ratcheting feature from vm.memfd_noexec with hierarchy This sysctl has the very unusual behaviour of not allowing any user (even CAP_SYS_ADMIN) to reduce the restriction setting, meaning that if you were to set this sysctl to a more restrictive option in the host pidns you would need to reboot your machine in order to reset it. The justification given in [1] is that this is a security feature and thus it should not be possible to disable. Aside from the fact that we have plenty of security-related sysctls that can be disabled after being enabled (fs.protected_symlinks for instance), the protection provided by the sysctl is to stop users from being able to create a binary and then execute it. 
A user with CAP_SYS_ADMIN can trivially do this without memfd_create(2): % cat mount-memfd.c #include #include #include #include #include #include #define SHELLCODE "#!/bin/echo this file was executed from this totally private tmpfs:" int main(void) { int fsfd = fsopen("tmpfs", FSOPEN_CLOEXEC); assert(fsfd >= 0); assert(!fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 2)); int dfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0); assert(dfd >= 0); int execfd = openat(dfd, "exe", O_CREAT | O_RDWR | O_CLOEXEC, 0782); assert(execfd >= 0); assert(write(execfd, SHELLCODE, strlen(SHELLCODE)) == strlen(SHELLCODE)); assert(!close(execfd)); char *execpath = NULL; char *argv[] = { "bad-exe", NULL }, *envp[] = { NULL }; execfd = openat(dfd, "exe", O_PATH | O_CLOEXEC); assert(execfd >= 0); assert(asprintf(&execpath, "/proc/self/fd/%d", execfd) > 0); assert(!execve(execpath, argv, envp)); } % ./mount-memfd this file was executed from this totally private tmpfs: /proc/self/fd/5 % Given that it is possible for CAP_SYS_ADMIN users to create executable binaries without memfd_create(2) and without touching the host filesystem (not to mention the many other things a CAP_SYS_ADMIN process would be able to do that would be equivalent or worse), it seems strange to cause a fair amount of headache to admins when there doesn't appear to be an actual security benefit to blocking this. There appear to be concerns about confused-deputy-esque attacks[2] but a confused deputy that can write to arbitrary sysctls is a bigger security issue than executable memfds. /* New API */ The primary requirement from the original author appears to be more based on the need to be able to restrict an entire system in a hierarchical manner[3], such that child namespaces cannot re-enable executable memfds. So, implement that behaviour explicitly -- the vm.memfd_noexec scope is evaluated up the pidns tree to &init_pid_ns and you have the most restrictive value applied to you. 
The new lower limit you can set vm.memfd_noexec is whatever limit applies to your parent. Note that a pidns will inherit a copy of the parent pidns's effective vm.memfd_noexec setting at unshare() time. This matches the existing behaviour, and it also ensures that a pidns will never have its vm.memfd_noexec setting *lowered* behind its back (but it will be raised if the parent raises theirs). /* Backwards Compatibility */ As the previous version of the sysctl didn't allow you to lower the setting at all, there are no backwards compatibility issues with this aspect of the change. However it should be noted that the setting is now completely hierarchical. Previously, a cloned pidns would just copy the current pidns setting, meaning that if the parent's vm.memfd_noexec was changed it wouldn't propagate to existing pid namespaces. Now, the restriction applies recursively. This is a uAPI change, however: * The sysctl is very new, having been merged in 6.3. * Several aspects of the sysctl were broken up until this patchset and the other patchset by Jeff Xu last month. And thus it seems incredibly unlikely that any real users would run into this issue. In the worst case, if this causes userspace issues we could make it so that modifying the setting follows the hierarchical rules but the restriction checking uses the cached copy.
[1]: https://lore.kernel.org/CABi2SkWnAgHK1i6iqSqPMYuNEhtHBkO8jUuCvmG3RmUB5TKHJw@mail.gmail.com/ [2]: https://lore.kernel.org/CALmYWFs_dNCzw_pW1yRAo4bGCPEtykroEQaowNULp7svwMLjOg@mail.gmail.com/ [3]: https://lore.kernel.org/CALmYWFuahdUF7cT4cm7_TGLqPanuHXJ-hVSfZt7vpTnc18DPrw@mail.gmail.com/ Link: https://lkml.kernel.org/r/20230814-memfd-vm-noexec-uapi-fixes-v2-4-7ff9e3e10ba6@cyphar.com Fixes: 105ff5339f49 ("mm/memfd: add MFD_NOEXEC_SEAL and MFD_EXEC") Signed-off-by: Aleksa Sarai Cc: Dominique Martinet Cc: Christian Brauner Cc: Daniel Verkamp Cc: Jeff Xu Cc: Kees Cook Cc: Shuah Khan Cc: Signed-off-by: Andrew Morton --- include/linux/pid_namespace.h | 23 ++++++++++++++++++++++- kernel/pid.c | 3 +++ kernel/pid_namespace.c | 6 +++--- kernel/pid_sysctl.h | 28 ++++++++++++---------------- mm/memfd.c | 3 ++- 5 files changed, 42 insertions(+), 21 deletions(-) diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 53974d79d98e8a..f9f9931e02d6ad 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -39,7 +39,6 @@ struct pid_namespace { int reboot; /* group exit code if this pidns was rebooted */ struct ns_common ns; #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) - /* sysctl for vm.memfd_noexec */ int memfd_noexec_scope; #endif } __randomize_layout; @@ -56,6 +55,23 @@ static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) return ns; } +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) +static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) +{ + int scope = MEMFD_NOEXEC_SCOPE_EXEC; + + for (; ns; ns = ns->parent) + scope = max(scope, READ_ONCE(ns->memfd_noexec_scope)); + + return scope; +} +#else +static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) +{ + return 0; +} +#endif + extern struct pid_namespace *copy_pid_ns(unsigned long flags, struct user_namespace *user_ns, struct pid_namespace *ns); extern void zap_pid_ns_processes(struct pid_namespace 
*pid_ns); @@ -70,6 +86,11 @@ static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) return ns; } +static inline int pidns_memfd_noexec_scope(struct pid_namespace *ns) +{ + return 0; +} + static inline struct pid_namespace *copy_pid_ns(unsigned long flags, struct user_namespace *user_ns, struct pid_namespace *ns) { diff --git a/kernel/pid.c b/kernel/pid.c index 6a1d23a11026c0..fee14a4486a310 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -83,6 +83,9 @@ struct pid_namespace init_pid_ns = { #ifdef CONFIG_PID_NS .ns.ops = &pidns_operations, #endif +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) + .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC, +#endif }; EXPORT_SYMBOL_GPL(init_pid_ns); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 0bf44afe04dd18..619972c78774f7 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -110,9 +110,9 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; ns->pid_allocated = PIDNS_ADDING; - - initialize_memfd_noexec_scope(ns); - +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) + ns->memfd_noexec_scope = pidns_memfd_noexec_scope(parent_pid_ns); +#endif return ns; out_free_idr: diff --git a/kernel/pid_sysctl.h b/kernel/pid_sysctl.h index b26e027fc9cd4e..2ee41a3a1dfdee 100644 --- a/kernel/pid_sysctl.h +++ b/kernel/pid_sysctl.h @@ -5,33 +5,30 @@ #include #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) -static inline void initialize_memfd_noexec_scope(struct pid_namespace *ns) -{ - ns->memfd_noexec_scope = - task_active_pid_ns(current)->memfd_noexec_scope; -} - static int pid_mfd_noexec_dointvec_minmax(struct ctl_table *table, int write, void *buf, size_t *lenp, loff_t *ppos) { struct pid_namespace *ns = task_active_pid_ns(current); struct ctl_table table_copy; + int err, scope, parent_scope; if (write && !ns_capable(ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; table_copy = 
*table; - if (ns != &init_pid_ns) - table_copy.data = &ns->memfd_noexec_scope; - /* - * set minimum to current value, the effect is only bigger - * value is accepted. - */ - if (*(int *)table_copy.data > *(int *)table_copy.extra1) - table_copy.extra1 = table_copy.data; + /* You cannot set a lower enforcement value than your parent. */ + parent_scope = pidns_memfd_noexec_scope(ns->parent); + /* Equivalent to pidns_memfd_noexec_scope(ns). */ + scope = max(READ_ONCE(ns->memfd_noexec_scope), parent_scope); + + table_copy.data = &scope; + table_copy.extra1 = &parent_scope; - return proc_dointvec_minmax(&table_copy, write, buf, lenp, ppos); + err = proc_dointvec_minmax(&table_copy, write, buf, lenp, ppos); + if (!err && write) + WRITE_ONCE(ns->memfd_noexec_scope, scope); + return err; } static struct ctl_table pid_ns_ctl_table_vm[] = { @@ -51,7 +48,6 @@ static inline void register_pid_ns_sysctl_table_vm(void) register_sysctl("vm", pid_ns_ctl_table_vm); } #else -static inline void initialize_memfd_noexec_scope(struct pid_namespace *ns) {} static inline void register_pid_ns_sysctl_table_vm(void) {} #endif diff --git a/mm/memfd.c b/mm/memfd.c index aa46521057ab13..1cad1904fc26b0 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -271,7 +271,8 @@ long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg) static int check_sysctl_memfd_noexec(unsigned int *flags) { #ifdef CONFIG_SYSCTL - int sysctl = task_active_pid_ns(current)->memfd_noexec_scope; + struct pid_namespace *ns = task_active_pid_ns(current); + int sysctl = pidns_memfd_noexec_scope(ns); if (!(*flags & (MFD_EXEC | MFD_NOEXEC_SEAL))) { if (sysctl >= MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL) From 6469b66e3f5a38214bf1d1220d54d78d9cd08ebd Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Mon, 14 Aug 2023 18:41:01 +1000 Subject: [PATCH 387/489] selftests: improve vm.memfd_noexec sysctl tests This adds proper tests for the nesting functionality of vm.memfd_noexec as well as some minor cleanups to spawn_*_thread(). 
Link: https://lkml.kernel.org/r/20230814-memfd-vm-noexec-uapi-fixes-v2-5-7ff9e3e10ba6@cyphar.com Signed-off-by: Aleksa Sarai Cc: Christian Brauner Cc: Daniel Verkamp Cc: Dominique Martinet Cc: Jeff Xu Cc: Kees Cook Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/memfd/memfd_test.c | 339 +++++++++++++++------ 1 file changed, 254 insertions(+), 85 deletions(-) diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 8b7390ad81d11b..3df00867723910 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "common.h" @@ -43,7 +44,6 @@ */ static size_t mfd_def_size = MFD_DEF_SIZE; static const char *memfd_str = MEMFD_STR; -static pid_t spawn_newpid_thread(unsigned int flags, int (*fn)(void *)); static int newpid_thread_fn2(void *arg); static void join_newpid_thread(pid_t pid); @@ -96,12 +96,12 @@ static void sysctl_assert_write(const char *val) int fd = open("/proc/sys/vm/memfd_noexec", O_WRONLY | O_CLOEXEC); if (fd < 0) { - printf("open sysctl failed\n"); + printf("open sysctl failed: %m\n"); abort(); } if (write(fd, val, strlen(val)) < 0) { - printf("write sysctl failed\n"); + printf("write sysctl %s failed: %m\n", val); abort(); } } @@ -111,7 +111,7 @@ static void sysctl_fail_write(const char *val) int fd = open("/proc/sys/vm/memfd_noexec", O_WRONLY | O_CLOEXEC); if (fd < 0) { - printf("open sysctl failed\n"); + printf("open sysctl failed: %m\n"); abort(); } @@ -122,6 +122,33 @@ static void sysctl_fail_write(const char *val) } } +static void sysctl_assert_equal(const char *val) +{ + char *p, buf[128] = {}; + int fd = open("/proc/sys/vm/memfd_noexec", O_RDONLY | O_CLOEXEC); + + if (fd < 0) { + printf("open sysctl failed: %m\n"); + abort(); + } + + if (read(fd, buf, sizeof(buf)) < 0) { + printf("read sysctl failed: %m\n"); + abort(); + } + + /* Strip trailing whitespace. 
*/ + p = buf; + while (!isspace(*p)) + p++; + *p = '\0'; + + if (strcmp(buf, val) != 0) { + printf("unexpected sysctl value: expected %s, got %s\n", val, buf); + abort(); + } +} + static int mfd_assert_reopen_fd(int fd_in) { int fd; @@ -736,7 +763,7 @@ static int idle_thread_fn(void *arg) return 0; } -static pid_t spawn_idle_thread(unsigned int flags) +static pid_t spawn_thread(unsigned int flags, int (*fn)(void *), void *arg) { uint8_t *stack; pid_t pid; @@ -747,10 +774,7 @@ static pid_t spawn_idle_thread(unsigned int flags) abort(); } - pid = clone(idle_thread_fn, - stack + STACK_SIZE, - SIGCHLD | flags, - NULL); + pid = clone(fn, stack + STACK_SIZE, SIGCHLD | flags, arg); if (pid < 0) { printf("clone() failed: %m\n"); abort(); @@ -759,6 +783,33 @@ static pid_t spawn_idle_thread(unsigned int flags) return pid; } +static void join_thread(pid_t pid) +{ + int wstatus; + + if (waitpid(pid, &wstatus, 0) < 0) { + printf("newpid thread: waitpid() failed: %m\n"); + abort(); + } + + if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) != 0) { + printf("newpid thread: exited with non-zero error code %d\n", + WEXITSTATUS(wstatus)); + abort(); + } + + if (WIFSIGNALED(wstatus)) { + printf("newpid thread: killed by signal %d\n", + WTERMSIG(wstatus)); + abort(); + } +} + +static pid_t spawn_idle_thread(unsigned int flags) +{ + return spawn_thread(flags, idle_thread_fn, NULL); +} + static void join_idle_thread(pid_t pid) { kill(pid, SIGTERM); @@ -1111,42 +1162,69 @@ static void test_noexec_seal(void) close(fd); } -static void test_sysctl_child(void) +static void test_sysctl_sysctl0(void) { int fd; - int pid; - printf("%s sysctl 0\n", memfd_str); - sysctl_assert_write("0"); - fd = mfd_assert_new("kern_memfd_sysctl_0", + sysctl_assert_equal("0"); + + fd = mfd_assert_new("kern_memfd_sysctl_0_dfl", mfd_def_size, MFD_CLOEXEC | MFD_ALLOW_SEALING); - mfd_assert_mode(fd, 0777); mfd_assert_has_seals(fd, 0); mfd_assert_chmod(fd, 0644); close(fd); +} - printf("%s sysctl 1\n", memfd_str); - 
sysctl_assert_write("1"); - fd = mfd_assert_new("kern_memfd_sysctl_1", +static void test_sysctl_set_sysctl0(void) +{ + sysctl_assert_write("0"); + test_sysctl_sysctl0(); +} + +static void test_sysctl_sysctl1(void) +{ + int fd; + + sysctl_assert_equal("1"); + + fd = mfd_assert_new("kern_memfd_sysctl_1_dfl", mfd_def_size, MFD_CLOEXEC | MFD_ALLOW_SEALING); + mfd_assert_mode(fd, 0666); + mfd_assert_has_seals(fd, F_SEAL_EXEC); + mfd_fail_chmod(fd, 0777); + close(fd); - printf("%s child ns\n", memfd_str); - pid = spawn_newpid_thread(CLONE_NEWPID, newpid_thread_fn2); - join_newpid_thread(pid); + fd = mfd_assert_new("kern_memfd_sysctl_1_exec", + mfd_def_size, + MFD_CLOEXEC | MFD_EXEC | MFD_ALLOW_SEALING); + mfd_assert_mode(fd, 0777); + mfd_assert_has_seals(fd, 0); + mfd_assert_chmod(fd, 0644); + close(fd); + fd = mfd_assert_new("kern_memfd_sysctl_1_noexec", + mfd_def_size, + MFD_CLOEXEC | MFD_NOEXEC_SEAL | MFD_ALLOW_SEALING); mfd_assert_mode(fd, 0666); mfd_assert_has_seals(fd, F_SEAL_EXEC); mfd_fail_chmod(fd, 0777); - sysctl_fail_write("0"); close(fd); +} - printf("%s sysctl 2\n", memfd_str); - sysctl_assert_write("2"); - mfd_fail_new("kern_memfd_sysctl_2_exec", - MFD_EXEC | MFD_CLOEXEC | MFD_ALLOW_SEALING); +static void test_sysctl_set_sysctl1(void) +{ + sysctl_assert_write("1"); + test_sysctl_sysctl1(); +} + +static void test_sysctl_sysctl2(void) +{ + int fd; + + sysctl_assert_equal("2"); fd = mfd_assert_new("kern_memfd_sysctl_2_dfl", mfd_def_size, @@ -1156,98 +1234,188 @@ static void test_sysctl_child(void) mfd_fail_chmod(fd, 0777); close(fd); - fd = mfd_assert_new("kern_memfd_sysctl_2_noexec_seal", + mfd_fail_new("kern_memfd_sysctl_2_exec", + MFD_CLOEXEC | MFD_EXEC | MFD_ALLOW_SEALING); + + fd = mfd_assert_new("kern_memfd_sysctl_2_noexec", mfd_def_size, - MFD_NOEXEC_SEAL | MFD_CLOEXEC | MFD_ALLOW_SEALING); + MFD_CLOEXEC | MFD_NOEXEC_SEAL | MFD_ALLOW_SEALING); mfd_assert_mode(fd, 0666); mfd_assert_has_seals(fd, F_SEAL_EXEC); mfd_fail_chmod(fd, 0777); close(fd); - - 
sysctl_fail_write("0"); - sysctl_fail_write("1"); } -static int newpid_thread_fn(void *arg) +static void test_sysctl_set_sysctl2(void) { - test_sysctl_child(); - return 0; + sysctl_assert_write("2"); + test_sysctl_sysctl2(); } -static void test_sysctl_child2(void) +static int sysctl_simple_child(void *arg) { int fd; + int pid; - sysctl_fail_write("0"); - fd = mfd_assert_new("kern_memfd_sysctl_1", - mfd_def_size, - MFD_CLOEXEC | MFD_ALLOW_SEALING); + printf("%s sysctl 0\n", memfd_str); + test_sysctl_set_sysctl0(); - mfd_assert_mode(fd, 0666); - mfd_assert_has_seals(fd, F_SEAL_EXEC); - mfd_fail_chmod(fd, 0777); - close(fd); + printf("%s sysctl 1\n", memfd_str); + test_sysctl_set_sysctl1(); + + printf("%s sysctl 0\n", memfd_str); + test_sysctl_set_sysctl0(); + + printf("%s sysctl 2\n", memfd_str); + test_sysctl_set_sysctl2(); + + printf("%s sysctl 1\n", memfd_str); + test_sysctl_set_sysctl1(); + + printf("%s sysctl 0\n", memfd_str); + test_sysctl_set_sysctl0(); + + return 0; +} + +/* + * Test sysctl + * A very basic test to make sure the core sysctl semantics work. + */ +static void test_sysctl_simple(void) +{ + int pid = spawn_thread(CLONE_NEWPID, sysctl_simple_child, NULL); + + join_thread(pid); } -static int newpid_thread_fn2(void *arg) +static int sysctl_nested(void *arg) { - test_sysctl_child2(); + void (*fn)(void) = arg; + + fn(); return 0; } -static pid_t spawn_newpid_thread(unsigned int flags, int (*fn)(void *)) + +static int sysctl_nested_wait(void *arg) { - uint8_t *stack; - pid_t pid; + /* Wait for a SIGCONT. 
*/ + kill(getpid(), SIGSTOP); + return sysctl_nested(arg); +} - stack = malloc(STACK_SIZE); - if (!stack) { - printf("malloc(STACK_SIZE) failed: %m\n"); - abort(); - } +static void test_sysctl_sysctl1_failset(void) +{ + sysctl_fail_write("0"); + test_sysctl_sysctl1(); +} - pid = clone(fn, - stack + STACK_SIZE, - SIGCHLD | flags, - NULL); - if (pid < 0) { - printf("clone() failed: %m\n"); - abort(); - } +static void test_sysctl_sysctl2_failset(void) +{ + sysctl_fail_write("1"); + test_sysctl_sysctl2(); - return pid; + sysctl_fail_write("0"); + test_sysctl_sysctl2(); } -static void join_newpid_thread(pid_t pid) +static int sysctl_nested_child(void *arg) { - int wstatus; + int fd; + int pid; - if (waitpid(pid, &wstatus, 0) < 0) { - printf("newpid thread: waitpid() failed: %m\n"); - abort(); - } + printf("%s nested sysctl 0\n", memfd_str); + sysctl_assert_write("0"); + /* A further nested pidns works the same. */ + pid = spawn_thread(CLONE_NEWPID, sysctl_simple_child, NULL); + join_thread(pid); - if (WIFEXITED(wstatus) && WEXITSTATUS(wstatus) != 0) { - printf("newpid thread: exited with non-zero error code %d\n", - WEXITSTATUS(wstatus)); - abort(); - } + printf("%s nested sysctl 1\n", memfd_str); + sysctl_assert_write("1"); + /* Child inherits our setting. */ + pid = spawn_thread(CLONE_NEWPID, sysctl_nested, test_sysctl_sysctl1); + join_thread(pid); + /* Child cannot raise the setting. */ + pid = spawn_thread(CLONE_NEWPID, sysctl_nested, + test_sysctl_sysctl1_failset); + join_thread(pid); + /* Child can lower the setting. */ + pid = spawn_thread(CLONE_NEWPID, sysctl_nested, + test_sysctl_set_sysctl2); + join_thread(pid); + /* Child lowering the setting has no effect on our setting. */ + test_sysctl_sysctl1(); + + printf("%s nested sysctl 2\n", memfd_str); + sysctl_assert_write("2"); + /* Child inherits our setting. */ + pid = spawn_thread(CLONE_NEWPID, sysctl_nested, test_sysctl_sysctl2); + join_thread(pid); + /* Child cannot raise the setting. 
*/ + pid = spawn_thread(CLONE_NEWPID, sysctl_nested, + test_sysctl_sysctl2_failset); + join_thread(pid); + + /* Verify that the rules are actually inherited after fork. */ + printf("%s nested sysctl 0 -> 1 after fork\n", memfd_str); + sysctl_assert_write("0"); - if (WIFSIGNALED(wstatus)) { - printf("newpid thread: killed by signal %d\n", - WTERMSIG(wstatus)); - abort(); - } + pid = spawn_thread(CLONE_NEWPID, sysctl_nested_wait, + test_sysctl_sysctl1_failset); + sysctl_assert_write("1"); + kill(pid, SIGCONT); + join_thread(pid); + + printf("%s nested sysctl 0 -> 2 after fork\n", memfd_str); + sysctl_assert_write("0"); + + pid = spawn_thread(CLONE_NEWPID, sysctl_nested_wait, + test_sysctl_sysctl2_failset); + sysctl_assert_write("2"); + kill(pid, SIGCONT); + join_thread(pid); + + /* + * Verify that the current effective setting is saved on fork, meaning + * that the parent lowering the sysctl doesn't affect already-forked + * children. + */ + printf("%s nested sysctl 2 -> 1 after fork\n", memfd_str); + sysctl_assert_write("2"); + pid = spawn_thread(CLONE_NEWPID, sysctl_nested_wait, + test_sysctl_sysctl2); + sysctl_assert_write("1"); + kill(pid, SIGCONT); + join_thread(pid); + + printf("%s nested sysctl 2 -> 0 after fork\n", memfd_str); + sysctl_assert_write("2"); + pid = spawn_thread(CLONE_NEWPID, sysctl_nested_wait, + test_sysctl_sysctl2); + sysctl_assert_write("0"); + kill(pid, SIGCONT); + join_thread(pid); + + printf("%s nested sysctl 1 -> 0 after fork\n", memfd_str); + sysctl_assert_write("1"); + pid = spawn_thread(CLONE_NEWPID, sysctl_nested_wait, + test_sysctl_sysctl1); + sysctl_assert_write("0"); + kill(pid, SIGCONT); + join_thread(pid); + + return 0; } /* - * Test sysctl - * A very basic sealing test to see whether setting/retrieving seals works. + * Test sysctl with nested pid namespaces + * Make sure that the sysctl nesting semantics work correctly. 
*/ -static void test_sysctl(void) +static void test_sysctl_nested(void) { - int pid = spawn_newpid_thread(CLONE_NEWPID, newpid_thread_fn); + int pid = spawn_thread(CLONE_NEWPID, sysctl_nested_child, NULL); - join_newpid_thread(pid); + join_thread(pid); } /* @@ -1433,6 +1601,9 @@ int main(int argc, char **argv) test_seal_grow(); test_seal_resize(); + test_sysctl_simple(); + test_sysctl_nested(); + test_share_dup("SHARE-DUP", ""); test_share_mmap("SHARE-MMAP", ""); test_share_open("SHARE-OPEN", ""); @@ -1447,8 +1618,6 @@ int main(int argc, char **argv) test_share_fork("SHARE-FORK", SHARED_FT_STR); join_idle_thread(pid); - test_sysctl(); - printf("memfd: DONE\n"); return 0; From 889690bcbccb457d85cfd16ce0d8259ecdb10ce4 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 13 Aug 2023 21:52:38 -0700 Subject: [PATCH 388/489] arm: include asm/cacheflush.h in asm/hugetlb.h Patch series "arch: include asm/cacheflush.h in asm/hugetlb.h". Three architectures are using PG_dcache_clean in their asm/hugetlb.h, but relying on accident to include the asm/cacheflush.h which defines it. This patch (of 3): PG_dcache_clean is used in asm/hugetlb.h but defined in asm/cacheflush.h: builds rely on an accident of that being included via linux/mempolicy.h, but better include it directly (like arch/sh/include/asm/hugetlb.h does). 
Link: https://lkml.kernel.org/r/6d2acfa4-7f44-d3b4-b0a8-5495c5985e4c@google.com Link: https://lkml.kernel.org/r/4b055d0-7b2e-72bf-9b9d-8f3f1cd312d0@google.com Signed-off-by: Hugh Dickins Cc: Catalin Marinas Cc: Mike Kravetz Cc: Palmer Dabbelt Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm/include/asm/hugetlb.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/include/asm/hugetlb.h b/arch/arm/include/asm/hugetlb.h index d02d6ca88e926a..a3a82b7158d4cb 100644 --- a/arch/arm/include/asm/hugetlb.h +++ b/arch/arm/include/asm/hugetlb.h @@ -10,6 +10,7 @@ #ifndef _ASM_ARM_HUGETLB_H #define _ASM_ARM_HUGETLB_H +#include #include #include #include From 1de8c835a936859f01a91174474a8b96d61b0400 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 13 Aug 2023 21:53:17 -0700 Subject: [PATCH 389/489] arm64: include asm/cacheflush.h in asm/hugetlb.h PG_dcache_clean is used in asm/hugetlb.h but defined in asm/cacheflush.h: builds rely on an accident of that being included via linux/mempolicy.h, but better include it directly (like arch/sh/include/asm/hugetlb.h does). 
Link: https://lkml.kernel.org/r/bd77cc1b-e83b-f276-9e27-c19e7c9119aa@google.com Signed-off-by: Hugh Dickins Cc: Catalin Marinas Cc: Mike Kravetz Cc: Palmer Dabbelt Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/include/asm/hugetlb.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index a91d6219aa7883..f43a38ac17799d 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -10,6 +10,7 @@ #ifndef __ASM_HUGETLB_H #define __ASM_HUGETLB_H +#include #include #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION From 33a9fb09836ace6cb7bd48cf30ab22d56285560c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 13 Aug 2023 21:53:57 -0700 Subject: [PATCH 390/489] riscv: include asm/cacheflush.h in asm/hugetlb.h PG_dcache_clean is used in asm/hugetlb.h but defined in asm/cacheflush.h: builds rely on an accident of that being included via linux/mempolicy.h, but better include it directly (like arch/sh/include/asm/hugetlb.h does). Link: https://lkml.kernel.org/r/84bd3b96-8dbe-51b1-d7d1-6e4f9d8937d8@google.com Signed-off-by: Hugh Dickins Cc: Catalin Marinas Cc: Mike Kravetz Cc: Palmer Dabbelt Cc: Russell King Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/riscv/include/asm/hugetlb.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h index ce1ebda1a49a79..34e24f078cc1b3 100644 --- a/arch/riscv/include/asm/hugetlb.h +++ b/arch/riscv/include/asm/hugetlb.h @@ -2,6 +2,7 @@ #ifndef _ASM_RISCV_HUGETLB_H #define _ASM_RISCV_HUGETLB_H +#include #include static inline void arch_clear_hugepage_flags(struct page *page) From 8dbbc49345a7452ee59783ed5f5637b5d59532ba Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Aug 2023 13:00:18 -0700 Subject: [PATCH 391/489] mm,thp: no space after colon in Mem-Info fields Patch series "mm,thp: fix sloppy text output". 
Three independent trivial patches, fixing sloppy text output which has annoyed me; but might risk surprising a parser, so any can be dropped. This patch (of 3): The SysRq-m or OOM Mem-Info dmesg showed (long lines containing) ... shmem:NkB shmem_thp: NkB shmem_pmdmapped: NkB anon_thp: NkB ... Delete the space after the colon after shmem_thp, shmem_pmdmapped, anon_thp: as the shmem example shows, no other fields have a space after the colon in this output. Link: https://lkml.kernel.org/r/dc264fd6-40bb-6510-db36-9340a5f01d94@google.com Link: https://lkml.kernel.org/r/c1edd7da-5493-c542-6feb-92452b4dab3b@google.com Signed-off-by: Hugh Dickins Reviewed-by: David Hildenbrand Cc: Alexey Dobriyan Signed-off-by: Andrew Morton --- mm/show_mem.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/show_mem.c b/mm/show_mem.c index 09c7d036d49ecb..4b888b18bddea9 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -251,9 +251,9 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z " writeback:%lukB" " shmem:%lukB" #ifdef CONFIG_TRANSPARENT_HUGEPAGE - " shmem_thp: %lukB" - " shmem_pmdmapped: %lukB" - " anon_thp: %lukB" + " shmem_thp:%lukB" + " shmem_pmdmapped:%lukB" + " anon_thp:%lukB" #endif " writeback_tmp:%lukB" " kernel_stack:%lukB" From 4b5b7850c9282f9c7e646ec140b84b2d2f0aeeb8 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Aug 2023 13:01:12 -0700 Subject: [PATCH 392/489] mm,thp: fix nodeN/meminfo output alignment Add one more space to FileHugePages and FilePmdMapped, so the output is aligned with other rows in /sys/devices/system/node/nodeN/meminfo. 
Link: https://lkml.kernel.org/r/be861b50-a790-e041-bcb0-2a987dcfd1a@google.com Signed-off-by: Hugh Dickins Reviewed-by: David Hildenbrand Cc: Alexey Dobriyan Signed-off-by: Andrew Morton --- drivers/base/node.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index 9de524e563074e..8e871ba9162f52 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -446,8 +446,8 @@ static ssize_t node_read_meminfo(struct device *dev, "Node %d AnonHugePages: %8lu kB\n" "Node %d ShmemHugePages: %8lu kB\n" "Node %d ShmemPmdMapped: %8lu kB\n" - "Node %d FileHugePages: %8lu kB\n" - "Node %d FilePmdMapped: %8lu kB\n" + "Node %d FileHugePages: %8lu kB\n" + "Node %d FilePmdMapped: %8lu kB\n" #endif #ifdef CONFIG_UNACCEPTED_MEMORY "Node %d Unaccepted: %8lu kB\n" From daa60ae64c6587e2be2cdf02bca21b59551ee0f6 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 14 Aug 2023 13:02:08 -0700 Subject: [PATCH 393/489] mm,thp: fix smaps THPeligible output alignment Extract from current /proc/self/smaps output: Swap: 0 kB SwapPss: 0 kB Locked: 0 kB THPeligible: 0 ProtectionKey: 0 That's not the alignment shown in Documentation/filesystems/proc.rst: it's an ugly artifact from missing out the %8 other fields are using; but there's even one selftest which expects it to look that way. Hoping no other smaps parsers depend on THPeligible to look so ugly, fix these. 
Link: https://lkml.kernel.org/r/cfb81f7a-f448-5bc2-b0e1-8136fcd1dd8c@google.com Signed-off-by: Hugh Dickins Cc: Alexey Dobriyan Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 2 +- tools/testing/selftests/proc/proc-empty-vm.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index bf25178ae66a93..74cbb00422eab4 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -854,7 +854,7 @@ static int show_smap(struct seq_file *m, void *v) __show_smap(m, &mss, false); - seq_printf(m, "THPeligible: %d\n", + seq_printf(m, "THPeligible: %8u\n", hugepage_vma_check(vma, vma->vm_flags, true, false, true)); if (arch_pkeys_enabled()) diff --git a/tools/testing/selftests/proc/proc-empty-vm.c b/tools/testing/selftests/proc/proc-empty-vm.c index 7588428b8fcd7d..e1052443d070ed 100644 --- a/tools/testing/selftests/proc/proc-empty-vm.c +++ b/tools/testing/selftests/proc/proc-empty-vm.c @@ -77,7 +77,7 @@ static const char proc_pid_smaps_vsyscall_1[] = "Swap: 0 kB\n" "SwapPss: 0 kB\n" "Locked: 0 kB\n" -"THPeligible: 0\n" +"THPeligible: 0\n" /* * "ProtectionKey:" field is conditional. It is possible to check it as well, * but I don't have such machine. @@ -107,7 +107,7 @@ static const char proc_pid_smaps_vsyscall_2[] = "Swap: 0 kB\n" "SwapPss: 0 kB\n" "Locked: 0 kB\n" -"THPeligible: 0\n" +"THPeligible: 0\n" /* * "ProtectionKey:" field is conditional. It is possible to check it as well, * but I'm too tired. From 1b6754fea43cebd72d969618219347c5ec01eb8d Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Sat, 12 Aug 2023 11:01:28 +0000 Subject: [PATCH 394/489] writeback: remove unused declaration of bdi_async_bio_wq It seems it was introduced by commit d3f77dfdc718 ("blkcg: implement REQ_CGROUP_PUNT") unintentionally, but the definition does not exist, remove it.
Link: https://lkml.kernel.org/r/20230812110128.482650-1-xiujianfeng@huaweicloud.com Signed-off-by: Xiu Jianfeng Acked-by: Tejun Heo Cc: Stefan Roesch Signed-off-by: Andrew Morton --- include/linux/backing-dev.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index fbad4fcd408e2b..1a97277f99b1b8 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -46,7 +46,6 @@ extern spinlock_t bdi_lock; extern struct list_head bdi_list; extern struct workqueue_struct *bdi_wq; -extern struct workqueue_struct *bdi_async_bio_wq; static inline bool wb_has_dirty_io(struct bdi_writeback *wb) { From 7e2fca52ef918e5c983391f984ed5c98b0dea6a1 Mon Sep 17 00:00:00 2001 From: ZhangPeng Date: Sat, 12 Aug 2023 14:26:12 +0800 Subject: [PATCH 395/489] mm/secretmem: use a folio in secretmem_fault() Saves four implicit calls to compound_head(). Link: https://lkml.kernel.org/r/20230812062612.3184990-1-zhangpeng362@huawei.com Signed-off-by: ZhangPeng Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Cc: Kefeng Wang Cc: Nanyong Sun Signed-off-by: Andrew Morton --- mm/secretmem.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/mm/secretmem.c b/mm/secretmem.c index 86442a15d12f2c..3afb5ad701e14a 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -55,6 +55,7 @@ static vm_fault_t secretmem_fault(struct vm_fault *vmf) gfp_t gfp = vmf->gfp_mask; unsigned long addr; struct page *page; + struct folio *folio; vm_fault_t ret; int err; @@ -66,23 +67,24 @@ static vm_fault_t secretmem_fault(struct vm_fault *vmf) retry: page = find_lock_page(mapping, offset); if (!page) { - page = alloc_page(gfp | __GFP_ZERO); - if (!page) { + folio = folio_alloc(gfp | __GFP_ZERO, 0); + if (!folio) { ret = VM_FAULT_OOM; goto out; } + page = &folio->page; err = set_direct_map_invalid_noflush(page); if (err) { - put_page(page); + folio_put(folio); ret = vmf_error(err); goto out; } -
__SetPageUptodate(page); - err = add_to_page_cache_lru(page, mapping, offset, gfp); + __folio_mark_uptodate(folio); + err = filemap_add_folio(mapping, folio, offset, gfp); if (unlikely(err)) { - put_page(page); + folio_put(folio); /* * If a split of large page was required, it * already happened when we marked the page invalid From 0790e1e2b1b71ba357e89e779451efe79dff28e6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 12 Aug 2023 01:20:33 +0100 Subject: [PATCH 396/489] mm: allow fault_dirty_shared_page() to be called under the VMA lock By making maybe_unlock_mmap_for_io() handle the VMA lock correctly, we make fault_dirty_shared_page() safe to be called without the mmap lock held. Link: https://lkml.kernel.org/r/20230812002033.1002367-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reported-by: David Hildenbrand Tested-by: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/internal.h b/mm/internal.h index a037b1b37f6d78..c6ed10f0a5ad4a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -706,7 +706,7 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, if (fault_flag_allow_retry_first(flags) && !(flags & FAULT_FLAG_RETRY_NOWAIT)) { fpin = get_file(vmf->vma->vm_file); - mmap_read_unlock(vmf->vma->vm_mm); + release_fault_lock(vmf); } return fpin; } From b348b5fe2b5f14ac8bb64fe271d7a027db8cc674 Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Fri, 11 Aug 2023 12:36:55 -0700 Subject: [PATCH 397/489] mm/ksm: add pages scanned metric ksm currently maintains several statistics, which let you determine how successful KSM is at sharing pages. However it does not contain a metric to determine how much work it does. This commit adds the pages scanned metric. This allows the administrator to determine how many pages have been scanned over a period of time. 
Link: https://lkml.kernel.org/r/20230811193655.2518943-1-shr@devkernel.io Signed-off-by: Stefan Roesch Acked-by: David Hildenbrand Cc: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/ksm.rst | 2 ++ mm/ksm.c | 16 +++++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/mm/ksm.rst b/Documentation/admin-guide/mm/ksm.rst index 5c5be7bd84b817..776f244bdae4ba 100644 --- a/Documentation/admin-guide/mm/ksm.rst +++ b/Documentation/admin-guide/mm/ksm.rst @@ -159,6 +159,8 @@ The effectiveness of KSM and MADV_MERGEABLE is shown in ``/sys/kernel/mm/ksm/``: general_profit how effective is KSM. The calculation is explained below. +pages_scanned + how many pages are being scanned for ksm pages_shared how many shared pages are being used pages_sharing diff --git a/mm/ksm.c b/mm/ksm.c index 97a9627116fa34..2653099539f273 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -242,6 +242,9 @@ static struct kmem_cache *rmap_item_cache; static struct kmem_cache *stable_node_cache; static struct kmem_cache *mm_slot_cache; +/* The number of pages scanned */ +static unsigned long ksm_pages_scanned; + /* The number of nodes in the stable tree */ static unsigned long ksm_pages_shared; @@ -2476,8 +2479,9 @@ static void ksm_do_scan(unsigned int scan_npages) { struct ksm_rmap_item *rmap_item; struct page *page; + unsigned int npages = scan_npages; - while (scan_npages-- && likely(!freezing(current))) { + while (npages-- && likely(!freezing(current))) { cond_resched(); rmap_item = scan_get_next_rmap_item(&page); if (!rmap_item) @@ -2485,6 +2489,8 @@ static void ksm_do_scan(unsigned int scan_npages) cmp_and_merge_page(page, rmap_item); put_page(page); } + + ksm_pages_scanned += scan_npages - npages; } static int ksmd_should_run(void) @@ -3323,6 +3329,13 @@ static ssize_t max_page_sharing_store(struct kobject *kobj, } KSM_ATTR(max_page_sharing); +static ssize_t pages_scanned_show(struct kobject *kobj, + struct 
kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lu\n", ksm_pages_scanned); +} +KSM_ATTR_RO(pages_scanned); + static ssize_t pages_shared_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -3431,6 +3444,7 @@ static struct attribute *ksm_attrs[] = { &sleep_millisecs_attr.attr, &pages_to_scan_attr.attr, &run_attr.attr, + &pages_scanned_attr.attr, &pages_shared_attr.attr, &pages_sharing_attr.attr, &pages_unshared_attr.attr, From 835bc157da689f834d10296d0942aed88b310b4f Mon Sep 17 00:00:00 2001 From: Xiaolei Wang Date: Tue, 15 Aug 2023 22:41:27 +0800 Subject: [PATCH 398/489] mm/kmemleak: use object_cache instead of kmemleak_initialized to check in set_track_prepare() Patch series "mm/kmemleak: use object_cache instead of kmemleak_initialized", v3. Use object_cache instead of kmemleak_initialized to check in set_track_prepare(), so that memory leaks after kmemleak_init() can be recorded and Rename kmemleak_initialized to kmemleak_late_initialized unreferenced object 0xc674ca80 (size 64): comm "swapper/0", pid 1, jiffies 4294938337 (age 204.880s) hex dump (first 32 bytes): 80 55 75 c6 80 54 75 c6 00 55 75 c6 80 52 75 c6 .Uu..Tu..Uu..Ru. 00 53 75 c6 00 00 00 00 00 00 00 00 00 00 00 00 .Su.......... This patch (of 2): kmemleak_initialized is set in kmemleak_late_init(), which also means that there is no call trace which object's memory leak is before kmemleak_late_init(), so use object_cache instead of kmemleak_initialized to check in set_track_prepare() to avoid no call trace records when there is a memory leak in the code between kmemleak_init() and kmemleak_late_init(). unreferenced object 0xc674ca80 (size 64): comm "swapper/0", pid 1, jiffies 4294938337 (age 204.880s) hex dump (first 32 bytes): 80 55 75 c6 80 54 75 c6 00 55 75 c6 80 52 75 c6 .Uu..Tu..Uu..Ru. 00 53 75 c6 00 00 00 00 00 00 00 00 00 00 00 00 .Su.......... 
Link: https://lkml.kernel.org/r/20230815144128.3623103-1-xiaolei.wang@windriver.com Link: https://lkml.kernel.org/r/20230815144128.3623103-2-xiaolei.wang@windriver.com Fixes: 56a61617dd22 ("mm: use stack_depot for recording kmemleak's backtrace") Signed-off-by: Xiaolei Wang Reviewed-by: Catalin Marinas Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Vlastimil Babka Cc: Zhaoyang Huang Signed-off-by: Andrew Morton --- mm/kmemleak.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index a2d34226e3c8c3..16fc7b0984b9e5 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -610,7 +610,12 @@ static noinline depot_stack_handle_t set_track_prepare(void) unsigned long entries[MAX_TRACE]; unsigned int nr_entries; - if (!kmemleak_initialized) + /* + * Use object_cache to determine whether kmemleak_init() has + * been invoked. stack_depot_early_init() is called before + * kmemleak_init() in mm_core_init(). + */ + if (!object_cache) return 0; nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3); trace_handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT); From d160ef71b42c53573fc27188c20270a2ccdcc196 Mon Sep 17 00:00:00 2001 From: Xiaolei Wang Date: Tue, 15 Aug 2023 22:41:28 +0800 Subject: [PATCH 399/489] Rename kmemleak_initialized to kmemleak_late_initialized The old name is confusing because it implies the completion of the earlier kmemleak_init(); the new name, kmemleak_late_initialized, represents the completion of kmemleak_late_init(). No functional changes.
Link: https://lkml.kernel.org/r/20230815144128.3623103-3-xiaolei.wang@windriver.com Signed-off-by: Xiaolei Wang Acked-by: Catalin Marinas Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Vlastimil Babka Cc: Zhaoyang Huang Signed-off-by: Andrew Morton --- mm/kmemleak.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 16fc7b0984b9e5..2918150e31bd9c 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -218,7 +218,7 @@ static int kmemleak_enabled = 1; /* same as above but only for the kmemleak_free() callback */ static int kmemleak_free_enabled = 1; /* set in the late_initcall if there were no errors */ -static int kmemleak_initialized; +static int kmemleak_late_initialized; /* set if a kmemleak warning was issued */ static int kmemleak_warning; /* set if a fatal kmemleak error has occurred */ @@ -2057,7 +2057,7 @@ static void kmemleak_disable(void) kmemleak_enabled = 0; /* check whether it is too early for a kernel thread */ - if (kmemleak_initialized) + if (kmemleak_late_initialized) schedule_work(&cleanup_work); else kmemleak_free_enabled = 0; @@ -2122,7 +2122,7 @@ void __init kmemleak_init(void) */ static int __init kmemleak_late_init(void) { - kmemleak_initialized = 1; + kmemleak_late_initialized = 1; debugfs_create_file("kmemleak", 0644, NULL, NULL, &kmemleak_fops); @@ -2130,7 +2130,7 @@ static int __init kmemleak_late_init(void) /* * Some error occurred and kmemleak was disabled. There is a * small chance that kmemleak_disable() was called immediately - * after setting kmemleak_initialized and we may end up with + * after setting kmemleak_late_initialized and we may end up with * two clean-up threads but serialized by scan_mutex. 
*/ schedule_work(&cleanup_work); From e45a2e947dfa6da2d73e2cf91ed6399c12522d4f Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 15 Aug 2023 11:06:09 +0800 Subject: [PATCH 400/489] pagemap: remove wait_on_page_locked_killable() There are no users of wait_on_page_locked_killable(), so remove it. Link: https://lkml.kernel.org/r/20230815030609.39313-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 0ab0f2362b9b7b..f4f24b594cd7ac 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1060,11 +1060,6 @@ static inline void wait_on_page_locked(struct page *page) folio_wait_locked(page_folio(page)); } -static inline int wait_on_page_locked_killable(struct page *page) -{ - return folio_wait_locked_killable(page_folio(page)); -} - void wait_on_page_writeback(struct page *page); void folio_wait_writeback(struct folio *folio); int folio_wait_writeback_killable(struct folio *folio); From b1e1296d7c6a3520b97add2394361660d193a5ea Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 3 Aug 2023 16:32:04 +0200 Subject: [PATCH 401/489] kvm: explicitly set FOLL_HONOR_NUMA_FAULT in hva_to_pfn_slow() KVM is *the* case we know that really wants to honor NUMA hinting faults. As we want to stop setting FOLL_HONOR_NUMA_FAULT implicitly, set FOLL_HONOR_NUMA_FAULT whenever we might obtain pages on behalf of a VCPU to map them into a secondary MMU, and add a comment explaining why. Do that unconditionally in hva_to_pfn_slow() when calling get_user_pages_unlocked(). kvmppc_book3s_instantiate_page(), hva_to_pfn_fast() and gfn_to_page_many_atomic() are similarly used to map pages into a secondary MMU.
However, FOLL_WRITE and get_user_page_fast_only() always implicitly honor NUMA hinting faults -- as documented for FOLL_HONOR_NUMA_FAULT -- so we can limit this change to a single location for now. Don't set it in check_user_page_hwpoison(), where we really only want to check if the mapped page is HW-poisoned. We won't set it for other KVM users of get_user_pages()/pin_user_pages() * arch/powerpc/kvm/book3s_64_mmu_hv.c: not used to map pages into a secondary MMU. * arch/powerpc/kvm/e500_mmu.c: only used on shared TLB pages with userspace * arch/s390/kvm/*: s390x only supports a single NUMA node either way * arch/x86/kvm/svm/sev.c: not used to map pages into a secondary MMU. This is a preparation for making FOLL_HONOR_NUMA_FAULT no longer implicitly be set by get_user_pages() and friends. Link: https://lkml.kernel.org/r/20230803143208.383663-4-david@redhat.com Signed-off-by: David Hildenbrand Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Linus Torvalds Cc: liubo Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Mel Gorman Cc: Paolo Bonzini Cc: Peter Xu Cc: Shuah Khan Signed-off-by: Andrew Morton --- virt/kvm/kvm_main.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5bbb5612b207fb..2500178cf4449c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2517,7 +2517,18 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, bool interruptible, bool *writable, kvm_pfn_t *pfn) { - unsigned int flags = FOLL_HWPOISON; + /* + * When a VCPU accesses a page that is not mapped into the secondary + * MMU, we lookup the page using GUP to map it, so the guest VCPU can + * make progress. We always want to honor NUMA hinting faults in that + * case, because GUP usage corresponds to memory accesses from the VCPU. 
+ * Otherwise, we'd not trigger NUMA hinting faults once a page is + * mapped into the secondary MMU and gets accessed by a VCPU. + * + * Note that get_user_page_fast_only() and FOLL_WRITE for now + * implicitly honor NUMA hinting faults and don't need this flag. + */ + unsigned int flags = FOLL_HWPOISON | FOLL_HONOR_NUMA_FAULT; struct page *page; int npages; From 7acddcc1ae30670449d60dc9da5b00d544a5b58b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 3 Aug 2023 16:32:05 +0200 Subject: [PATCH 402/489] mm/gup: don't implicitly set FOLL_HONOR_NUMA_FAULT Commit 0b9d705297b2 ("mm: numa: Support NUMA hinting page faults from gup/gup_fast") from 2012 documented as the primary reason why we would want to handle NUMA hinting faults from GUP: KVM secondary MMU page faults will trigger the NUMA hinting page faults through gup_fast -> get_user_pages -> follow_page -> handle_mm_fault. That is still the case today, and relevant KVM code has been converted to manually set FOLL_HONOR_NUMA_FAULT. So let's stop setting FOLL_HONOR_NUMA_FAULT for all GUP users and cross fingers that not that many other ones that really require such handling for autonuma remain. Possible interaction with MMU notifiers: Assume a driver obtains a page using get_user_pages() to map it into a secondary MMU, and uses the MMU notifier framework to get notified on changes. Assume get_user_pages() succeeded on a PROT_NONE-mapped page (because FOLL_HONOR_NUMA_FAULT is not set) in an accessible VMA and the page is mapped into a secondary MMU. Once user space would turn that mapping inaccessible using mprotect(PROT_NONE), the actual PTE in the page table might not change. If the MMU notifier would be smart and optimize for that case "why notify if the PTE didn't change", that could be problematic. At least change_pmd_range() with MMU_NOTIFY_PROTECTION_VMA for now does an unconditional mmu_notifier_invalidate_range_start() -> mmu_notifier_invalidate_range_end() and should be fine. 
Note that even if a PTE in an accessible VMA is pte_protnone(), the underlying page might be accessed by a secondary MMU that does not set FOLL_HONOR_NUMA_FAULT, and test_young() MMU notifiers would return "true". Link: https://lkml.kernel.org/r/20230803143208.383663-5-david@redhat.com Signed-off-by: David Hildenbrand Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Linus Torvalds Cc: liubo Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Mel Gorman Cc: Paolo Bonzini Cc: Peter Xu Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/gup.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 3bbfae41188050..ee4fc15ce88eb2 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2244,13 +2244,6 @@ static bool is_valid_gup_args(struct page **pages, int *locked, gup_flags |= FOLL_UNLOCKABLE; } - /* - * For now, always trigger NUMA hinting faults. Some GUP users like - * KVM require the hint to be as the calling context of GUP is - * functionally similar to a memory reference from task context. - */ - gup_flags |= FOLL_HONOR_NUMA_FAULT; - /* FOLL_GET and FOLL_PIN are mutually exclusive. */ if (WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) == (FOLL_PIN | FOLL_GET))) From 14fb1fd751fa09440634b909e6f5f53e2cba9ae0 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 3 Aug 2023 16:32:06 +0200 Subject: [PATCH 403/489] pgtable: improve pte_protnone() comment Especially the "For PROT_NONE VMAs, the PTEs are not marked _PAGE_PROTNONE" part is wrong: doing an mprotect(PROT_NONE) will end up marking all PTEs on x86_64 as _PAGE_PROTNONE, making pte_protnone() indicate "yes". So let's improve the comment, so it's easier to grasp which semantics pte_protnone() actually has. 
Link: https://lkml.kernel.org/r/20230803143208.383663-6-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Mel Gorman Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Linus Torvalds Cc: liubo Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Paolo Bonzini Cc: Peter Xu Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index f34e0f2cb4d847..6064f454c8e3fb 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1333,12 +1333,16 @@ static inline int pud_trans_unstable(pud_t *pud) #ifndef CONFIG_NUMA_BALANCING /* - * Technically a PTE can be PROTNONE even when not doing NUMA balancing but - * the only case the kernel cares is for NUMA balancing and is only ever set - * when the VMA is accessible. For PROT_NONE VMAs, the PTEs are not marked - * _PAGE_PROTNONE so by default, implement the helper as "always no". It - * is the responsibility of the caller to distinguish between PROT_NONE - * protections and NUMA hinting fault protections. + * In an inaccessible (PROT_NONE) VMA, pte_protnone() may indicate "yes". It is + * perfectly valid to indicate "no" in that case, which is why our default + * implementation defaults to "always no". + * + * In an accessible VMA, however, pte_protnone() reliably indicates PROT_NONE + * page protection due to NUMA hinting. NUMA hinting faults only apply in + * accessible VMAs. + * + * So, to reliably identify PROT_NONE PTEs that require a NUMA hinting fault, + * looking at the VMA accessibility is sufficient. 
*/ static inline int pte_protnone(pte_t pte) { From 42096aa24b82f54d486c501148afb6048e3830a1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 3 Aug 2023 16:32:07 +0200 Subject: [PATCH 404/489] selftest/mm: ksm_functional_tests: test in mmap_and_merge_range() if anything got merged Let's extend mmap_and_merge_range() to test if anything in the current process was merged. range_maps_duplicates() is too unreliable for that use case, so instead look at KSM stats. Trigger a complete unmerge first, to cleanup the stable tree and stabilize accounting of merged pages. Note that we're using /proc/self/ksm_merging_pages instead of /proc/self/ksm_stat, because that one is available in more existing kernels. If /proc/self/ksm_merging_pages can't be opened, we can't perform any checks and simply skip them. We have to special-case the shared zeropage for now. But the only user -- test_unmerge_zero_pages() -- performs its own merge checks. Link: https://lkml.kernel.org/r/20230803143208.383663-7-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Peter Xu Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Linus Torvalds Cc: liubo Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Mel Gorman Cc: Paolo Bonzini Cc: Shuah Khan Signed-off-by: Andrew Morton --- .../selftests/mm/ksm_functional_tests.c | 47 +++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c index 0de9d33cd565d8..cb63b600cb4f54 100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -30,6 +30,7 @@ static int ksm_fd; static int ksm_full_scans_fd; static int proc_self_ksm_stat_fd; +static int proc_self_ksm_merging_pages_fd; static int ksm_use_zero_pages_fd; static int pagemap_fd; static size_t pagesize; @@ -88,6 +89,22 @@ static long get_my_ksm_zero_pages(void) return my_ksm_zero_pages; } +static long 
get_my_merging_pages(void) +{ + char buf[10]; + ssize_t ret; + + if (proc_self_ksm_merging_pages_fd < 0) + return proc_self_ksm_merging_pages_fd; + + ret = pread(proc_self_ksm_merging_pages_fd, buf, sizeof(buf) - 1, 0); + if (ret <= 0) + return -errno; + buf[ret] = 0; + + return strtol(buf, NULL, 10); +} + static long ksm_get_full_scans(void) { char buf[10]; @@ -120,11 +137,29 @@ static int ksm_merge(void) return 0; } +static int ksm_unmerge(void) +{ + if (write(ksm_fd, "2", 1) != 1) + return -errno; + return 0; +} + static char *mmap_and_merge_range(char val, unsigned long size, bool use_prctl) { char *map; int ret; + /* Stabilize accounting by disabling KSM completely. */ + if (ksm_unmerge()) { + ksft_test_result_fail("Disabling (unmerging) KSM failed\n"); + goto unmap; + } + + if (get_my_merging_pages() > 0) { + ksft_test_result_fail("Still pages merged\n"); + goto unmap; + } + map = mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0); if (map == MAP_FAILED) { @@ -160,6 +195,16 @@ static char *mmap_and_merge_range(char val, unsigned long size, bool use_prctl) ksft_test_result_fail("Running KSM failed\n"); goto unmap; } + + /* + * Check if anything was merged at all. Ignore the zero page that is + * accounted differently (depending on kernel support). 
+ */ + if (val && !get_my_merging_pages()) { + ksft_test_result_fail("No pages got merged\n"); + goto unmap; + } + return map; unmap: munmap(map, size); @@ -473,6 +518,8 @@ int main(int argc, char **argv) if (pagemap_fd < 0) ksft_exit_skip("open(\"/proc/self/pagemap\") failed\n"); proc_self_ksm_stat_fd = open("/proc/self/ksm_stat", O_RDONLY); + proc_self_ksm_merging_pages_fd = open("/proc/self/ksm_merging_pages", + O_RDONLY); ksm_use_zero_pages_fd = open("/sys/kernel/mm/ksm/use_zero_pages", O_RDWR); test_unmerge(); From e5013f11c6c92f58134418d7caf3098c13413c4c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 3 Aug 2023 16:32:08 +0200 Subject: [PATCH 405/489] selftest/mm: ksm_functional_tests: Add PROT_NONE test Let's test whether merging and unmerging in PROT_NONE areas works as expected. Pass a page protection to mmap_and_merge_range(), which will trigger an mprotect() after writing to the pages, but before enabling merging. Make sure that unsharing works as expected, by performing a ptrace write (using /proc/self/mem) and by setting MADV_UNMERGEABLE. Note that this implicitly tests that ptrace writes in an inaccessible (PROT_NONE) mapping work as expected. 
[david@redhat.com: use sizeof(i) in test_prot_none(), per Peter] Link: https://lkml.kernel.org/r/e9cdb144-70c7-6596-2377-e675635c94e0@redhat.com Link: https://lkml.kernel.org/r/20230803143208.383663-8-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Peter Xu Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Linus Torvalds Cc: liubo Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Mel Gorman Cc: Paolo Bonzini Cc: Shuah Khan Signed-off-by: Andrew Morton --- .../selftests/mm/ksm_functional_tests.c | 59 ++++++++++++++++--- 1 file changed, 52 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c index cb63b600cb4f54..901e950f9138c7 100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -27,6 +27,7 @@ #define KiB 1024u #define MiB (1024 * KiB) +static int mem_fd; static int ksm_fd; static int ksm_full_scans_fd; static int proc_self_ksm_stat_fd; @@ -144,7 +145,8 @@ static int ksm_unmerge(void) return 0; } -static char *mmap_and_merge_range(char val, unsigned long size, bool use_prctl) +static char *mmap_and_merge_range(char val, unsigned long size, int prot, + bool use_prctl) { char *map; int ret; @@ -176,6 +178,11 @@ static char *mmap_and_merge_range(char val, unsigned long size, bool use_prctl) /* Make sure each page contains the same values to merge them. 
*/ memset(map, val, size); + if (mprotect(map, size, prot)) { + ksft_test_result_skip("mprotect() failed\n"); + goto unmap; + } + if (use_prctl) { ret = prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0); if (ret < 0 && errno == EINVAL) { @@ -218,7 +225,7 @@ static void test_unmerge(void) ksft_print_msg("[RUN] %s\n", __func__); - map = mmap_and_merge_range(0xcf, size, false); + map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, false); if (map == MAP_FAILED) return; @@ -256,7 +263,7 @@ static void test_unmerge_zero_pages(void) } /* Let KSM deduplicate zero pages. */ - map = mmap_and_merge_range(0x00, size, false); + map = mmap_and_merge_range(0x00, size, PROT_READ | PROT_WRITE, false); if (map == MAP_FAILED) return; @@ -304,7 +311,7 @@ static void test_unmerge_discarded(void) ksft_print_msg("[RUN] %s\n", __func__); - map = mmap_and_merge_range(0xcf, size, false); + map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, false); if (map == MAP_FAILED) return; @@ -336,7 +343,7 @@ static void test_unmerge_uffd_wp(void) ksft_print_msg("[RUN] %s\n", __func__); - map = mmap_and_merge_range(0xcf, size, false); + map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, false); if (map == MAP_FAILED) return; @@ -479,7 +486,7 @@ static void test_prctl_unmerge(void) ksft_print_msg("[RUN] %s\n", __func__); - map = mmap_and_merge_range(0xcf, size, true); + map = mmap_and_merge_range(0xcf, size, PROT_READ | PROT_WRITE, true); if (map == MAP_FAILED) return; @@ -494,9 +501,42 @@ static void test_prctl_unmerge(void) munmap(map, size); } +static void test_prot_none(void) +{ + const unsigned int size = 2 * MiB; + char *map; + int i; + + ksft_print_msg("[RUN] %s\n", __func__); + + map = mmap_and_merge_range(0x11, size, PROT_NONE, false); + if (map == MAP_FAILED) + goto unmap; + + /* Store a unique value in each page on one half using ptrace */ + for (i = 0; i < size / 2; i += pagesize) { + lseek(mem_fd, (uintptr_t) map + i, SEEK_SET); + if (write(mem_fd, &i, sizeof(i)) != 
sizeof(i)) { + ksft_test_result_fail("ptrace write failed\n"); + goto unmap; + } + } + + /* Trigger unsharing on the other half. */ + if (madvise(map + size / 2, size / 2, MADV_UNMERGEABLE)) { + ksft_test_result_fail("MADV_UNMERGEABLE failed\n"); + goto unmap; + } + + ksft_test_result(!range_maps_duplicates(map, size), + "Pages were unmerged\n"); +unmap: + munmap(map, size); +} + int main(int argc, char **argv) { - unsigned int tests = 6; + unsigned int tests = 7; int err; #ifdef __NR_userfaultfd @@ -508,6 +548,9 @@ int main(int argc, char **argv) pagesize = getpagesize(); + mem_fd = open("/proc/self/mem", O_RDWR); + if (mem_fd < 0) + ksft_exit_fail_msg("opening /proc/self/mem failed\n"); ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR); if (ksm_fd < 0) ksft_exit_skip("open(\"/sys/kernel/mm/ksm/run\") failed\n"); @@ -529,6 +572,8 @@ int main(int argc, char **argv) test_unmerge_uffd_wp(); #endif + test_prot_none(); + test_prctl(); test_prctl_fork(); test_prctl_unmerge(); From 99a9e0b83ab9955e604397717b82267feb021df3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 16 Aug 2023 16:11:49 +0100 Subject: [PATCH 406/489] io_uring: stop calling free_compound_page() Patch series "Remove _folio_dtor and _folio_order", v2. This patch (of 13): folio_put() is the standard way to write this, and it's not appreciably slower. This is an enabling patch for removing free_compound_page() entirely. 
Link: https://lkml.kernel.org/r/20230816151201.3655946-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230816151201.3655946-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Reviewed-by: Jens Axboe Cc: Sidhartha Kumar Cc: Yanteng Si Signed-off-by: Andrew Morton --- io_uring/io_uring.c | 6 +----- io_uring/kbuf.c | 6 +----- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index f4591b912ea8e9..6adf3b4799144c 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2643,14 +2643,10 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, static void io_mem_free(void *ptr) { - struct page *page; - if (!ptr) return; - page = virt_to_head_page(ptr); - if (put_page_testzero(page)) - free_compound_page(page); + folio_put(virt_to_folio(ptr)); } static void io_pages_free(struct page ***pages, int npages) diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index 2f0181521c98e4..556f4df25b0fa0 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -218,11 +218,7 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, if (bl->is_mapped) { i = bl->buf_ring->tail - bl->head; if (bl->is_mmap) { - struct page *page; - - page = virt_to_head_page(bl->buf_ring); - if (put_page_testzero(page)) - free_compound_page(page); + folio_put(virt_to_folio(bl->buf_ring)); bl->buf_ring = NULL; bl->is_mmap = 0; } else if (bl->buf_nr_pages) { From dd6fa0b61814492476463149c91110e529364e82 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 16 Aug 2023 16:11:50 +0100 Subject: [PATCH 407/489] mm: call free_huge_page() directly Indirect calls are expensive, thanks to Spectre. Call free_huge_page() directly if the folio belongs to hugetlb. 
Link: https://lkml.kernel.org/r/20230816151201.3655946-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Jens Axboe Cc: Sidhartha Kumar Cc: Yanteng Si Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 3 ++- mm/page_alloc.c | 8 +++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 0a393bc02f25b4..5a1dfaffbd8064 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -26,6 +26,8 @@ typedef struct { unsigned long pd; } hugepd_t; #define __hugepd(x) ((hugepd_t) { (x) }) #endif +void free_huge_page(struct page *page); + #ifdef CONFIG_HUGETLB_PAGE #include @@ -165,7 +167,6 @@ int get_huge_page_for_hwpoison(unsigned long pfn, int flags, bool *migratable_cleared); void folio_putback_active_hugetlb(struct folio *folio); void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int reason); -void free_huge_page(struct page *page); void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 986b56db96b5bf..74484859336038 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -287,9 +287,6 @@ const char * const migratetype_names[MIGRATE_TYPES] = { static compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = { [NULL_COMPOUND_DTOR] = NULL, [COMPOUND_PAGE_DTOR] = free_compound_page, -#ifdef CONFIG_HUGETLB_PAGE - [HUGETLB_PAGE_DTOR] = free_huge_page, -#endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE [TRANSHUGE_PAGE_DTOR] = free_transhuge_page, #endif @@ -612,6 +609,11 @@ void destroy_large_folio(struct folio *folio) { enum compound_dtor_id dtor = folio->_folio_dtor; + if (folio_test_hugetlb(folio)) { + free_huge_page(&folio->page); + return; + } + VM_BUG_ON_FOLIO(dtor >= NR_COMPOUND_DTORS, folio); compound_page_dtors[dtor](&folio->page); } From 
454a00c40a21c59e99c526fe8cc57bd029cf8f0e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 16 Aug 2023 16:11:51 +0100 Subject: [PATCH 408/489] mm: convert free_huge_page() to free_huge_folio() Pass a folio instead of the head page to save a few instructions. Update the documentation, at least in English. Link: https://lkml.kernel.org/r/20230816151201.3655946-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Sidhartha Kumar Cc: Yanteng Si Cc: David Hildenbrand Cc: Jens Axboe Signed-off-by: Andrew Morton --- Documentation/mm/hugetlbfs_reserv.rst | 14 +++--- .../zh_CN/mm/hugetlbfs_reserv.rst | 4 +- include/linux/hugetlb.h | 2 +- mm/hugetlb.c | 48 +++++++++---------- mm/page_alloc.c | 2 +- 5 files changed, 34 insertions(+), 36 deletions(-) diff --git a/Documentation/mm/hugetlbfs_reserv.rst b/Documentation/mm/hugetlbfs_reserv.rst index d9c2b0f01dcd0f..4914fbf07966c5 100644 --- a/Documentation/mm/hugetlbfs_reserv.rst +++ b/Documentation/mm/hugetlbfs_reserv.rst @@ -271,12 +271,12 @@ to the global reservation count (resv_huge_pages). Freeing Huge Pages ================== -Huge page freeing is performed by the routine free_huge_page(). This routine -is the destructor for hugetlbfs compound pages. As a result, it is only -passed a pointer to the page struct. When a huge page is freed, reservation -accounting may need to be performed. This would be the case if the page was -associated with a subpool that contained reserves, or the page is being freed -on an error path where a global reserve count must be restored. +Huge pages are freed by free_huge_folio(). It is only passed a pointer +to the folio as it is called from the generic MM code. When a huge page +is freed, reservation accounting may need to be performed. This would +be the case if the page was associated with a subpool that contained +reserves, or the page is being freed on an error path where a global +reserve count must be restored. 
The page->private field points to any subpool associated with the page. If the PagePrivate flag is set, it indicates the global reserve count should @@ -525,7 +525,7 @@ However, there are several instances where errors are encountered after a huge page is allocated but before it is instantiated. In this case, the page allocation has consumed the reservation and made the appropriate subpool, reservation map and global count adjustments. If the page is freed at this -time (before instantiation and clearing of PagePrivate), then free_huge_page +time (before instantiation and clearing of PagePrivate), then free_huge_folio will increment the global reservation count. However, the reservation map indicates the reservation was consumed. This resulting inconsistent state will cause the 'leak' of a reserved huge page. The global reserve count will diff --git a/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst b/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst index b7a0544224ad1a..0f7e7fb5ca8cd3 100644 --- a/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst +++ b/Documentation/translations/zh_CN/mm/hugetlbfs_reserv.rst @@ -219,7 +219,7 @@ vma_commit_reservation()之间,预留映射有可能被改变。如果hugetlb_ 释放巨页 ======== -巨页释放是由函数free_huge_page()执行的。这个函数是hugetlbfs复合页的析构器。因此,它只传 +巨页释放是由函数free_huge_folio()执行的。这个函数是hugetlbfs复合页的析构器。因此,它只传 递一个指向页面结构体的指针。当一个巨页被释放时,可能需要进行预留计算。如果该页与包含保 留的子池相关联,或者该页在错误路径上被释放,必须恢复全局预留计数,就会出现这种情况。 @@ -387,7 +387,7 @@ region_count()在解除私有巨页映射时被调用。在私有映射中,预 然而,有几种情况是,在一个巨页被分配后,但在它被实例化之前,就遇到了错误。在这种情况下, 页面分配已经消耗了预留,并进行了适当的子池、预留映射和全局计数调整。如果页面在这个时候被释放 -(在实例化和清除PagePrivate之前),那么free_huge_page将增加全局预留计数。然而,预留映射 +(在实例化和清除PagePrivate之前),那么free_huge_folio将增加全局预留计数。然而,预留映射 显示报留被消耗了。这种不一致的状态将导致预留的巨页的 “泄漏” 。全局预留计数将比它原本的要高, 并阻止分配一个预先分配的页面。 diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 5a1dfaffbd8064..5b2626063f4fdd 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -26,7 +26,7 @@ typedef struct { unsigned long pd; } hugepd_t; 
#define __hugepd(x) ((hugepd_t) { (x) }) #endif -void free_huge_page(struct page *page); +void free_huge_folio(struct folio *folio); #ifdef CONFIG_HUGETLB_PAGE diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5f498e8025cc50..6a3c80026ab3cd 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1706,10 +1706,10 @@ static void add_hugetlb_folio(struct hstate *h, struct folio *folio, zeroed = folio_put_testzero(folio); if (unlikely(!zeroed)) /* - * It is VERY unlikely soneone else has taken a ref on - * the page. In this case, we simply return as the - * hugetlb destructor (free_huge_page) will be called - * when this other ref is dropped. + * It is VERY unlikely soneone else has taken a ref + * on the folio. In this case, we simply return as + * free_huge_folio() will be called when this other ref + * is dropped. */ return; @@ -1875,13 +1875,12 @@ struct hstate *size_to_hstate(unsigned long size) return NULL; } -void free_huge_page(struct page *page) +void free_huge_folio(struct folio *folio) { /* * Can't pass hstate in here because it is called from the * compound page destructor. 
*/ - struct folio *folio = page_folio(page); struct hstate *h = folio_hstate(folio); int nid = folio_nid(folio); struct hugepage_subpool *spool = hugetlb_folio_subpool(folio); @@ -1936,7 +1935,7 @@ void free_huge_page(struct page *page) spin_unlock_irqrestore(&hugetlb_lock, flags); update_and_free_hugetlb_folio(h, folio, true); } else { - arch_clear_hugepage_flags(page); + arch_clear_hugepage_flags(&folio->page); enqueue_hugetlb_folio(h, folio); spin_unlock_irqrestore(&hugetlb_lock, flags); } @@ -2246,7 +2245,7 @@ static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, folio = alloc_fresh_hugetlb_folio(h, gfp_mask, node, nodes_allowed, node_alloc_noretry); if (folio) { - free_huge_page(&folio->page); /* free it into the hugepage allocator */ + free_huge_folio(folio); /* free it into the hugepage allocator */ return 1; } } @@ -2429,13 +2428,13 @@ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h, * We could have raced with the pool size change. * Double check that and simply deallocate the new page * if we would end up overcommiting the surpluses. 
Abuse - * temporary page to workaround the nasty free_huge_page + * temporary page to workaround the nasty free_huge_folio * codeflow */ if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { folio_set_hugetlb_temporary(folio); spin_unlock_irq(&hugetlb_lock); - free_huge_page(&folio->page); + free_huge_folio(folio); return NULL; } @@ -2547,8 +2546,7 @@ static int gather_surplus_pages(struct hstate *h, long delta) __must_hold(&hugetlb_lock) { LIST_HEAD(surplus_list); - struct folio *folio; - struct page *page, *tmp; + struct folio *folio, *tmp; int ret; long i; long needed, allocated; @@ -2608,21 +2606,21 @@ static int gather_surplus_pages(struct hstate *h, long delta) ret = 0; /* Free the needed pages to the hugetlb pool */ - list_for_each_entry_safe(page, tmp, &surplus_list, lru) { + list_for_each_entry_safe(folio, tmp, &surplus_list, lru) { if ((--needed) < 0) break; /* Add the page to the hugetlb allocator */ - enqueue_hugetlb_folio(h, page_folio(page)); + enqueue_hugetlb_folio(h, folio); } free: spin_unlock_irq(&hugetlb_lock); /* * Free unnecessary surplus pages to the buddy allocator. - * Pages have no ref count, call free_huge_page directly. + * Pages have no ref count, call free_huge_folio directly. */ - list_for_each_entry_safe(page, tmp, &surplus_list, lru) - free_huge_page(page); + list_for_each_entry_safe(folio, tmp, &surplus_list, lru) + free_huge_folio(folio); spin_lock_irq(&hugetlb_lock); return ret; @@ -2836,11 +2834,11 @@ static long vma_del_reservation(struct hstate *h, * 2) No reservation was in place for the page, so hugetlb_restore_reserve is * not set. However, alloc_hugetlb_folio always updates the reserve map. * - * In case 1, free_huge_page later in the error path will increment the - * global reserve count. But, free_huge_page does not have enough context + * In case 1, free_huge_folio later in the error path will increment the + * global reserve count. But, free_huge_folio does not have enough context * to adjust the reservation map. 
This case deals primarily with private * mappings. Adjust the reserve map here to be consistent with global - * reserve count adjustments to be made by free_huge_page. Make sure the + * reserve count adjustments to be made by free_huge_folio. Make sure the * reserve map indicates there is a reservation present. * * In case 2, simply undo reserve map modifications done by alloc_hugetlb_folio. @@ -2856,7 +2854,7 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, * Rare out of memory condition in reserve map * manipulation. Clear hugetlb_restore_reserve so * that global reserve count will not be incremented - * by free_huge_page. This will make it appear + * by free_huge_folio. This will make it appear * as though the reservation for this folio was * consumed. This may prevent the task from * faulting in the folio at a later time. This @@ -3232,7 +3230,7 @@ static void __init gather_bootmem_prealloc(void) if (prep_compound_gigantic_folio(folio, huge_page_order(h))) { WARN_ON(folio_test_reserved(folio)); prep_new_hugetlb_folio(h, folio, folio_nid(folio)); - free_huge_page(page); /* add to the hugepage allocator */ + free_huge_folio(folio); /* add to the hugepage allocator */ } else { /* VERY unlikely inflated ref count on a tail page */ free_gigantic_folio(folio, huge_page_order(h)); @@ -3264,7 +3262,7 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) &node_states[N_MEMORY], NULL); if (!folio) break; - free_huge_page(&folio->page); /* free it into the hugepage allocator */ + free_huge_folio(folio); /* free it into the hugepage allocator */ } cond_resched(); } @@ -3542,7 +3540,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid, while (count > persistent_huge_pages(h)) { /* * If this allocation races such that we no longer need the - * page, free_huge_page will handle it by freeing the page + * page, free_huge_folio will handle it by freeing the page * and reducing the surplus. 
*/ spin_unlock_irq(&hugetlb_lock); @@ -3658,7 +3656,7 @@ static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio) prep_compound_page(subpage, target_hstate->order); folio_change_private(inner_folio, NULL); prep_new_hugetlb_folio(target_hstate, inner_folio, nid); - free_huge_page(subpage); + free_huge_folio(inner_folio); } mutex_unlock(&target_hstate->resize_lock); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 74484859336038..30dc444436cc6c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -610,7 +610,7 @@ void destroy_large_folio(struct folio *folio) enum compound_dtor_id dtor = folio->_folio_dtor; if (folio_test_hugetlb(folio)) { - free_huge_page(&folio->page); + free_huge_folio(folio); return; } From 8dc4a8f1e038189cb575f89bcd23364698b88cc1 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 16 Aug 2023 16:11:52 +0100 Subject: [PATCH 409/489] mm: convert free_transhuge_folio() to folio_undo_large_rmappable() Indirect calls are expensive, thanks to Spectre. Test for TRANSHUGE_PAGE_DTOR and destroy the folio appropriately. Move the free_compound_page() call into destroy_large_folio() to simplify later patches. 
Link: https://lkml.kernel.org/r/20230816151201.3655946-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Jens Axboe Cc: Sidhartha Kumar Cc: Yanteng Si Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 2 -- include/linux/mm.h | 2 -- mm/huge_memory.c | 22 +++++++++++----------- mm/internal.h | 2 ++ mm/page_alloc.c | 9 ++++++--- 5 files changed, 19 insertions(+), 18 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index e718dbe928bae2..ceda26a208306a 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -141,8 +141,6 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); void prep_transhuge_page(struct page *page); -void free_transhuge_page(struct page *page); - bool can_split_folio(struct folio *folio, int *pextra_pins); int split_huge_page_to_list(struct page *page, struct list_head *list); static inline int split_huge_page(struct page *page) diff --git a/include/linux/mm.h b/include/linux/mm.h index 55eb2789794e72..0d14e204565847 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1253,9 +1253,7 @@ enum compound_dtor_id { #ifdef CONFIG_HUGETLB_PAGE HUGETLB_PAGE_DTOR, #endif -#ifdef CONFIG_TRANSPARENT_HUGEPAGE TRANSHUGE_PAGE_DTOR, -#endif NR_COMPOUND_DTORS, }; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 154c210892a120..b33456683b935f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2776,10 +2776,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) return ret; } -void free_transhuge_page(struct page *page) +void folio_undo_large_rmappable(struct folio *folio) { - struct folio *folio = (struct folio *)page; - struct deferred_split *ds_queue = get_deferred_split_queue(folio); + struct deferred_split *ds_queue; unsigned long flags; /* @@ -2787,15 +2786,16 @@ void free_transhuge_page(struct page *page) * deferred_list. 
If folio is not in deferred_list, it's safe * to check without acquiring the split_queue_lock. */ - if (data_race(!list_empty(&folio->_deferred_list))) { - spin_lock_irqsave(&ds_queue->split_queue_lock, flags); - if (!list_empty(&folio->_deferred_list)) { - ds_queue->split_queue_len--; - list_del(&folio->_deferred_list); - } - spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); + if (data_race(list_empty(&folio->_deferred_list))) + return; + + ds_queue = get_deferred_split_queue(folio); + spin_lock_irqsave(&ds_queue->split_queue_lock, flags); + if (!list_empty(&folio->_deferred_list)) { + ds_queue->split_queue_len--; + list_del(&folio->_deferred_list); } - free_compound_page(page); + spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags); } void deferred_split_folio(struct folio *folio) diff --git a/mm/internal.h b/mm/internal.h index d99ffb473f904a..30bbfcacc90964 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -413,6 +413,8 @@ static inline void folio_set_order(struct folio *folio, unsigned int order) #endif } +void folio_undo_large_rmappable(struct folio *folio); + static inline void prep_compound_head(struct page *page, unsigned int order) { struct folio *folio = (struct folio *)page; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 30dc444436cc6c..4047b58974430a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -287,9 +287,6 @@ const char * const migratetype_names[MIGRATE_TYPES] = { static compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = { [NULL_COMPOUND_DTOR] = NULL, [COMPOUND_PAGE_DTOR] = free_compound_page, -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - [TRANSHUGE_PAGE_DTOR] = free_transhuge_page, -#endif }; int min_free_kbytes = 1024; @@ -614,6 +611,12 @@ void destroy_large_folio(struct folio *folio) return; } + if (folio_test_transhuge(folio) && dtor == TRANSHUGE_PAGE_DTOR) { + folio_undo_large_rmappable(folio); + free_compound_page(&folio->page); + return; + } + VM_BUG_ON_FOLIO(dtor >= NR_COMPOUND_DTORS, folio); 
compound_page_dtors[dtor](&folio->page); } From da6e7bf3a0315025e4199d599bd31763f0df3b4a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 16 Aug 2023 16:11:53 +0100 Subject: [PATCH 410/489] mm: convert prep_transhuge_page() to folio_prep_large_rmappable() Match folio_undo_large_rmappable(), and move the casting from page to folio into the callers (which they were largely doing anyway). Link: https://lkml.kernel.org/r/20230816151201.3655946-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Jens Axboe Cc: Sidhartha Kumar Cc: Yanteng Si Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 4 ++-- mm/huge_memory.c | 4 +--- mm/khugepaged.c | 2 +- mm/mempolicy.c | 15 ++++++++------- mm/page_alloc.c | 7 ++++--- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index ceda26a208306a..fa0350b0812ab8 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -140,7 +140,7 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); -void prep_transhuge_page(struct page *page); +void folio_prep_large_rmappable(struct folio *folio); bool can_split_folio(struct folio *folio, int *pextra_pins); int split_huge_page_to_list(struct page *page, struct list_head *list); static inline int split_huge_page(struct page *page) @@ -280,7 +280,7 @@ static inline bool hugepage_vma_check(struct vm_area_struct *vma, return false; } -static inline void prep_transhuge_page(struct page *page) {} +static inline void folio_prep_large_rmappable(struct folio *folio) {} #define transparent_hugepage_flags 0UL diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b33456683b935f..5817bf77f1f07c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -577,10 +577,8 @@ struct deferred_split 
*get_deferred_split_queue(struct folio *folio) } #endif -void prep_transhuge_page(struct page *page) +void folio_prep_large_rmappable(struct folio *folio) { - struct folio *folio = (struct folio *)page; - VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio); INIT_LIST_HEAD(&folio->_deferred_list); folio_set_compound_dtor(folio, TRANSHUGE_PAGE_DTOR); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 9a6e0d50775939..40d43eccdee866 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -896,7 +896,7 @@ static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node, return false; } - prep_transhuge_page(*hpage); + folio_prep_large_rmappable((struct folio *)*hpage); count_vm_event(THP_COLLAPSE_ALLOC); return true; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index ec2eaceffd74b1..42b5567e37738d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2195,9 +2195,9 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, mpol_cond_put(pol); gfp |= __GFP_COMP; page = alloc_page_interleave(gfp, order, nid); - if (page && order > 1) - prep_transhuge_page(page); folio = (struct folio *)page; + if (folio && order > 1) + folio_prep_large_rmappable(folio); goto out; } @@ -2208,9 +2208,9 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, gfp |= __GFP_COMP; page = alloc_pages_preferred_many(gfp, order, node, pol); mpol_cond_put(pol); - if (page && order > 1) - prep_transhuge_page(page); folio = (struct folio *)page; + if (folio && order > 1) + folio_prep_large_rmappable(folio); goto out; } @@ -2306,10 +2306,11 @@ EXPORT_SYMBOL(alloc_pages); struct folio *folio_alloc(gfp_t gfp, unsigned order) { struct page *page = alloc_pages(gfp | __GFP_COMP, order); + struct folio *folio = (struct folio *)page; - if (page && order > 1) - prep_transhuge_page(page); - return (struct folio *)page; + if (folio && order > 1) + folio_prep_large_rmappable(folio); + return folio; } EXPORT_SYMBOL(folio_alloc); diff --git 
a/mm/page_alloc.c b/mm/page_alloc.c index 4047b58974430a..a97d6fa9cea0b2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4489,10 +4489,11 @@ struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid, { struct page *page = __alloc_pages(gfp | __GFP_COMP, order, preferred_nid, nodemask); + struct folio *folio = (struct folio *)page; - if (page && order > 1) - prep_transhuge_page(page); - return (struct folio *)page; + if (folio && order > 1) + folio_prep_large_rmappable(folio); + return folio; } EXPORT_SYMBOL(__folio_alloc); From 0f2f43fabb95192c73b19586ef7536d7ac7c2f8c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 16 Aug 2023 16:11:54 +0100 Subject: [PATCH 411/489] mm: remove free_compound_page() and the compound_page_dtors array The only remaining destructor is free_compound_page(). Inline it into destroy_large_folio() and remove the array it used to live in. Link: https://lkml.kernel.org/r/20230816151201.3655946-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Jens Axboe Cc: Sidhartha Kumar Cc: Yanteng Si Signed-off-by: Andrew Morton --- include/linux/mm.h | 10 ---------- mm/page_alloc.c | 24 +++++------------------- 2 files changed, 5 insertions(+), 29 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 0d14e204565847..0955b6b13fd0f0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1239,14 +1239,6 @@ void folio_copy(struct folio *dst, struct folio *src); unsigned long nr_free_buffer_pages(void); -/* - * Compound pages have a destructor function. Provide a - * prototype for that function and accessor functions. - * These are _only_ valid on the head of a compound page. 
- */ -typedef void compound_page_dtor(struct page *); - -/* Keep the enum in sync with compound_page_dtors array in mm/page_alloc.c */ enum compound_dtor_id { NULL_COMPOUND_DTOR, COMPOUND_PAGE_DTOR, @@ -1299,8 +1291,6 @@ static inline unsigned long thp_size(struct page *page) return PAGE_SIZE << thp_order(page); } -void free_compound_page(struct page *page); - #ifdef CONFIG_MMU /* * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a97d6fa9cea0b2..31fec31be31efe 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -284,11 +284,6 @@ const char * const migratetype_names[MIGRATE_TYPES] = { #endif }; -static compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = { - [NULL_COMPOUND_DTOR] = NULL, - [COMPOUND_PAGE_DTOR] = free_compound_page, -}; - int min_free_kbytes = 1024; int user_min_free_kbytes = -1; static int watermark_boost_factor __read_mostly = 15000; @@ -577,19 +572,13 @@ static inline void free_the_page(struct page *page, unsigned int order) * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded * in bit 0 of page->compound_head. The rest of bits is pointer to head page. * - * The first tail page's ->compound_dtor holds the offset in array of compound - * page destructors. See compound_page_dtors. + * The first tail page's ->compound_dtor describes how to destroy the + * compound page. * * The first tail page's ->compound_order holds the order of allocation. * This usage means that zero-order pages may not be compound. 
*/ -void free_compound_page(struct page *page) -{ - mem_cgroup_uncharge(page_folio(page)); - free_the_page(page, compound_order(page)); -} - void prep_compound_page(struct page *page, unsigned int order) { int i; @@ -611,14 +600,11 @@ void destroy_large_folio(struct folio *folio) return; } - if (folio_test_transhuge(folio) && dtor == TRANSHUGE_PAGE_DTOR) { + if (folio_test_transhuge(folio) && dtor == TRANSHUGE_PAGE_DTOR) folio_undo_large_rmappable(folio); - free_compound_page(&folio->page); - return; - } - VM_BUG_ON_FOLIO(dtor >= NR_COMPOUND_DTORS, folio); - compound_page_dtors[dtor](&folio->page); + mem_cgroup_uncharge(folio); + free_the_page(&folio->page, folio_order(folio)); } static inline void set_buddy_order(struct page *page, unsigned int order) From 9c5ccf2db04b8d7c3df363fdd4856c2b79ab2c6a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 16 Aug 2023 16:11:55 +0100 Subject: [PATCH 412/489] mm: remove HUGETLB_PAGE_DTOR We can use a bit in page[1].flags to indicate that this folio belongs to hugetlb instead of using a value in page[1].dtors. That lets folio_test_hugetlb() become an inline function like it should be. We can also get rid of NULL_COMPOUND_DTOR. 
Link: https://lkml.kernel.org/r/20230816151201.3655946-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Jens Axboe Cc: Sidhartha Kumar Cc: Yanteng Si Signed-off-by: Andrew Morton --- .../admin-guide/kdump/vmcoreinfo.rst | 10 +--- include/linux/mm.h | 4 -- include/linux/page-flags.h | 43 ++++++++++++---- kernel/crash_core.c | 2 +- mm/hugetlb.c | 49 +++---------------- mm/page_alloc.c | 2 +- 6 files changed, 43 insertions(+), 67 deletions(-) diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst index c18d94fa647048..baa1c355741d15 100644 --- a/Documentation/admin-guide/kdump/vmcoreinfo.rst +++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst @@ -325,8 +325,8 @@ NR_FREE_PAGES On linux-2.6.21 or later, the number of free pages is in vm_stat[NR_FREE_PAGES]. Used to get the number of free pages. -PG_lru|PG_private|PG_swapcache|PG_swapbacked|PG_slab|PG_hwpoision|PG_head_mask ------------------------------------------------------------------------------- +PG_lru|PG_private|PG_swapcache|PG_swapbacked|PG_slab|PG_hwpoision|PG_head_mask|PG_hugetlb +----------------------------------------------------------------------------------------- Page attributes. These flags are used to filter various unnecessary for dumping pages. @@ -338,12 +338,6 @@ More page attributes. These flags are used to filter various unnecessary for dumping pages. -HUGETLB_PAGE_DTOR ------------------ - -The HUGETLB_PAGE_DTOR flag denotes hugetlbfs pages. Makedumpfile -excludes these pages. 
- x86_64 ====== diff --git a/include/linux/mm.h b/include/linux/mm.h index 0955b6b13fd0f0..e241f5ee4dc4c4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1240,11 +1240,7 @@ void folio_copy(struct folio *dst, struct folio *src); unsigned long nr_free_buffer_pages(void); enum compound_dtor_id { - NULL_COMPOUND_DTOR, COMPOUND_PAGE_DTOR, -#ifdef CONFIG_HUGETLB_PAGE - HUGETLB_PAGE_DTOR, -#endif TRANSHUGE_PAGE_DTOR, NR_COMPOUND_DTORS, }; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 9218028caf337e..e9e7cc45352d43 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -171,15 +171,6 @@ enum pageflags { /* Remapped by swiotlb-xen. */ PG_xen_remapped = PG_owner_priv_1, -#ifdef CONFIG_MEMORY_FAILURE - /* - * Compound pages. Stored in first tail page's flags. - * Indicates that at least one subpage is hwpoisoned in the - * THP. - */ - PG_has_hwpoisoned = PG_error, -#endif - /* non-lru isolated movable page */ PG_isolated = PG_reclaim, @@ -190,6 +181,15 @@ enum pageflags { /* For self-hosted memmap pages */ PG_vmemmap_self_hosted = PG_owner_priv_1, #endif + + /* + * Flags only valid for compound pages. Stored in first tail page's + * flags word. + */ + + /* At least one page in this folio has the hwpoison flag set */ + PG_has_hwpoisoned = PG_error, + PG_hugetlb = PG_active, }; #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1) @@ -812,7 +812,23 @@ static inline void ClearPageCompound(struct page *page) #ifdef CONFIG_HUGETLB_PAGE int PageHuge(struct page *page); -bool folio_test_hugetlb(struct folio *folio); +SETPAGEFLAG(HugeTLB, hugetlb, PF_SECOND) +CLEARPAGEFLAG(HugeTLB, hugetlb, PF_SECOND) + +/** + * folio_test_hugetlb - Determine if the folio belongs to hugetlbfs + * @folio: The folio to test. + * + * Context: Any context. Caller should have a reference on the folio to + * prevent it from being turned into a tail page. 
+ * Return: True for hugetlbfs folios, false for anon folios or folios + * belonging to other filesystems. + */ +static inline bool folio_test_hugetlb(struct folio *folio) +{ + return folio_test_large(folio) && + test_bit(PG_hugetlb, folio_flags(folio, 1)); +} #else TESTPAGEFLAG_FALSE(Huge, hugetlb) #endif @@ -1056,6 +1072,13 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) #define PAGE_FLAGS_CHECK_AT_PREP \ ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK) +/* + * Flags stored in the second page of a compound page. They may overlap + * the CHECK_AT_FREE flags above, so need to be cleared. + */ +#define PAGE_FLAGS_SECOND \ + (1UL << PG_has_hwpoisoned | 1UL << PG_hugetlb) + #define PAGE_FLAGS_PRIVATE \ (1UL << PG_private | 1UL << PG_private_2) /** diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 90ce1dfd591c3c..dd5f87047d068c 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -490,7 +490,7 @@ static int __init crash_save_vmcoreinfo_init(void) #define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy) VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); #ifdef CONFIG_HUGETLB_PAGE - VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); + VMCOREINFO_NUMBER(PG_hugetlb); #define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline) VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); #endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6a3c80026ab3cd..a82c3104337e88 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1585,25 +1585,7 @@ static inline void __clear_hugetlb_destructor(struct hstate *h, { lockdep_assert_held(&hugetlb_lock); - /* - * Very subtle - * - * For non-gigantic pages set the destructor to the normal compound - * page dtor. This is needed in case someone takes an additional - * temporary ref to the page, and freeing is delayed until they drop - * their reference. - * - * For gigantic pages set the destructor to the null dtor. This - * destructor will never be called. 
Before freeing the gigantic - * page destroy_compound_gigantic_folio will turn the folio into a - * simple group of pages. After this the destructor does not - * apply. - * - */ - if (hstate_is_gigantic(h)) - folio_set_compound_dtor(folio, NULL_COMPOUND_DTOR); - else - folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR); + folio_clear_hugetlb(folio); } /* @@ -1690,7 +1672,7 @@ static void add_hugetlb_folio(struct hstate *h, struct folio *folio, h->surplus_huge_pages_node[nid]++; } - folio_set_compound_dtor(folio, HUGETLB_PAGE_DTOR); + folio_set_hugetlb(folio); folio_change_private(folio, NULL); /* * We have to set hugetlb_vmemmap_optimized again as above @@ -1814,9 +1796,8 @@ static void free_hpage_workfn(struct work_struct *work) /* * The VM_BUG_ON_FOLIO(!folio_test_hugetlb(folio), folio) in * folio_hstate() is going to trigger because a previous call to - * remove_hugetlb_folio() will call folio_set_compound_dtor - * (folio, NULL_COMPOUND_DTOR), so do not use folio_hstate() - * directly. + * remove_hugetlb_folio() will clear the hugetlb bit, so do + * not use folio_hstate() directly. */ h = size_to_hstate(page_size(page)); @@ -1955,7 +1936,7 @@ static void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) { hugetlb_vmemmap_optimize(h, &folio->page); INIT_LIST_HEAD(&folio->lru); - folio_set_compound_dtor(folio, HUGETLB_PAGE_DTOR); + folio_set_hugetlb(folio); hugetlb_set_folio_subpool(folio, NULL); set_hugetlb_cgroup(folio, NULL); set_hugetlb_cgroup_rsvd(folio, NULL); @@ -2070,28 +2051,10 @@ int PageHuge(struct page *page) if (!PageCompound(page)) return 0; folio = page_folio(page); - return folio->_folio_dtor == HUGETLB_PAGE_DTOR; + return folio_test_hugetlb(folio); } EXPORT_SYMBOL_GPL(PageHuge); -/** - * folio_test_hugetlb - Determine if the folio belongs to hugetlbfs - * @folio: The folio to test. - * - * Context: Any context. Caller should have a reference on the folio to - * prevent it from being turned into a tail page. 
- * Return: True for hugetlbfs folios, false for anon folios or folios - * belonging to other filesystems. - */ -bool folio_test_hugetlb(struct folio *folio) -{ - if (!folio_test_large(folio)) - return false; - - return folio->_folio_dtor == HUGETLB_PAGE_DTOR; -} -EXPORT_SYMBOL_GPL(folio_test_hugetlb); - /* * Find and lock address space (mapping) in write mode. * diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 31fec31be31efe..d96dc6a3077a6d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1112,7 +1112,7 @@ static __always_inline bool free_pages_prepare(struct page *page, VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); if (compound) - ClearPageHasHWPoisoned(page); + page[1].flags &= ~PAGE_FLAGS_SECOND; for (i = 1; i < (1 << order); i++) { if (compound) bad += free_tail_page_prepare(page, page + i); From de53c05f2ae3d47d30db58e9c4e54e3bbc868377 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 16 Aug 2023 16:11:56 +0100 Subject: [PATCH 413/489] mm: add large_rmappable page flag Stored in the first tail page's flags, this flag replaces the destructor. That removes the last of the destructors, so remove all references to folio_dtor and compound_dtor. 
Link: https://lkml.kernel.org/r/20230816151201.3655946-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Jens Axboe Cc: Sidhartha Kumar Cc: Yanteng Si Signed-off-by: Andrew Morton --- Documentation/admin-guide/kdump/vmcoreinfo.rst | 4 ++-- include/linux/mm.h | 13 ------------- include/linux/mm_types.h | 2 -- include/linux/page-flags.h | 7 ++++++- kernel/crash_core.c | 1 - mm/huge_memory.c | 4 ++-- mm/internal.h | 1 - mm/page_alloc.c | 7 +------ 8 files changed, 11 insertions(+), 28 deletions(-) diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst index baa1c355741d15..3bd38ac0e7de74 100644 --- a/Documentation/admin-guide/kdump/vmcoreinfo.rst +++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst @@ -141,8 +141,8 @@ nodemask_t The size of a nodemask_t type. Used to compute the number of online nodes. -(page, flags|_refcount|mapping|lru|_mapcount|private|compound_dtor|compound_order|compound_head) -------------------------------------------------------------------------------------------------- +(page, flags|_refcount|mapping|lru|_mapcount|private|compound_order|compound_head) +---------------------------------------------------------------------------------- User-space tools compute their values based on the offset of these variables. The variables are used when excluding unnecessary pages. 
diff --git a/include/linux/mm.h b/include/linux/mm.h index e241f5ee4dc4c4..63d5737819735d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1239,19 +1239,6 @@ void folio_copy(struct folio *dst, struct folio *src); unsigned long nr_free_buffer_pages(void); -enum compound_dtor_id { - COMPOUND_PAGE_DTOR, - TRANSHUGE_PAGE_DTOR, - NR_COMPOUND_DTORS, -}; - -static inline void folio_set_compound_dtor(struct folio *folio, - enum compound_dtor_id compound_dtor) -{ - VM_BUG_ON_FOLIO(compound_dtor >= NR_COMPOUND_DTORS, folio); - folio->_folio_dtor = compound_dtor; -} - void destroy_large_folio(struct folio *folio); /* Returns the number of bytes in this potentially compound page. */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index de5ac95572c818..1c5c2349c18ea1 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -264,7 +264,6 @@ static inline struct page *encoded_page_ptr(struct encoded_page *page) * @_refcount: Do not access this member directly. Use folio_ref_count() * to find how many references there are to this folio. * @memcg_data: Memory Control Group data. - * @_folio_dtor: Which destructor to use for this folio. * @_folio_order: Do not use directly, call folio_order(). * @_entire_mapcount: Do not use directly, call folio_entire_mapcount(). * @_nr_pages_mapped: Do not use directly, call folio_mapcount(). 
@@ -318,7 +317,6 @@ struct folio { unsigned long _flags_1; unsigned long _head_1; /* public: */ - unsigned char _folio_dtor; unsigned char _folio_order; atomic_t _entire_mapcount; atomic_t _nr_pages_mapped; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e9e7cc45352d43..85d54b6c9e0b69 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -190,6 +190,7 @@ enum pageflags { /* At least one page in this folio has the hwpoison flag set */ PG_has_hwpoisoned = PG_error, PG_hugetlb = PG_active, + PG_large_rmappable = PG_workingset, /* anon or file-backed */ }; #define PAGEFLAGS_MASK ((1UL << NR_PAGEFLAGS) - 1) @@ -806,6 +807,9 @@ static inline void ClearPageCompound(struct page *page) BUG_ON(!PageHead(page)); ClearPageHead(page); } +PAGEFLAG(LargeRmappable, large_rmappable, PF_SECOND) +#else +TESTPAGEFLAG_FALSE(LargeRmappable, large_rmappable) #endif #define PG_head_mask ((1UL << PG_head)) @@ -1077,7 +1081,8 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) * the CHECK_AT_FREE flags above, so need to be cleared. 
*/ #define PAGE_FLAGS_SECOND \ - (1UL << PG_has_hwpoisoned | 1UL << PG_hugetlb) + (1UL << PG_has_hwpoisoned | 1UL << PG_hugetlb | \ + 1UL << PG_large_rmappable) #define PAGE_FLAGS_PRIVATE \ (1UL << PG_private | 1UL << PG_private_2) diff --git a/kernel/crash_core.c b/kernel/crash_core.c index dd5f87047d068c..934dd86e19f594 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -455,7 +455,6 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(page, lru); VMCOREINFO_OFFSET(page, _mapcount); VMCOREINFO_OFFSET(page, private); - VMCOREINFO_OFFSET(folio, _folio_dtor); VMCOREINFO_OFFSET(folio, _folio_order); VMCOREINFO_OFFSET(page, compound_head); VMCOREINFO_OFFSET(pglist_data, node_zones); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 5817bf77f1f07c..2fe1ea187b6b45 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -581,7 +581,7 @@ void folio_prep_large_rmappable(struct folio *folio) { VM_BUG_ON_FOLIO(folio_order(folio) < 2, folio); INIT_LIST_HEAD(&folio->_deferred_list); - folio_set_compound_dtor(folio, TRANSHUGE_PAGE_DTOR); + folio_set_large_rmappable(folio); } static inline bool is_transparent_hugepage(struct page *page) @@ -593,7 +593,7 @@ static inline bool is_transparent_hugepage(struct page *page) folio = page_folio(page); return is_huge_zero_page(&folio->page) || - folio->_folio_dtor == TRANSHUGE_PAGE_DTOR; + folio_test_large_rmappable(folio); } static unsigned long __thp_get_unmapped_area(struct file *filp, diff --git a/mm/internal.h b/mm/internal.h index 30bbfcacc90964..5c0daea731f34b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -419,7 +419,6 @@ static inline void prep_compound_head(struct page *page, unsigned int order) { struct folio *folio = (struct folio *)page; - folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR); folio_set_order(folio, order); atomic_set(&folio->_entire_mapcount, -1); atomic_set(&folio->_nr_pages_mapped, 0); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d96dc6a3077a6d..452459836b7118 
100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -572,9 +572,6 @@ static inline void free_the_page(struct page *page, unsigned int order) * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded * in bit 0 of page->compound_head. The rest of bits is pointer to head page. * - * The first tail page's ->compound_dtor describes how to destroy the - * compound page. - * * The first tail page's ->compound_order holds the order of allocation. * This usage means that zero-order pages may not be compound. */ @@ -593,14 +590,12 @@ void prep_compound_page(struct page *page, unsigned int order) void destroy_large_folio(struct folio *folio) { - enum compound_dtor_id dtor = folio->_folio_dtor; - if (folio_test_hugetlb(folio)) { free_huge_folio(folio); return; } - if (folio_test_transhuge(folio) && dtor == TRANSHUGE_PAGE_DTOR) + if (folio_test_large_rmappable(folio)) folio_undo_large_rmappable(folio); mem_cgroup_uncharge(folio); From c704ae9797843402436190a6f094a035237fd012 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 16 Aug 2023 16:11:57 +0100 Subject: [PATCH 414/489] mm: rearrange page flags Move PG_writeback into bottom byte so that it can use PG_waiters in a later patch. Move PG_head into bottom byte as well to match with where 'order' is moving next. PG_active and PG_workingset move into the second byte to make room for them. By putting PG_head in bit 6, we ensure that it is cleared by assigning the folio order to the bottom byte of the first tail page (since the order cannot be larger than 63). 
Link: https://lkml.kernel.org/r/20230816151201.3655946-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Jens Axboe Cc: Sidhartha Kumar Cc: Yanteng Si Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 85d54b6c9e0b69..46fc05c648ff51 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -99,13 +99,15 @@ */ enum pageflags { PG_locked, /* Page is locked. Don't touch. */ + PG_writeback, /* Page is under writeback */ PG_referenced, PG_uptodate, PG_dirty, PG_lru, + PG_head, /* Must be in bit 6 */ + PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */ PG_active, PG_workingset, - PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */ PG_error, PG_slab, PG_owner_priv_1, /* Owner use. If pagecache, fs may use*/ @@ -113,8 +115,6 @@ enum pageflags { PG_reserved, PG_private, /* If pagecache, has fs-private data */ PG_private_2, /* If pagecache, has fs aux data */ - PG_writeback, /* Page is under writeback */ - PG_head, /* A head page */ PG_mappedtodisk, /* Has blocks allocated on-disk */ PG_reclaim, /* To be reclaimed asap */ PG_swapbacked, /* Page is backed by RAM/swap */ From ebc1baf5c9b46c2240c580a2fd992b2e48606dfa Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 16 Aug 2023 16:11:58 +0100 Subject: [PATCH 415/489] mm: free up a word in the first tail page Store the folio order in the low byte of the flags word in the first tail page. This frees up the word that was being used to store the order and dtor bytes previously. 
Link: https://lkml.kernel.org/r/20230816151201.3655946-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Jens Axboe Cc: Sidhartha Kumar Cc: Yanteng Si Signed-off-by: Andrew Morton --- include/linux/mm.h | 10 +++++----- include/linux/mm_types.h | 3 +-- include/linux/page-flags.h | 7 ++++--- kernel/crash_core.c | 1 - mm/internal.h | 2 +- 5 files changed, 11 insertions(+), 12 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 63d5737819735d..939386e0aedab9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1000,7 +1000,7 @@ struct inode; * compound_order() can be called without holding a reference, which means * that niceties like page_folio() don't work. These callers should be * prepared to handle wild return values. For example, PG_head may be - * set before _folio_order is initialised, or this may be a tail page. + * set before the order is initialised, or this may be a tail page. * See compaction.c for some good examples. 
*/ static inline unsigned int compound_order(struct page *page) @@ -1009,7 +1009,7 @@ static inline unsigned int compound_order(struct page *page) if (!test_bit(PG_head, &folio->flags)) return 0; - return folio->_folio_order; + return folio->_flags_1 & 0xff; } /** @@ -1025,7 +1025,7 @@ static inline unsigned int folio_order(struct folio *folio) { if (!folio_test_large(folio)) return 0; - return folio->_folio_order; + return folio->_flags_1 & 0xff; } #include @@ -1996,7 +1996,7 @@ static inline long folio_nr_pages(struct folio *folio) #ifdef CONFIG_64BIT return folio->_folio_nr_pages; #else - return 1L << folio->_folio_order; + return 1L << (folio->_flags_1 & 0xff); #endif } @@ -2014,7 +2014,7 @@ static inline unsigned long compound_nr(struct page *page) #ifdef CONFIG_64BIT return folio->_folio_nr_pages; #else - return 1L << folio->_folio_order; + return 1L << (folio->_flags_1 & 0xff); #endif } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 1c5c2349c18ea1..40afb3bbc309db 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -264,7 +264,6 @@ static inline struct page *encoded_page_ptr(struct encoded_page *page) * @_refcount: Do not access this member directly. Use folio_ref_count() * to find how many references there are to this folio. * @memcg_data: Memory Control Group data. - * @_folio_order: Do not use directly, call folio_order(). * @_entire_mapcount: Do not use directly, call folio_entire_mapcount(). * @_nr_pages_mapped: Do not use directly, call folio_mapcount(). * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). 
@@ -316,8 +315,8 @@ struct folio { struct { unsigned long _flags_1; unsigned long _head_1; + unsigned long _folio_avail; /* public: */ - unsigned char _folio_order; atomic_t _entire_mapcount; atomic_t _nr_pages_mapped; atomic_t _pincount; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 46fc05c648ff51..638b0a96b4c590 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -184,7 +184,8 @@ enum pageflags { /* * Flags only valid for compound pages. Stored in first tail page's - * flags word. + * flags word. Cannot use the first 8 flags or any flag marked as + * PF_ANY. */ /* At least one page in this folio has the hwpoison flag set */ @@ -1081,8 +1082,8 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) * the CHECK_AT_FREE flags above, so need to be cleared. */ #define PAGE_FLAGS_SECOND \ - (1UL << PG_has_hwpoisoned | 1UL << PG_hugetlb | \ - 1UL << PG_large_rmappable) + (0xffUL /* order */ | 1UL << PG_has_hwpoisoned | \ + 1UL << PG_hugetlb | 1UL << PG_large_rmappable) #define PAGE_FLAGS_PRIVATE \ (1UL << PG_private | 1UL << PG_private_2) diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 934dd86e19f594..693445e1f7f6fd 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -455,7 +455,6 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(page, lru); VMCOREINFO_OFFSET(page, _mapcount); VMCOREINFO_OFFSET(page, private); - VMCOREINFO_OFFSET(folio, _folio_order); VMCOREINFO_OFFSET(page, compound_head); VMCOREINFO_OFFSET(pglist_data, node_zones); VMCOREINFO_OFFSET(pglist_data, nr_zones); diff --git a/mm/internal.h b/mm/internal.h index 5c0daea731f34b..d1d4bf4e63c088 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -407,7 +407,7 @@ static inline void folio_set_order(struct folio *folio, unsigned int order) if (WARN_ON_ONCE(!order || !folio_test_large(folio))) return; - folio->_folio_order = order; + folio->_flags_1 = (folio->_flags_1 & ~0xffUL) | order; #ifdef 
CONFIG_64BIT folio->_folio_nr_pages = 1U << order; #endif From 6199277baf73a4877b42991b97c40e173e530756 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 16 Aug 2023 16:11:59 +0100 Subject: [PATCH 416/489] mm: remove folio_test_transhuge() This function is misleading; people think it means "Is this a THP", when all it actually does is check whether this is a large folio. Remove it; the one remaining user should have been checking to see whether the folio is PMD sized or not. Link: https://lkml.kernel.org/r/20230816151201.3655946-12-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Jens Axboe Cc: Sidhartha Kumar Cc: Yanteng Si Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 5 ----- mm/memcontrol.c | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 638b0a96b4c590..5c02720c53a584 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -853,11 +853,6 @@ static inline int PageTransHuge(struct page *page) return PageHead(page); } -static inline bool folio_test_transhuge(struct folio *folio) -{ - return folio_test_head(folio); -} - /* * PageTransCompound returns true for both transparent huge pages * and hugetlbfs pages, so it should only be called when it's known diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8e125aa5a18dca..de6b40f851130d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5755,7 +5755,7 @@ static int mem_cgroup_move_account(struct page *page, if (folio_mapped(folio)) { __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages); __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages); - if (folio_test_transhuge(folio)) { + if (folio_test_pmd_mappable(folio)) { __mod_lruvec_state(from_vec, NR_ANON_THPS, -nr_pages); __mod_lruvec_state(to_vec, NR_ANON_THPS, From b10ff04dc0ec7cc7dbb0eac98c4202ec8d28c21b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 16 Aug 2023 16:12:00 
+0100 Subject: [PATCH 417/489] mm: add tail private fields to struct folio Because THP_SWAP uses page->private for each page, we must not use the space which overlaps that field for anything which would conflict with that. We avoid the conflict on 32-bit systems by disallowing THP_SWAP on 32-bit. Link: https://lkml.kernel.org/r/20230816151201.3655946-13-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Jens Axboe Cc: Sidhartha Kumar Cc: Yanteng Si Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 40afb3bbc309db..06e9315bfe7cf5 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -322,8 +322,11 @@ struct folio { atomic_t _pincount; #ifdef CONFIG_64BIT unsigned int _folio_nr_pages; -#endif + /* 4 byte gap here */ /* private: the union with struct page is transitional */ + /* Fix THP_SWAP to not use tail->private */ + unsigned long _private_1; +#endif }; struct page __page_1; }; @@ -344,6 +347,9 @@ struct folio { /* public: */ struct list_head _deferred_list; /* private: the union with struct page is transitional */ + unsigned long _avail_2a; + /* Fix THP_SWAP to not use tail->private */ + unsigned long _private_2a; }; struct page __page_2; }; @@ -368,12 +374,18 @@ FOLIO_MATCH(memcg_data, memcg_data); offsetof(struct page, pg) + sizeof(struct page)) FOLIO_MATCH(flags, _flags_1); FOLIO_MATCH(compound_head, _head_1); +#ifdef CONFIG_64BIT +FOLIO_MATCH(private, _private_1); +#endif #undef FOLIO_MATCH #define FOLIO_MATCH(pg, fl) \ static_assert(offsetof(struct folio, fl) == \ offsetof(struct page, pg) + 2 * sizeof(struct page)) FOLIO_MATCH(flags, _flags_2); FOLIO_MATCH(compound_head, _head_2); +FOLIO_MATCH(flags, _flags_2a); +FOLIO_MATCH(compound_head, _head_2a); +FOLIO_MATCH(private, _private_2a); #undef FOLIO_MATCH /** From a644b0abbfe1d7cf775082cafdcc7b5f3c35becf Mon Sep 
17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 16 Aug 2023 16:12:01 +0100 Subject: [PATCH 418/489] mm: convert split_huge_pages_pid() to use a folio Replaces five calls to compound_head with one. Link: https://lkml.kernel.org/r/20230816151201.3655946-14-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Jens Axboe Cc: Sidhartha Kumar Cc: Yanteng Si Signed-off-by: Andrew Morton --- mm/huge_memory.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2fe1ea187b6b45..213bb1e3383094 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -584,14 +584,11 @@ void folio_prep_large_rmappable(struct folio *folio) folio_set_large_rmappable(folio); } -static inline bool is_transparent_hugepage(struct page *page) +static inline bool is_transparent_hugepage(struct folio *folio) { - struct folio *folio; - - if (!PageCompound(page)) + if (!folio_test_large(folio)) return false; - folio = page_folio(page); return is_huge_zero_page(&folio->page) || folio_test_large_rmappable(folio); } @@ -3012,6 +3009,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) { struct vm_area_struct *vma = vma_lookup(mm, addr); struct page *page; + struct folio *folio; if (!vma) break; @@ -3028,22 +3026,23 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, if (IS_ERR_OR_NULL(page)) continue; - if (!is_transparent_hugepage(page)) + folio = page_folio(page); + if (!is_transparent_hugepage(folio)) goto next; total++; - if (!can_split_folio(page_folio(page), NULL)) + if (!can_split_folio(folio, NULL)) goto next; - if (!trylock_page(page)) + if (!folio_trylock(folio)) goto next; - if (!split_huge_page(page)) + if (!split_folio(folio)) split++; - unlock_page(page); + folio_unlock(folio); next: - put_page(page); + folio_put(folio); cond_resched(); } mmap_read_unlock(mm); From 
6c1419730822fe991fc15bfd7059f6872a71a7af Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 22 Aug 2023 15:30:43 -0700 Subject: [PATCH 419/489] hugetlb: clear flags in tail pages that will be freed individually hugetlb manually creates and destroys compound pages. As such it makes assumptions about struct page layout. Commit ebc1baf5c9b4 ("mm: free up a word in the first tail page") breaks hugetlb. The following will fix the breakage. Link: https://lkml.kernel.org/r/20230822231741.GC4509@monkey Fixes: ebc1baf5c9b4 ("mm: free up a word in the first tail page") Signed-off-by: Mike Kravetz Cc: Jens Axboe Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/hugetlb.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a82c3104337e88..cbc25826c9b04c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1484,6 +1484,7 @@ static void __destroy_compound_gigantic_folio(struct folio *folio, for (i = 1; i < nr_pages; i++) { p = folio_page(folio, i); + p->flags &= ~PAGE_FLAGS_CHECK_AT_FREE; p->mapping = NULL; clear_compound_head(p); if (!demote) @@ -1702,8 +1703,6 @@ static void add_hugetlb_folio(struct hstate *h, struct folio *folio, static void __update_and_free_hugetlb_folio(struct hstate *h, struct folio *folio) { - int i; - struct page *subpage; bool clear_dtor = folio_test_hugetlb_vmemmap_optimized(folio); if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) @@ -1745,14 +1744,6 @@ static void __update_and_free_hugetlb_folio(struct hstate *h, spin_unlock_irq(&hugetlb_lock); } - for (i = 0; i < pages_per_huge_page(h); i++) { - subpage = folio_page(folio, i); - subpage->flags &= ~(1 << PG_locked | 1 << PG_error | - 1 << PG_referenced | 1 << PG_dirty | - 1 << PG_active | 1 << PG_private | - 1 << PG_writeback); - } - /* * Non-gigantic pages demoted from CMA allocated gigantic pages * need to be given back to CMA in free_gigantic_folio. 
From a98460494b16db9c377e55bc13e5407a0eb79fe8 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 21 Aug 2023 12:51:20 -0700 Subject: [PATCH 420/489] mm/khugepaged: fix collapse_pte_mapped_thp() versus uffd Jann Horn demonstrated how userfaultfd ioctl UFFDIO_COPY into a private shmem mapping can add valid PTEs to page table collapse_pte_mapped_thp() thought it had emptied: page lock on the huge page is enough to protect against WP faults (which find the PTE has been cleared), but not enough to protect against userfaultfd. "BUG: Bad rss-counter state" followed. retract_page_tables() protects against this by checking !vma->anon_vma; but we know that MADV_COLLAPSE needs to be able to work on private shmem mappings, even those with an anon_vma prepared for another part of the mapping; and we know that MADV_COLLAPSE needs to work on shared shmem mappings which are userfaultfd_armed(). Whether it needs to work on private shmem mappings which are userfaultfd_armed(), I'm not so sure: but assume that it does. Just for this case, take the pmd_lock() two steps earlier: not because it gives any protection against this case itself, but because ptlock nests inside it, and it's the dropping of ptlock which let the bug in. In other cases, continue to minimize the pmd_lock() hold time. 
Link: https://lkml.kernel.org/r/4d31abf5-56c0-9f3d-d12f-c9317936691@google.com Fixes: 1043173eb5eb ("mm/khugepaged: collapse_pte_mapped_thp() with mmap_read_lock()") Signed-off-by: Hugh Dickins Reported-by: Jann Horn Closes: https://lore.kernel.org/linux-mm/CAG48ez0FxiRC4d3VTu_a9h=rg5FW-kYD5Rg5xo_RDBM0LTTqZQ@mail.gmail.com/ Acked-by: Peter Xu Cc: David Hildenbrand Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/khugepaged.c | 38 +++++++++++++++++++++++++++++--------- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 40d43eccdee866..d5650541083a03 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1476,7 +1476,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, struct page *hpage; pte_t *start_pte, *pte; pmd_t *pmd, pgt_pmd; - spinlock_t *pml, *ptl; + spinlock_t *pml = NULL, *ptl; int nr_ptes = 0, result = SCAN_FAIL; int i; @@ -1572,9 +1572,25 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, haddr, haddr + HPAGE_PMD_SIZE); mmu_notifier_invalidate_range_start(&range); notified = true; - start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); + + /* + * pmd_lock covers a wider range than ptl, and (if split from mm's + * page_table_lock) ptl nests inside pml. The less time we hold pml, + * the better; but userfaultfd's mfill_atomic_pte() on a private VMA + * inserts a valid as-if-COWed PTE without even looking up page cache. + * So page lock of hpage does not protect from it, so we must not drop + * ptl before pgt_pmd is removed, so uffd private needs pml taken now. 
+ */ + if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED)) + pml = pmd_lock(mm, pmd); + + start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl); if (!start_pte) /* mmap_lock + page lock should prevent this */ goto abort; + if (!pml) + spin_lock(ptl); + else if (ptl != pml) + spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); /* step 2: clear page table and adjust rmap */ for (i = 0, addr = haddr, pte = start_pte; @@ -1608,7 +1624,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, nr_ptes++; } - pte_unmap_unlock(start_pte, ptl); + pte_unmap(start_pte); + if (!pml) + spin_unlock(ptl); /* step 3: set proper refcount and mm_counters. */ if (nr_ptes) { @@ -1616,12 +1634,12 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, add_mm_counter(mm, mm_counter_file(hpage), -nr_ptes); } - /* step 4: remove page table */ - - /* Huge page lock is still held, so page table must remain empty */ - pml = pmd_lock(mm, pmd); - if (ptl != pml) - spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); + /* step 4: remove empty page table */ + if (!pml) { + pml = pmd_lock(mm, pmd); + if (ptl != pml) + spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); + } pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd); pmdp_get_lockless_sync(); if (ptl != pml) @@ -1648,6 +1666,8 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, } if (start_pte) pte_unmap_unlock(start_pte, ptl); + if (pml && pml != ptl) + spin_unlock(pml); if (notified) mmu_notifier_invalidate_range_end(&range); drop_hpage: From 08dff2810e8feb3096bf5c8242ab1649d1e8b1a4 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sat, 12 Aug 2023 16:56:25 +0100 Subject: [PATCH 421/489] mm/memory.c: fix mismerge Fix a build issue. 
Link: https://lkml.kernel.org/r/ZNerqcNS4EBJA/2v@casper.infradead.org Fixes: 4aaa60dad4d1 ("mm: allow per-VMA locks on file-backed VMAs") Signed-off-by: Matthew Wilcox Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202308121909.XNYBtqNI-lkp@intel.com/ Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 4a7c8be9fe71f2..f9c3ad48982310 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5457,7 +5457,7 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, * concurrent mremap() with MREMAP_DONTUNMAP could dissociate the VMA * from its anon_vma. */ - if (unlikely(!vma->anon_vma && !vma_is_tcp(vma))) + if (vma_is_anonymous(vma) && !vma->anon_vma) goto inval_end_read; /* From d51b68469bc7804c34622f7f3d4889628d37cfd6 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Sat, 1 Jul 2023 15:28:37 +0800 Subject: [PATCH 422/489] mm: memory-failure: fix potential page refcnt leak in memory_failure() put_ref_page() is not called to drop the extra refcnt when the call comes from madvise and the pfn is valid but pgmap is NULL, leading to a page refcnt leak.
Link: https://lkml.kernel.org/r/20230701072837.1994253-1-linmiaohe@huawei.com Fixes: 1e8aaedb182d ("mm,memory_failure: always pin the page in madvise_inject_error") Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 55dfe8a7bf4bf9..881c35ef1daa88 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2117,8 +2117,6 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, { int rc = -ENXIO; - put_ref_page(pfn, flags); - /* device metadata space is not recoverable */ if (!pgmap_pfn_valid(pgmap, pfn)) goto out; @@ -2193,6 +2191,7 @@ int memory_failure(unsigned long pfn, int flags) if (pfn_valid(pfn)) { pgmap = get_dev_pagemap(pfn, NULL); + put_ref_page(pfn, flags); if (pgmap) { res = memory_failure_dev_pagemap(pfn, flags, pgmap); From b243dcbf2f13856e39e18df3a15a65f6fe33db85 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 30 Jun 2023 14:19:52 -0700 Subject: [PATCH 423/489] swap: remove remnants of polling from read_swap_cache_async Patch series "Per-VMA lock support for swap and userfaults", v7. When per-VMA locks were introduced in [1] several types of page faults would still fall back to mmap_lock to keep the patchset simple. Among them are swap and userfault pages. The main reason for skipping those cases was the fact that mmap_lock could be dropped while handling these faults and that required additional logic to be implemented. Implement the mechanism to allow per-VMA locks to be dropped for these cases. First, change handle_mm_fault to drop per-VMA locks when returning VM_FAULT_RETRY or VM_FAULT_COMPLETED to be consistent with the way mmap_lock is handled. Then change folio_lock_or_retry to accept vm_fault and return vm_fault_t which simplifies later patches. 
Finally allow swap and uffd page faults to be handled under per-VMA locks by dropping per-VMA and retrying, the same way it's done under mmap_lock. Naturally, once VMA lock is dropped that VMA should be assumed unstable and can't be used. This patch (of 6): Commit [1] introduced IO polling support during swapin to reduce swap read latency for block devices that can be polled. However later commit [2] removed polling support. Therefore it seems safe to remove do_poll parameter in read_swap_cache_async and always call swap_readpage with synchronous=false waiting for IO completion in folio_lock_or_retry. [1] commit 23955622ff8d ("swap: add block io poll in swapin path") [2] commit 9650b453a3d4 ("block: ignore RWF_HIPRI hint for sync dio") Link: https://lkml.kernel.org/r/20230630211957.1341547-1-surenb@google.com Link: https://lkml.kernel.org/r/20230630211957.1341547-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Suggested-by: "Huang, Ying" Reviewed-by: "Huang, Ying" Reviewed-by: Christoph Hellwig Cc: Alistair Popple Cc: Al Viro Cc: Christian Brauner Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hillf Danton Cc: Hugh Dickins Cc: Jan Kara Cc: Johannes Weiner Cc: Josef Bacik Cc: Laurent Dufour Cc: Liam R.
Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Michal Hocko Cc: Michel Lespinasse Cc: Minchan Kim Cc: Pavel Tatashin Cc: Peter Xu Cc: Punit Agrawal Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/madvise.c | 4 ++-- mm/swap.h | 1 - mm/swap_state.c | 12 +++++------- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index b1f53a95e3a53c..4dded5d27e7eaa 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -217,7 +217,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, ptep = NULL; page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE, - vma, addr, false, &splug); + vma, addr, &splug); if (page) put_page(page); } @@ -262,7 +262,7 @@ static void shmem_swapin_range(struct vm_area_struct *vma, rcu_read_unlock(); page = read_swap_cache_async(entry, mapping_gfp_mask(mapping), - vma, addr, false, &splug); + vma, addr, &splug); if (page) put_page(page); diff --git a/mm/swap.h b/mm/swap.h index 7c033d793f15f5..8a3c7a0ace4f0c 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -46,7 +46,6 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping, struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, - bool do_poll, struct swap_iocb **plug); struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, diff --git a/mm/swap_state.c b/mm/swap_state.c index d157862ba0a698..01f15139b7d9e5 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -526,15 +526,14 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, */ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, - unsigned long addr, bool do_poll, - struct swap_iocb **plug) + unsigned long addr, struct swap_iocb **plug) { bool page_was_allocated; struct page *retpage = __read_swap_cache_async(entry, gfp_mask, vma, addr, &page_was_allocated); if (page_was_allocated) - 
swap_readpage(retpage, do_poll, plug); + swap_readpage(retpage, false, plug); return retpage; } @@ -629,7 +628,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, struct swap_info_struct *si = swp_swap_info(entry); struct blk_plug plug; struct swap_iocb *splug = NULL; - bool do_poll = true, page_allocated; + bool page_allocated; struct vm_area_struct *vma = vmf->vma; unsigned long addr = vmf->address; @@ -637,7 +636,6 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, if (!mask) goto skip; - do_poll = false; /* Read a page_cluster sized and aligned cluster around offset. */ start_offset = offset & ~mask; end_offset = offset | mask; @@ -669,7 +667,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, lru_add_drain(); /* Push any new pages onto the LRU now */ skip: /* The page was likely read above, so no need for plugging here */ - return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll, NULL); + return read_swap_cache_async(entry, gfp_mask, vma, addr, NULL); } int init_swap_address_space(unsigned int type, unsigned long nr_pages) @@ -837,7 +835,7 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, skip: /* The page was likely read above, so no need for plugging here */ return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address, - ra_info.win == 1, NULL); + NULL); } /** From 7a32b58be9bab8f0440a4af526bfb1269e5affdb Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 30 Jun 2023 14:19:53 -0700 Subject: [PATCH 424/489] mm: add missing VM_FAULT_RESULT_TRACE name for VM_FAULT_COMPLETED VM_FAULT_RESULT_TRACE should contain an element for every vm_fault_reason to be used as flag_array inside trace_print_flags_seq(). The element for VM_FAULT_COMPLETED is missing, add it. 
Link: https://lkml.kernel.org/r/20230630211957.1341547-3-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Peter Xu Cc: Alistair Popple Cc: Al Viro Cc: Christian Brauner Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hillf Danton Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Johannes Weiner Cc: Josef Bacik Cc: Laurent Dufour Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Michal Hocko Cc: Michel Lespinasse Cc: Minchan Kim Cc: Pavel Tatashin Cc: Punit Agrawal Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 06e9315bfe7cf5..2b9d8be28361e6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1184,7 +1184,8 @@ enum vm_fault_reason { { VM_FAULT_RETRY, "RETRY" }, \ { VM_FAULT_FALLBACK, "FALLBACK" }, \ { VM_FAULT_DONE_COW, "DONE_COW" }, \ - { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" } + { VM_FAULT_NEEDDSYNC, "NEEDDSYNC" }, \ + { VM_FAULT_COMPLETED, "COMPLETED" } struct vm_special_mapping { const char *name; /* The name, e.g. "[vdso]". */ From 4089eef0e6ac1a179c58304c657b3df3bb6fe509 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 30 Jun 2023 14:19:54 -0700 Subject: [PATCH 425/489] mm: drop per-VMA lock when returning VM_FAULT_RETRY or VM_FAULT_COMPLETED handle_mm_fault returning VM_FAULT_RETRY or VM_FAULT_COMPLETED means mmap_lock has been released. However with per-VMA locks behavior is different and the caller should still release it. To make the rules consistent for the caller, drop the per-VMA lock when returning VM_FAULT_RETRY or VM_FAULT_COMPLETED. Currently the only path returning VM_FAULT_RETRY under per-VMA locks is do_swap_page and no path returns VM_FAULT_COMPLETED for now. 
[willy@infradead.org: fix riscv] Link: https://lkml.kernel.org/r/CAJuCfpE6GWEx1rPBmNpUfoD5o-gNFz9-UFywzCE2PbEGBiVz7g@mail.gmail.com Link: https://lkml.kernel.org/r/20230630211957.1341547-4-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Peter Xu Tested-by: Conor Dooley Cc: Alistair Popple Cc: Al Viro Cc: Christian Brauner Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hillf Danton Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Johannes Weiner Cc: Josef Bacik Cc: Laurent Dufour Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Michal Hocko Cc: Michel Lespinasse Cc: Minchan Kim Cc: Pavel Tatashin Cc: Punit Agrawal Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton --- arch/arm64/mm/fault.c | 3 ++- arch/powerpc/mm/fault.c | 3 ++- arch/riscv/mm/fault.c | 3 ++- arch/s390/mm/fault.c | 3 ++- arch/x86/mm/fault.c | 3 ++- mm/memory.c | 12 ++++++++++++ 6 files changed, 22 insertions(+), 5 deletions(-) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 103fcbdc65526f..2e5d1e238af958 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -599,7 +599,8 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, goto lock_mmap; } fault = handle_mm_fault(vma, addr, mm_flags | FAULT_FLAG_VMA_LOCK, regs); - vma_end_read(vma); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index fafce6bdeff0fd..b1723094d464cf 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -488,7 +488,8 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, } fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); - vma_end_read(vma); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { 
count_vm_vma_lock_event(VMA_LOCK_SUCCESS); diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c index 046732fcb48ca3..6115d751497200 100644 --- a/arch/riscv/mm/fault.c +++ b/arch/riscv/mm/fault.c @@ -296,7 +296,8 @@ void handle_page_fault(struct pt_regs *regs) } fault = handle_mm_fault(vma, addr, flags | FAULT_FLAG_VMA_LOCK, regs); - vma_end_read(vma); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 6f6b9881e55e6c..a063774ba58433 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -417,7 +417,8 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) goto lock_mmap; } fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); - vma_end_read(vma); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); if (likely(!(fault & VM_FAULT_ERROR))) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 787da09d24f3fb..2e861b9360c75e 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1340,7 +1340,8 @@ void do_user_addr_fault(struct pt_regs *regs, goto lock_mmap; } fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); - vma_end_read(vma); + if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED))) + vma_end_read(vma); if (!(fault & VM_FAULT_RETRY)) { count_vm_vma_lock_event(VMA_LOCK_SUCCESS); diff --git a/mm/memory.c b/mm/memory.c index f9c3ad48982310..b9c3780fd426f6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3747,6 +3747,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (vmf->flags & FAULT_FLAG_VMA_LOCK) { ret = VM_FAULT_RETRY; + vma_end_read(vma); goto out; } @@ -5248,6 +5249,17 @@ static vm_fault_t sanitize_fault_flags(struct vm_area_struct *vma, !is_cow_mapping(vma->vm_flags))) return VM_FAULT_SIGSEGV; } +#ifdef 
CONFIG_PER_VMA_LOCK + /* + * Per-VMA locks can't be used with FAULT_FLAG_RETRY_NOWAIT because of + * the assumption that lock is dropped on VM_FAULT_RETRY. + */ + if (WARN_ON_ONCE((*flags & + (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT)) == + (FAULT_FLAG_VMA_LOCK | FAULT_FLAG_RETRY_NOWAIT))) + return VM_FAULT_SIGSEGV; +#endif + return 0; } From fdc724d6aa44efd75cc9b6a3c3900baac44bc50a Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 30 Jun 2023 14:19:55 -0700 Subject: [PATCH 426/489] mm: change folio_lock_or_retry to use vm_fault directly Change folio_lock_or_retry to accept vm_fault struct and return the vm_fault_t directly. Link: https://lkml.kernel.org/r/20230630211957.1341547-5-surenb@google.com Signed-off-by: Suren Baghdasaryan Suggested-by: Matthew Wilcox Acked-by: Peter Xu Cc: Alistair Popple Cc: Al Viro Cc: Christian Brauner Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hillf Danton Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Johannes Weiner Cc: Josef Bacik Cc: Laurent Dufour Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Michel Lespinasse Cc: Minchan Kim Cc: Pavel Tatashin Cc: Punit Agrawal Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 11 ++++++----- mm/filemap.c | 22 ++++++++++++---------- mm/memory.c | 14 ++++++-------- 3 files changed, 24 insertions(+), 23 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index f4f24b594cd7ac..437e4526028c22 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -916,8 +916,7 @@ static inline bool wake_page_match(struct wait_page_queue *wait_page, void __folio_lock(struct folio *folio); int __folio_lock_killable(struct folio *folio); -bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm, - unsigned int flags); +vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf); void unlock_page(struct page *page); void folio_unlock(struct folio *folio); @@ -1021,11 +1020,13 @@ static inline int folio_lock_killable(struct folio *folio) * Return value and mmap_lock implications depend on flags; see * __folio_lock_or_retry(). */ -static inline bool folio_lock_or_retry(struct folio *folio, - struct mm_struct *mm, unsigned int flags) +static inline vm_fault_t folio_lock_or_retry(struct folio *folio, + struct vm_fault *vmf) { might_sleep(); - return folio_trylock(folio) || __folio_lock_or_retry(folio, mm, flags); + if (!folio_trylock(folio)) + return __folio_lock_or_retry(folio, vmf); + return 0; } /* diff --git a/mm/filemap.c b/mm/filemap.c index dd022b065614ba..40514493014a15 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1669,32 +1669,34 @@ static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait) /* * Return values: - * true - folio is locked; mmap_lock is still held. - * false - folio is not locked. + * 0 - folio is locked. + * non-zero - folio is not locked. 
* mmap_lock has been released (mmap_read_unlock(), unless flags had both * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in * which case mmap_lock is still held. * - * If neither ALLOW_RETRY nor KILLABLE are set, will always return true + * If neither ALLOW_RETRY nor KILLABLE are set, will always return 0 * with the folio locked and the mmap_lock unperturbed. */ -bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm, - unsigned int flags) +vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf) { + struct mm_struct *mm = vmf->vma->vm_mm; + unsigned int flags = vmf->flags; + if (fault_flag_allow_retry_first(flags)) { /* * CAUTION! In this case, mmap_lock is not released - * even though return 0. + * even though return VM_FAULT_RETRY. */ if (flags & FAULT_FLAG_RETRY_NOWAIT) - return false; + return VM_FAULT_RETRY; mmap_read_unlock(mm); if (flags & FAULT_FLAG_KILLABLE) folio_wait_locked_killable(folio); else folio_wait_locked(folio); - return false; + return VM_FAULT_RETRY; } if (flags & FAULT_FLAG_KILLABLE) { bool ret; @@ -1702,13 +1704,13 @@ bool __folio_lock_or_retry(struct folio *folio, struct mm_struct *mm, ret = __folio_lock_killable(folio); if (ret) { mmap_read_unlock(mm); - return false; + return VM_FAULT_RETRY; } } else { __folio_lock(folio); } - return true; + return 0; } /** diff --git a/mm/memory.c b/mm/memory.c index b9c3780fd426f6..080e1d59d752e8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3599,6 +3599,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) struct folio *folio = page_folio(vmf->page); struct vm_area_struct *vma = vmf->vma; struct mmu_notifier_range range; + vm_fault_t ret; /* * We need a reference to lock the folio because we don't hold @@ -3611,9 +3612,10 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) if (!folio_try_get(folio)) return 0; - if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags)) { + ret = folio_lock_or_retry(folio, vmf); + if 
(ret) { folio_put(folio); - return VM_FAULT_RETRY; + return ret; } mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma->vm_mm, vmf->address & PAGE_MASK, @@ -3738,7 +3740,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) bool exclusive = false; swp_entry_t entry; pte_t pte; - int locked; vm_fault_t ret = 0; void *shadow = NULL; @@ -3861,12 +3862,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out_release; } - locked = folio_lock_or_retry(folio, vma->vm_mm, vmf->flags); - - if (!locked) { - ret |= VM_FAULT_RETRY; + ret |= folio_lock_or_retry(folio, vmf); + if (ret & VM_FAULT_RETRY) goto out_release; - } if (swapcache) { /* From 1235ccd05b6dd6970ff50baea99aa994023fbc4a Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 30 Jun 2023 14:19:56 -0700 Subject: [PATCH 427/489] mm: handle swap page faults under per-VMA lock When page fault is handled under per-VMA lock protection, all swap page faults are retried with mmap_lock because folio_lock_or_retry has to drop and reacquire mmap_lock if folio could not be immediately locked. Follow the same pattern as mmap_lock to drop per-VMA lock when waiting for folio and retrying once folio is available. With this obstacle removed, enable do_swap_page to operate under per-VMA lock protection. Drivers implementing ops->migrate_to_ram might still rely on mmap_lock, therefore we have to fall back to mmap_lock in that particular case. Note that the only time do_swap_page calls synchronous swap_readpage is when SWP_SYNCHRONOUS_IO is set, which is only set for QUEUE_FLAG_SYNCHRONOUS devices: brd, zram and nvdimms (both btt and pmem). Therefore we don't sleep in this path, and there's no need to drop the mmap or per-VMA lock. 
Link: https://lkml.kernel.org/r/20230630211957.1341547-6-surenb@google.com Signed-off-by: Suren Baghdasaryan Tested-by: Alistair Popple Reviewed-by: Alistair Popple Acked-by: Peter Xu Cc: Al Viro Cc: Christian Brauner Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hillf Danton Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Johannes Weiner Cc: Josef Bacik Cc: Laurent Dufour Cc: Liam R. Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Michal Hocko Cc: Michel Lespinasse Cc: Minchan Kim Cc: Pavel Tatashin Cc: Punit Agrawal Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/mm.h | 13 +++++++++++++ mm/filemap.c | 17 ++++++++--------- mm/memory.c | 16 ++++++++++------ 3 files changed, 31 insertions(+), 15 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 939386e0aedab9..0d16208178c7b6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -729,6 +729,14 @@ static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) vma->detached = detached; } +static inline void release_fault_lock(struct vm_fault *vmf) +{ + if (vmf->flags & FAULT_FLAG_VMA_LOCK) + vma_end_read(vmf->vma); + else + mmap_read_unlock(vmf->vma->vm_mm); +} + struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address); @@ -749,6 +757,11 @@ static inline struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, return NULL; } +static inline void release_fault_lock(struct vm_fault *vmf) +{ + mmap_read_unlock(vmf->vma->vm_mm); +} + #endif /* CONFIG_PER_VMA_LOCK */ extern const struct vm_operations_struct vma_dummy_vm_ops; diff --git a/mm/filemap.c b/mm/filemap.c index 40514493014a15..8040545954bc41 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1671,27 +1671,26 @@ static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait) * Return values: * 0 - folio is locked. * non-zero - folio is not locked. 
- * mmap_lock has been released (mmap_read_unlock(), unless flags had both - * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in - * which case mmap_lock is still held. + * mmap_lock or per-VMA lock has been released (mmap_read_unlock() or + * vma_end_read()), unless flags had both FAULT_FLAG_ALLOW_RETRY and + * FAULT_FLAG_RETRY_NOWAIT set, in which case the lock is still held. * * If neither ALLOW_RETRY nor KILLABLE are set, will always return 0 - * with the folio locked and the mmap_lock unperturbed. + * with the folio locked and the mmap_lock/per-VMA lock is left unperturbed. */ vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf) { - struct mm_struct *mm = vmf->vma->vm_mm; unsigned int flags = vmf->flags; if (fault_flag_allow_retry_first(flags)) { /* - * CAUTION! In this case, mmap_lock is not released - * even though return VM_FAULT_RETRY. + * CAUTION! In this case, mmap_lock/per-VMA lock is not + * released even though returning VM_FAULT_RETRY. */ if (flags & FAULT_FLAG_RETRY_NOWAIT) return VM_FAULT_RETRY; - mmap_read_unlock(mm); + release_fault_lock(vmf); if (flags & FAULT_FLAG_KILLABLE) folio_wait_locked_killable(folio); else @@ -1703,7 +1702,7 @@ vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf) ret = __folio_lock_killable(folio); if (ret) { - mmap_read_unlock(mm); + release_fault_lock(vmf); return VM_FAULT_RETRY; } } else { diff --git a/mm/memory.c b/mm/memory.c index 080e1d59d752e8..5748a41c164c5b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3746,12 +3746,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (!pte_unmap_same(vmf)) goto out; - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - ret = VM_FAULT_RETRY; - vma_end_read(vma); - goto out; - } - entry = pte_to_swp_entry(vmf->orig_pte); if (unlikely(non_swap_entry(entry))) { if (is_migration_entry(entry)) { @@ -3761,6 +3755,16 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) vmf->page = pfn_swap_entry_to_page(entry); ret = 
remove_device_exclusive_entry(vmf); } else if (is_device_private_entry(entry)) { + if (vmf->flags & FAULT_FLAG_VMA_LOCK) { + /* + * migrate_to_ram is not yet ready to operate + * under VMA lock. + */ + vma_end_read(vma); + ret = VM_FAULT_RETRY; + goto out; + } + vmf->page = pfn_swap_entry_to_page(entry); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); From 29a22b9e08d70d6c9b075c12c47b6e895cb65cf0 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Fri, 30 Jun 2023 14:19:57 -0700 Subject: [PATCH 428/489] mm: handle userfaults under VMA lock Enable handle_userfault to operate under VMA lock by releasing VMA lock instead of mmap_lock and retrying. Note that FAULT_FLAG_RETRY_NOWAIT should never be used when handling faults under per-VMA lock protection because that would break the assumption that lock is dropped on retry. [surenb@google.com: fix a lockdep issue in vma_assert_write_locked] Link: https://lkml.kernel.org/r/20230712195652.969194-1-surenb@google.com Link: https://lkml.kernel.org/r/20230630211957.1341547-7-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Peter Xu Cc: Alistair Popple Cc: Al Viro Cc: Christian Brauner Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: Hillf Danton Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Jan Kara Cc: Johannes Weiner Cc: Josef Bacik Cc: Laurent Dufour Cc: Liam R. 
Howlett Cc: Lorenzo Stoakes Cc: Matthew Wilcox Cc: Michal Hocko Cc: Michel Lespinasse Cc: Minchan Kim Cc: Pavel Tatashin Cc: Punit Agrawal Cc: Vlastimil Babka Cc: Yu Zhao Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 34 ++++++++++++++-------------------- include/linux/mm.h | 20 ++++++++++++++++++++ mm/memory.c | 9 +-------- 3 files changed, 35 insertions(+), 28 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 70bd2951b68d62..1091cb46174743 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -277,17 +277,16 @@ static inline struct uffd_msg userfault_msg(unsigned long address, * hugepmd ranges. */ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, - struct vm_area_struct *vma, - unsigned long address, - unsigned long flags, - unsigned long reason) + struct vm_fault *vmf, + unsigned long reason) { + struct vm_area_struct *vma = vmf->vma; pte_t *ptep, pte; bool ret = true; - mmap_assert_locked(ctx->mm); + assert_fault_locked(vmf); - ptep = hugetlb_walk(vma, address, vma_mmu_pagesize(vma)); + ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma)); if (!ptep) goto out; @@ -308,10 +307,8 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, } #else static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, - struct vm_area_struct *vma, - unsigned long address, - unsigned long flags, - unsigned long reason) + struct vm_fault *vmf, + unsigned long reason) { return false; /* should never get here */ } @@ -325,11 +322,11 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, * threads. 
*/ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, - unsigned long address, - unsigned long flags, + struct vm_fault *vmf, unsigned long reason) { struct mm_struct *mm = ctx->mm; + unsigned long address = vmf->address; pgd_t *pgd; p4d_t *p4d; pud_t *pud; @@ -338,7 +335,7 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, pte_t ptent; bool ret = true; - mmap_assert_locked(mm); + assert_fault_locked(vmf); pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) @@ -440,7 +437,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) * Coredumping runs without mmap_lock so we can only check that * the mmap_lock is held, if PF_DUMPCORE was not set. */ - mmap_assert_locked(mm); + assert_fault_locked(vmf); ctx = vma->vm_userfaultfd_ctx.ctx; if (!ctx) @@ -556,15 +553,12 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) spin_unlock_irq(&ctx->fault_pending_wqh.lock); if (!is_vm_hugetlb_page(vma)) - must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags, - reason); + must_wait = userfaultfd_must_wait(ctx, vmf, reason); else - must_wait = userfaultfd_huge_must_wait(ctx, vma, - vmf->address, - vmf->flags, reason); + must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason); if (is_vm_hugetlb_page(vma)) hugetlb_vma_unlock_read(vma); - mmap_read_unlock(mm); + release_fault_lock(vmf); if (likely(must_wait && !READ_ONCE(ctx->released))) { wake_up_poll(&ctx->fd_wqh, EPOLLIN); diff --git a/include/linux/mm.h b/include/linux/mm.h index 0d16208178c7b6..c1db400e83cb0f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -679,6 +679,7 @@ static inline void vma_end_read(struct vm_area_struct *vma) rcu_read_unlock(); } +/* WARNING! 
Can only be used if mmap_lock is expected to be write-locked */ static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) { mmap_assert_write_locked(vma->vm_mm); @@ -721,6 +722,12 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma) VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma); } +static inline void vma_assert_locked(struct vm_area_struct *vma) +{ + if (!rwsem_is_locked(&vma->vm_lock->lock)) + vma_assert_write_locked(vma); +} + static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) { /* When detaching vma should be write-locked */ @@ -737,6 +744,14 @@ static inline void release_fault_lock(struct vm_fault *vmf) mmap_read_unlock(vmf->vma->vm_mm); } +static inline void assert_fault_locked(struct vm_fault *vmf) +{ + if (vmf->flags & FAULT_FLAG_VMA_LOCK) + vma_assert_locked(vmf->vma); + else + mmap_assert_locked(vmf->vma->vm_mm); +} + struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, unsigned long address); @@ -762,6 +777,11 @@ static inline void release_fault_lock(struct vm_fault *vmf) mmap_read_unlock(vmf->vma->vm_mm); } +static inline void assert_fault_locked(struct vm_fault *vmf) +{ + mmap_assert_locked(vmf->vma->vm_mm); +} + #endif /* CONFIG_PER_VMA_LOCK */ extern const struct vm_operations_struct vma_dummy_vm_ops; diff --git a/mm/memory.c b/mm/memory.c index 5748a41c164c5b..2c6f45d18b73f5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5471,14 +5471,7 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, * concurrent mremap() with MREMAP_DONTUNMAP could dissociate the VMA * from its anon_vma. */ - if (vma_is_anonymous(vma) && !vma->anon_vma) - goto inval_end_read; - - /* - * Due to the possibility of userfault handler dropping mmap_lock, avoid - * it for now and fall back to page fault handling under mmap_lock. 
- */ - if (userfaultfd_armed(vma)) + if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) goto inval_end_read; /* Check since vm_start/vm_end might change before we lock the VMA */ From f82e6bf9bb9b6e1dc001320a88eee67d7ac31e96 Mon Sep 17 00:00:00 2001 From: Yosry Ahmed Date: Wed, 26 Jul 2023 15:32:23 +0000 Subject: [PATCH 429/489] mm: memcg: use rstat for non-hierarchical stats Currently, memcg uses rstat to maintain aggregated hierarchical stats. Counters are maintained for hierarchical stats at each memcg. Rstat tracks which cgroups have updates on which cpus to keep those counters fresh on the read-side. Non-hierarchical stats are currently not covered by rstat. Their per-cpu counters are summed up on every read, which is expensive. The original implementation did the same. At some point before rstat, non-hierarchical aggregated counters were introduced by commit a983b5ebee57 ("mm: memcontrol: fix excessive complexity in memory.stat reporting"). However, those counters were updated on the performance critical write-side, which caused regressions, so they were later removed by commit 815744d75152 ("mm: memcontrol: don't batch updates of local VM stats and events"). See [1] for more detailed history. Kernel versions in between a983b5ebee57 & 815744d75152 (a year and a half) enjoyed cheap reads of non-hierarchical stats, specifically on cgroup v1. When moving to more recent kernels, a performance regression for reading non-hierarchical stats is observed. Now that we have rstat, we know exactly which percpu counters have updates for each stat. We can maintain non-hierarchical counters again, making reads much more efficient, without affecting the performance critical write-side. Hence, add non-hierarchical (i.e local) counters for the stats, and extend rstat flushing to keep those up-to-date. 
A caveat is that we now need a stats flush before reading local/non-hierarchical stats through {memcg/lruvec}_page_state_local() or memcg_events_local(), where we previously only needed a flush to read hierarchical stats. Most contexts reading non-hierarchical stats are already doing a flush; add a flush to the only missing context, in count_shadow_nodes(). With this patch, reading memory.stat from 1000 memcgs is 3x faster on a machine with 256 cpus on cgroup v1: # for i in $(seq 1000); do mkdir /sys/fs/cgroup/memory/cg$i; done # time cat /sys/fs/cgroup/memory/cg*/memory.stat > /dev/null Before: real 0m0.125s user 0m0.005s sys 0m0.120s After: real 0m0.032s user 0m0.005s sys 0m0.027s To make sure there are no regressions on cgroup v2, I ran an artificial reclaim/refault stress test [2] that creates (NR_CPUS * 2) cgroups, assigns them limits, runs a worker process in each cgroup that allocates tmpfs memory equal to quadruple the limit (to invoke reclaim continuously), and then reads back the entire file (to invoke refaults). All workers are run in parallel, and zram is used as a swapping backend. Both reclaim and refault have conditional stats flushing. I ran this on a machine with 112 cpus, once on mm-unstable, and once on mm-unstable with this patch reverted. (1) A few runs without this patch: # time ./stress_reclaim_refault.sh real 0m9.949s user 0m0.496s sys 14m44.974s # time ./stress_reclaim_refault.sh real 0m10.049s user 0m0.486s sys 14m55.791s # time ./stress_reclaim_refault.sh real 0m9.984s user 0m0.481s sys 14m53.841s (2) A few runs with this patch: # time ./stress_reclaim_refault.sh real 0m9.885s user 0m0.486s sys 14m48.753s # time ./stress_reclaim_refault.sh real 0m9.903s user 0m0.495s sys 14m48.339s # time ./stress_reclaim_refault.sh real 0m9.861s user 0m0.507s sys 14m49.317s No regressions are observed with this patch. There is actually a very slight improvement.
If I have to guess, maybe it's because we avoid the percpu loop in count_shadow_nodes() when calling lruvec_page_state_local(), but I could not prove this using perf, it's probably in the noise. [1] https://lore.kernel.org/lkml/20230725201811.GA1231514@cmpxchg.org/ [2] https://lore.kernel.org/lkml/CAJD7tkb17x=qwoO37uxyYXLEUVp15BQKR+Xfh7Sg9Hx-wTQ_=w@mail.gmail.com/ Link: https://lkml.kernel.org/r/20230803185046.1385770-1-yosryahmed@google.com Link: https://lkml.kernel.org/r/20230726153223.821757-2-yosryahmed@google.com Signed-off-by: Yosry Ahmed Acked-by: Johannes Weiner Acked-by: Roman Gushchin Acked-by: Michal Hocko Cc: Muchun Song Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 7 ++-- mm/memcontrol.c | 67 +++++++++++++++++++++----------------- mm/workingset.c | 1 + 3 files changed, 43 insertions(+), 32 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 163004ae334966..11810a2cfd2d05 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -111,6 +111,9 @@ struct lruvec_stats { /* Aggregated (CPU and subtree) state */ long state[NR_VM_NODE_STAT_ITEMS]; + /* Non-hierarchical (CPU aggregated) state */ + long state_local[NR_VM_NODE_STAT_ITEMS]; + /* Pending child counts during tree propagation */ long state_pending[NR_VM_NODE_STAT_ITEMS]; }; @@ -1018,14 +1021,12 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, { struct mem_cgroup_per_node *pn; long x = 0; - int cpu; if (mem_cgroup_disabled()) return node_page_state(lruvec_pgdat(lruvec), idx); pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - for_each_possible_cpu(cpu) - x += per_cpu(pn->lruvec_stats_percpu->state[idx], cpu); + x = READ_ONCE(pn->lruvec_stats.state_local[idx]); #ifdef CONFIG_SMP if (x < 0) x = 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index de6b40f851130d..cf57fe9318d55e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -742,6 +742,10 @@ struct memcg_vmstats { 
long state[MEMCG_NR_STAT]; unsigned long events[NR_MEMCG_EVENTS]; + /* Non-hierarchical (CPU aggregated) page state & events */ + long state_local[MEMCG_NR_STAT]; + unsigned long events_local[NR_MEMCG_EVENTS]; + /* Pending child counts during tree propagation */ long state_pending[MEMCG_NR_STAT]; unsigned long events_pending[NR_MEMCG_EVENTS]; @@ -775,11 +779,8 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) /* idx can be of type enum memcg_stat_item or node_stat_item. */ static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) { - long x = 0; - int cpu; + long x = READ_ONCE(memcg->vmstats->state_local[idx]); - for_each_possible_cpu(cpu) - x += per_cpu(memcg->vmstats_percpu->state[idx], cpu); #ifdef CONFIG_SMP if (x < 0) x = 0; @@ -926,16 +927,12 @@ static unsigned long memcg_events(struct mem_cgroup *memcg, int event) static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) { - long x = 0; - int cpu; int index = memcg_events_index(event); if (index < 0) return 0; - for_each_possible_cpu(cpu) - x += per_cpu(memcg->vmstats_percpu->events[index], cpu); - return x; + return READ_ONCE(memcg->vmstats->events_local[index]); } static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, @@ -5516,7 +5513,7 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup *parent = parent_mem_cgroup(memcg); struct memcg_vmstats_percpu *statc; - long delta, v; + long delta, delta_cpu, v; int i, nid; statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); @@ -5532,19 +5529,23 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) memcg->vmstats->state_pending[i] = 0; /* Add CPU changes on this level since the last flush */ + delta_cpu = 0; v = READ_ONCE(statc->state[i]); if (v != statc->state_prev[i]) { - delta += v - statc->state_prev[i]; + delta_cpu = v - statc->state_prev[i]; + delta += 
delta_cpu; statc->state_prev[i] = v; } - if (!delta) - continue; - /* Aggregate counts on this level and propagate upwards */ - memcg->vmstats->state[i] += delta; - if (parent) - parent->vmstats->state_pending[i] += delta; + if (delta_cpu) + memcg->vmstats->state_local[i] += delta_cpu; + + if (delta) { + memcg->vmstats->state[i] += delta; + if (parent) + parent->vmstats->state_pending[i] += delta; + } } for (i = 0; i < NR_MEMCG_EVENTS; i++) { @@ -5552,18 +5553,22 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) if (delta) memcg->vmstats->events_pending[i] = 0; + delta_cpu = 0; v = READ_ONCE(statc->events[i]); if (v != statc->events_prev[i]) { - delta += v - statc->events_prev[i]; + delta_cpu = v - statc->events_prev[i]; + delta += delta_cpu; statc->events_prev[i] = v; } - if (!delta) - continue; + if (delta_cpu) + memcg->vmstats->events_local[i] += delta_cpu; - memcg->vmstats->events[i] += delta; - if (parent) - parent->vmstats->events_pending[i] += delta; + if (delta) { + memcg->vmstats->events[i] += delta; + if (parent) + parent->vmstats->events_pending[i] += delta; + } } for_each_node_state(nid, N_MEMORY) { @@ -5581,18 +5586,22 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) if (delta) pn->lruvec_stats.state_pending[i] = 0; + delta_cpu = 0; v = READ_ONCE(lstatc->state[i]); if (v != lstatc->state_prev[i]) { - delta += v - lstatc->state_prev[i]; + delta_cpu = v - lstatc->state_prev[i]; + delta += delta_cpu; lstatc->state_prev[i] = v; } - if (!delta) - continue; + if (delta_cpu) + pn->lruvec_stats.state_local[i] += delta_cpu; - pn->lruvec_stats.state[i] += delta; - if (ppn) - ppn->lruvec_stats.state_pending[i] += delta; + if (delta) { + pn->lruvec_stats.state[i] += delta; + if (ppn) + ppn->lruvec_stats.state_pending[i] += delta; + } } } } diff --git a/mm/workingset.c b/mm/workingset.c index 4686ae363000af..da58a26d0d4d76 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -664,6 +664,7 @@ 
static unsigned long count_shadow_nodes(struct shrinker *shrinker, struct lruvec *lruvec; int i; + mem_cgroup_flush_stats(); lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid)); for (pages = 0, i = 0; i < NR_LRU_LISTS; i++) pages += lruvec_page_state_local(lruvec, From f9bff0e31881d03badf191d3b0005839391f5f2b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:29 +0100 Subject: [PATCH 430/489] minmax: add in_range() macro Patch series "New page table range API", v6. This patchset changes the API used by the MM to set up page table entries. The four APIs are: set_ptes(mm, addr, ptep, pte, nr) update_mmu_cache_range(vma, addr, ptep, nr) flush_dcache_folio(folio) flush_icache_pages(vma, page, nr) flush_dcache_folio() isn't technically new, but no architecture implemented it, so I've done that for them. The old APIs remain around but are mostly implemented by calling the new interfaces. The new APIs are based around setting up N page table entries at once. The N entries belong to the same PMD, the same folio and the same VMA, so ptep++ is a legitimate operation, and locking is taken care of for you. Some architectures can do a better job of it than just a loop, but I have hesitated to make too deep a change to architectures I don't understand well. One thing I have changed in every architecture is that PG_arch_1 is now a per-folio bit instead of a per-page bit when used for dcache clean/dirty tracking. This was something that would have to happen eventually, and it makes sense to do it now rather than iterate over every page involved in a cache flush and figure out if it needs to happen. The point of all this is better performance, and Fengwei Yin has measured improvement on x86. I suspect you'll see improvement on your architecture too. 
Try the new will-it-scale test mentioned here: https://lore.kernel.org/linux-mm/20230206140639.538867-5-fengwei.yin@intel.com/ You'll need to run it on an XFS filesystem and have CONFIG_TRANSPARENT_HUGEPAGE set. This patchset is the basis for much of the anonymous large folio work being done by Ryan, so it's received quite a lot of testing over the last few months. This patch (of 38): Determine if a value lies within a range more efficiently (subtraction + comparison vs two comparisons and an AND). It also has useful (under some circumstances) behaviour if the range exceeds the maximum value of the type. Convert all the conflicting definitions of in_range() within the kernel; some can use the generic definition while others need their own definition. Link: https://lkml.kernel.org/r/20230802151406.3735276-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230802151406.3735276-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- arch/arm/mm/pageattr.c | 6 ++--- .../drm/arm/display/include/malidp_utils.h | 2 +- .../display/komeda/komeda_pipeline_state.c | 24 ++++++++--------- drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 6 ----- .../net/ethernet/chelsio/cxgb3/cxgb3_main.c | 18 ++++++------- drivers/virt/acrn/ioreq.c | 4 +-- fs/btrfs/misc.h | 2 -- fs/ext2/balloc.c | 2 -- fs/ext4/ext4.h | 2 -- fs/ufs/util.h | 6 ----- include/linux/minmax.h | 27 +++++++++++++++++++ lib/logic_pio.c | 3 --- net/netfilter/nf_nat_core.c | 6 ++--- net/tipc/core.h | 2 +- net/tipc/link.c | 10 +++---- .../selftests/bpf/progs/get_branch_snapshot.c | 4 +-- 16 files changed, 65 insertions(+), 59 deletions(-) diff --git a/arch/arm/mm/pageattr.c b/arch/arm/mm/pageattr.c index c3c34fe714b06f..064ad508c149d4 100644 --- a/arch/arm/mm/pageattr.c +++ b/arch/arm/mm/pageattr.c @@ -25,7 +25,7 @@ static int change_page_range(pte_t *ptep, unsigned long addr, void *data) return 0; } -static bool in_range(unsigned long start, unsigned long size, +static bool 
range_in_range(unsigned long start, unsigned long size, unsigned long range_start, unsigned long range_end) { return start >= range_start && start < range_end && @@ -63,8 +63,8 @@ static int change_memory_common(unsigned long addr, int numpages, if (!size) return 0; - if (!in_range(start, size, MODULES_VADDR, MODULES_END) && - !in_range(start, size, VMALLOC_START, VMALLOC_END)) + if (!range_in_range(start, size, MODULES_VADDR, MODULES_END) && + !range_in_range(start, size, VMALLOC_START, VMALLOC_END)) return -EINVAL; return __change_memory_common(start, size, set_mask, clear_mask); diff --git a/drivers/gpu/drm/arm/display/include/malidp_utils.h b/drivers/gpu/drm/arm/display/include/malidp_utils.h index 49a1d7f3539c28..9f83baac6ed871 100644 --- a/drivers/gpu/drm/arm/display/include/malidp_utils.h +++ b/drivers/gpu/drm/arm/display/include/malidp_utils.h @@ -35,7 +35,7 @@ static inline void set_range(struct malidp_range *rg, u32 start, u32 end) rg->end = end; } -static inline bool in_range(struct malidp_range *rg, u32 v) +static inline bool malidp_in_range(struct malidp_range *rg, u32 v) { return (v >= rg->start) && (v <= rg->end); } diff --git a/drivers/gpu/drm/arm/display/komeda/komeda_pipeline_state.c b/drivers/gpu/drm/arm/display/komeda/komeda_pipeline_state.c index 3276a3e82c628e..4618687a8f4d64 100644 --- a/drivers/gpu/drm/arm/display/komeda/komeda_pipeline_state.c +++ b/drivers/gpu/drm/arm/display/komeda/komeda_pipeline_state.c @@ -305,12 +305,12 @@ komeda_layer_check_cfg(struct komeda_layer *layer, if (komeda_fb_check_src_coords(kfb, src_x, src_y, src_w, src_h)) return -EINVAL; - if (!in_range(&layer->hsize_in, src_w)) { + if (!malidp_in_range(&layer->hsize_in, src_w)) { DRM_DEBUG_ATOMIC("invalidate src_w %d.\n", src_w); return -EINVAL; } - if (!in_range(&layer->vsize_in, src_h)) { + if (!malidp_in_range(&layer->vsize_in, src_h)) { DRM_DEBUG_ATOMIC("invalidate src_h %d.\n", src_h); return -EINVAL; } @@ -452,14 +452,14 @@ komeda_scaler_check_cfg(struct 
komeda_scaler *scaler, hsize_out = dflow->out_w; vsize_out = dflow->out_h; - if (!in_range(&scaler->hsize, hsize_in) || - !in_range(&scaler->hsize, hsize_out)) { + if (!malidp_in_range(&scaler->hsize, hsize_in) || + !malidp_in_range(&scaler->hsize, hsize_out)) { DRM_DEBUG_ATOMIC("Invalid horizontal sizes"); return -EINVAL; } - if (!in_range(&scaler->vsize, vsize_in) || - !in_range(&scaler->vsize, vsize_out)) { + if (!malidp_in_range(&scaler->vsize, vsize_in) || + !malidp_in_range(&scaler->vsize, vsize_out)) { DRM_DEBUG_ATOMIC("Invalid vertical sizes"); return -EINVAL; } @@ -574,13 +574,13 @@ komeda_splitter_validate(struct komeda_splitter *splitter, return -EINVAL; } - if (!in_range(&splitter->hsize, dflow->in_w)) { + if (!malidp_in_range(&splitter->hsize, dflow->in_w)) { DRM_DEBUG_ATOMIC("split in_w:%d is out of the acceptable range.\n", dflow->in_w); return -EINVAL; } - if (!in_range(&splitter->vsize, dflow->in_h)) { + if (!malidp_in_range(&splitter->vsize, dflow->in_h)) { DRM_DEBUG_ATOMIC("split in_h: %d exceeds the acceptable range.\n", dflow->in_h); return -EINVAL; @@ -624,13 +624,13 @@ komeda_merger_validate(struct komeda_merger *merger, return -EINVAL; } - if (!in_range(&merger->hsize_merged, output->out_w)) { + if (!malidp_in_range(&merger->hsize_merged, output->out_w)) { DRM_DEBUG_ATOMIC("merged_w: %d is out of the accepted range.\n", output->out_w); return -EINVAL; } - if (!in_range(&merger->vsize_merged, output->out_h)) { + if (!malidp_in_range(&merger->vsize_merged, output->out_h)) { DRM_DEBUG_ATOMIC("merged_h: %d is out of the accepted range.\n", output->out_h); return -EINVAL; @@ -866,8 +866,8 @@ void komeda_complete_data_flow_cfg(struct komeda_layer *layer, * input/output range. 
*/ if (dflow->en_scaling && scaler) - dflow->en_split = !in_range(&scaler->hsize, dflow->in_w) || - !in_range(&scaler->hsize, dflow->out_w); + dflow->en_split = !malidp_in_range(&scaler->hsize, dflow->in_w) || + !malidp_in_range(&scaler->hsize, dflow->out_w); } static bool merger_is_available(struct komeda_pipeline *pipe, diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c index 5deb79924897af..d90011c813b9ba 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c @@ -676,12 +676,6 @@ struct block_header { u32 data[]; }; -/* this should be a general kernel helper */ -static int in_range(u32 addr, u32 start, u32 size) -{ - return addr >= start && addr < start + size; -} - static bool fw_block_mem(struct a6xx_gmu_bo *bo, const struct block_header *blk) { if (!in_range(blk->addr, bo->iova, bo->size)) diff --git a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c index 9b84c8d8d30973..d117022d15d7f7 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c +++ b/drivers/net/ethernet/chelsio/cxgb3/cxgb3_main.c @@ -2126,7 +2126,7 @@ static const struct ethtool_ops cxgb_ethtool_ops = { .set_link_ksettings = set_link_ksettings, }; -static int in_range(int val, int lo, int hi) +static int cxgb_in_range(int val, int lo, int hi) { return val < 0 || (val <= hi && val >= lo); } @@ -2162,19 +2162,19 @@ static int cxgb_siocdevprivate(struct net_device *dev, return -EINVAL; if (t.qset_idx >= SGE_QSETS) return -EINVAL; - if (!in_range(t.intr_lat, 0, M_NEWTIMER) || - !in_range(t.cong_thres, 0, 255) || - !in_range(t.txq_size[0], MIN_TXQ_ENTRIES, + if (!cxgb_in_range(t.intr_lat, 0, M_NEWTIMER) || + !cxgb_in_range(t.cong_thres, 0, 255) || + !cxgb_in_range(t.txq_size[0], MIN_TXQ_ENTRIES, MAX_TXQ_ENTRIES) || - !in_range(t.txq_size[1], MIN_TXQ_ENTRIES, + !cxgb_in_range(t.txq_size[1], MIN_TXQ_ENTRIES, MAX_TXQ_ENTRIES) || - !in_range(t.txq_size[2], 
MIN_CTRL_TXQ_ENTRIES, + !cxgb_in_range(t.txq_size[2], MIN_CTRL_TXQ_ENTRIES, MAX_CTRL_TXQ_ENTRIES) || - !in_range(t.fl_size[0], MIN_FL_ENTRIES, + !cxgb_in_range(t.fl_size[0], MIN_FL_ENTRIES, MAX_RX_BUFFERS) || - !in_range(t.fl_size[1], MIN_FL_ENTRIES, + !cxgb_in_range(t.fl_size[1], MIN_FL_ENTRIES, MAX_RX_JUMBO_BUFFERS) || - !in_range(t.rspq_size, MIN_RSPQ_ENTRIES, + !cxgb_in_range(t.rspq_size, MIN_RSPQ_ENTRIES, MAX_RSPQ_ENTRIES)) return -EINVAL; diff --git a/drivers/virt/acrn/ioreq.c b/drivers/virt/acrn/ioreq.c index cecdc1c13af7b6..29e1ef1915fd2e 100644 --- a/drivers/virt/acrn/ioreq.c +++ b/drivers/virt/acrn/ioreq.c @@ -351,7 +351,7 @@ static bool handle_cf8cfc(struct acrn_vm *vm, return is_handled; } -static bool in_range(struct acrn_ioreq_range *range, +static bool acrn_in_range(struct acrn_ioreq_range *range, struct acrn_io_request *req) { bool ret = false; @@ -389,7 +389,7 @@ static struct acrn_ioreq_client *find_ioreq_client(struct acrn_vm *vm, list_for_each_entry(client, &vm->ioreq_clients, list) { read_lock_bh(&client->range_lock); list_for_each_entry(range, &client->range_list, list) { - if (in_range(range, req)) { + if (acrn_in_range(range, req)) { found = client; break; } diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index 005751a1291101..40f2d9f1a17a9c 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -8,8 +8,6 @@ #include #include -#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) - /* * Enumerate bits using enum autoincrement. Define the @name as the n-th bit. 
*/ diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index eca60b747c6b54..c8049c90323d6c 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -36,8 +36,6 @@ */ -#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) - struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb, unsigned int block_group, struct buffer_head ** bh) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0a2d55faa095ef..465db8f8c11dc5 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3774,8 +3774,6 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh) set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); } -#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) - /* For ioend & aio unwritten conversion wait queues */ #define EXT4_WQ_HASH_SZ 37 #define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\ diff --git a/fs/ufs/util.h b/fs/ufs/util.h index 4931bec1a01cad..89247193d96d86 100644 --- a/fs/ufs/util.h +++ b/fs/ufs/util.h @@ -11,12 +11,6 @@ #include #include "swab.h" - -/* - * some useful macros - */ -#define in_range(b,first,len) ((b)>=(first)&&(b)<(first)+(len)) - /* * functions used for retyping */ diff --git a/include/linux/minmax.h b/include/linux/minmax.h index 396df1121bffba..4f011eb6533d16 100644 --- a/include/linux/minmax.h +++ b/include/linux/minmax.h @@ -3,6 +3,7 @@ #define _LINUX_MINMAX_H #include +#include /* * min()/max()/clamp() macros must accomplish three things: @@ -158,6 +159,32 @@ */ #define clamp_val(val, lo, hi) clamp_t(typeof(val), val, lo, hi) +static inline bool in_range64(u64 val, u64 start, u64 len) +{ + return (val - start) < len; +} + +static inline bool in_range32(u32 val, u32 start, u32 len) +{ + return (val - start) < len; +} + +/** + * in_range - Determine if a value lies within a range. + * @val: Value to test. + * @start: First value in range. + * @len: Number of values in range. + * + * This is more efficient than "if (start <= val && val < (start + len))". 
+ * It also gives a different answer if @start + @len overflows the size of + * the type by a sufficient amount to encompass @val. Decide for yourself + * which behaviour you want, or prove that start + len never overflow. + * Do not blindly replace one form with the other. + */ +#define in_range(val, start, len) \ + ((sizeof(start) | sizeof(len) | sizeof(val)) <= sizeof(u32) ? \ + in_range32(val, start, len) : in_range64(val, start, len)) + /** * swap - swap values of @a and @b * @a: first value diff --git a/lib/logic_pio.c b/lib/logic_pio.c index 07b4b9a1f54b6b..2ea564a4006442 100644 --- a/lib/logic_pio.c +++ b/lib/logic_pio.c @@ -20,9 +20,6 @@ static LIST_HEAD(io_range_list); static DEFINE_MUTEX(io_range_mutex); -/* Consider a kernel general helper for this */ -#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) - /** * logic_pio_register_range - register logical PIO range for a host * @new_range: pointer to the IO range to be registered. diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index fadbd4ed3dc047..c4e0516a8dfab4 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -327,7 +327,7 @@ static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple, /* If we source map this tuple so reply looks like reply_tuple, will * that meet the constraints of range. 
*/ -static int in_range(const struct nf_conntrack_tuple *tuple, +static int nf_in_range(const struct nf_conntrack_tuple *tuple, const struct nf_nat_range2 *range) { /* If we are supposed to map IPs, then we must be in the @@ -376,7 +376,7 @@ find_appropriate_src(struct net *net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); result->dst = tuple->dst; - if (in_range(result, range)) + if (nf_in_range(result, range)) return 1; } } @@ -607,7 +607,7 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, if (maniptype == NF_NAT_MANIP_SRC && !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { /* try the original tuple first */ - if (in_range(orig_tuple, range)) { + if (nf_in_range(orig_tuple, range)) { if (!nf_nat_used_tuple(orig_tuple, ct)) { *tuple = *orig_tuple; return; diff --git a/net/tipc/core.h b/net/tipc/core.h index 0a3f7a70a50a17..7eccd97e060979 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -197,7 +197,7 @@ static inline int less(u16 left, u16 right) return less_eq(left, right) && (mod(right) != mod(left)); } -static inline int in_range(u16 val, u16 min, u16 max) +static inline int tipc_in_range(u16 val, u16 min, u16 max) { return !less(val, min) && !more(val, max); } diff --git a/net/tipc/link.c b/net/tipc/link.c index 2eff1c7949cbcc..e33b4f29f77cf2 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1623,7 +1623,7 @@ static int tipc_link_advance_transmq(struct tipc_link *l, struct tipc_link *r, last_ga->bgack_cnt); } /* Check against the last Gap ACK block */ - if (in_range(seqno, start, end)) + if (tipc_in_range(seqno, start, end)) continue; /* Update/release the packet peer is acking */ bc_has_acked = true; @@ -2251,12 +2251,12 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, strncpy(if_name, data, TIPC_MAX_IF_NAME); /* Update own tolerance if peer indicates a non-zero value */ - if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) { + if (tipc_in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) { l->tolerance = 
peers_tol; l->bc_rcvlink->tolerance = peers_tol; } /* Update own priority if peer's priority is higher */ - if (in_range(peers_prio, l->priority + 1, TIPC_MAX_LINK_PRI)) + if (tipc_in_range(peers_prio, l->priority + 1, TIPC_MAX_LINK_PRI)) l->priority = peers_prio; /* If peer is going down we want full re-establish cycle */ @@ -2299,13 +2299,13 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb, l->rcv_nxt_state = msg_seqno(hdr) + 1; /* Update own tolerance if peer indicates a non-zero value */ - if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) { + if (tipc_in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL)) { l->tolerance = peers_tol; l->bc_rcvlink->tolerance = peers_tol; } /* Update own prio if peer indicates a different value */ if ((peers_prio != l->priority) && - in_range(peers_prio, 1, TIPC_MAX_LINK_PRI)) { + tipc_in_range(peers_prio, 1, TIPC_MAX_LINK_PRI)) { l->priority = peers_prio; rc = tipc_link_fsm_evt(l, LINK_FAILURE_EVT); } diff --git a/tools/testing/selftests/bpf/progs/get_branch_snapshot.c b/tools/testing/selftests/bpf/progs/get_branch_snapshot.c index a1b139888048c9..511ac634eef0ed 100644 --- a/tools/testing/selftests/bpf/progs/get_branch_snapshot.c +++ b/tools/testing/selftests/bpf/progs/get_branch_snapshot.c @@ -15,7 +15,7 @@ long total_entries = 0; #define ENTRY_CNT 32 struct perf_branch_entry entries[ENTRY_CNT] = {}; -static inline bool in_range(__u64 val) +static inline bool gbs_in_range(__u64 val) { return (val >= address_low) && (val < address_high); } @@ -31,7 +31,7 @@ int BPF_PROG(test1, int n, int ret) for (i = 0; i < ENTRY_CNT; i++) { if (i >= total_entries) break; - if (in_range(entries[i].from) && in_range(entries[i].to)) + if (gbs_in_range(entries[i].from) && gbs_in_range(entries[i].to)) test1_hits++; else if (!test1_hits) wasted_entries++; From a379322022c0961fe0b638cdd842d3c38eeff92c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:30 +0100 Subject: 
[PATCH 431/489] mm: convert page_table_check_pte_set() to page_table_check_ptes_set() Tell the page table check how many PTEs & PFNs we want it to check. Link: https://lkml.kernel.org/r/20230802151406.3735276-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Mike Rapoport (IBM) Acked-by: Pasha Tatashin Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton --- arch/arm64/include/asm/pgtable.h | 2 +- arch/riscv/include/asm/pgtable.h | 2 +- arch/x86/include/asm/pgtable.h | 2 +- include/linux/page_table_check.h | 13 +++++++------ mm/page_table_check.c | 16 +++++++++------- 5 files changed, 19 insertions(+), 16 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index fe4b913589eed4..445b18d7a47c46 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -348,7 +348,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - page_table_check_pte_set(mm, ptep, pte); + page_table_check_ptes_set(mm, ptep, pte, 1); return __set_pte_at(mm, addr, ptep, pte); } diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 44377f0d7c35e7..01e4aabc88984e 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -499,7 +499,7 @@ static inline void __set_pte_at(struct mm_struct *mm, static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { - page_table_check_pte_set(mm, ptep, pteval); + page_table_check_ptes_set(mm, ptep, pteval, 1); __set_pte_at(mm, addr, ptep, pteval); } diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index ada1bbf1296129..cd0b6337d03cea 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1023,7 +1023,7 @@ static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp) static inline 
void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - page_table_check_pte_set(mm, ptep, pte); + page_table_check_ptes_set(mm, ptep, pte, 1); set_pte(ptep, pte); } diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h index 7f6b9bf926c5d8..6722941c7cb8a6 100644 --- a/include/linux/page_table_check.h +++ b/include/linux/page_table_check.h @@ -17,7 +17,8 @@ void __page_table_check_zero(struct page *page, unsigned int order); void __page_table_check_pte_clear(struct mm_struct *mm, pte_t pte); void __page_table_check_pmd_clear(struct mm_struct *mm, pmd_t pmd); void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud); -void __page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, pte_t pte); +void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, + unsigned int nr); void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd); void __page_table_check_pud_set(struct mm_struct *mm, pud_t *pudp, pud_t pud); void __page_table_check_pte_clear_range(struct mm_struct *mm, @@ -64,13 +65,13 @@ static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) __page_table_check_pud_clear(mm, pud); } -static inline void page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, - pte_t pte) +static inline void page_table_check_ptes_set(struct mm_struct *mm, + pte_t *ptep, pte_t pte, unsigned int nr) { if (static_branch_likely(&page_table_check_disabled)) return; - __page_table_check_pte_set(mm, ptep, pte); + __page_table_check_ptes_set(mm, ptep, pte, nr); } static inline void page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, @@ -123,8 +124,8 @@ static inline void page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) { } -static inline void page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, - pte_t pte) +static inline void page_table_check_ptes_set(struct mm_struct *mm, + pte_t *ptep, pte_t pte, unsigned int nr) { } diff --git 
a/mm/page_table_check.c b/mm/page_table_check.c index 46e77c12c81ecf..af69c3c8f7c2d5 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -182,18 +182,20 @@ void __page_table_check_pud_clear(struct mm_struct *mm, pud_t pud) } EXPORT_SYMBOL(__page_table_check_pud_clear); -void __page_table_check_pte_set(struct mm_struct *mm, pte_t *ptep, pte_t pte) +void __page_table_check_ptes_set(struct mm_struct *mm, pte_t *ptep, pte_t pte, + unsigned int nr) { + unsigned int i; + if (&init_mm == mm) return; - __page_table_check_pte_clear(mm, ptep_get(ptep)); - if (pte_user_accessible_page(pte)) { - page_table_check_set(pte_pfn(pte), PAGE_SIZE >> PAGE_SHIFT, - pte_write(pte)); - } + for (i = 0; i < nr; i++) + __page_table_check_pte_clear(mm, ptep_get(ptep + i)); + if (pte_user_accessible_page(pte)) + page_table_check_set(pte_pfn(pte), nr, pte_write(pte)); } -EXPORT_SYMBOL(__page_table_check_pte_set); +EXPORT_SYMBOL(__page_table_check_ptes_set); void __page_table_check_pmd_set(struct mm_struct *mm, pmd_t *pmdp, pmd_t pmd) { From 3a255267f6dff40e193501cf731f409ce9175503 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:31 +0100 Subject: [PATCH 432/489] mm: add generic flush_icache_pages() and documentation flush_icache_page() is deprecated but not yet removed, so add a range version of it. Change the documentation to refer to update_mmu_cache_range() instead of update_mmu_cache(). 
Link: https://lkml.kernel.org/r/20230802151406.3735276-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton --- Documentation/core-api/cachetlb.rst | 39 ++++++++++++++++------------- include/asm-generic/cacheflush.h | 5 ++++ 2 files changed, 27 insertions(+), 17 deletions(-) diff --git a/Documentation/core-api/cachetlb.rst b/Documentation/core-api/cachetlb.rst index 5c0552e78c58fb..b645947954fbbe 100644 --- a/Documentation/core-api/cachetlb.rst +++ b/Documentation/core-api/cachetlb.rst @@ -88,13 +88,17 @@ changes occur: This is used primarily during fault processing. -5) ``void update_mmu_cache(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep)`` +5) ``void update_mmu_cache_range(struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long address, pte_t *ptep, + unsigned int nr)`` - At the end of every page fault, this routine is invoked to - tell the architecture specific code that a translation - now exists at virtual address "address" for address space - "vma->vm_mm", in the software page tables. + At the end of every page fault, this routine is invoked to tell + the architecture specific code that translations now exist + in the software page tables for address space "vma->vm_mm" + at virtual address "address" for "nr" consecutive pages. + + This routine is also invoked in various other places which pass + a NULL "vmf". A port may use this information in any way it so chooses. For example, it could use this event to pre-load TLB @@ -306,17 +310,18 @@ maps this page at its virtual address. private". The kernel guarantees that, for pagecache pages, it will clear this bit when such a page first enters the pagecache. - This allows these interfaces to be implemented much more efficiently. - It allows one to "defer" (perhaps indefinitely) the actual flush if - there are currently no user processes mapping this page.
See sparc64's - flush_dcache_page and update_mmu_cache implementations for an example - of how to go about doing this. + This allows these interfaces to be implemented much more + efficiently. It allows one to "defer" (perhaps indefinitely) the + actual flush if there are currently no user processes mapping this + page. See sparc64's flush_dcache_page and update_mmu_cache_range + implementations for an example of how to go about doing this. - The idea is, first at flush_dcache_page() time, if page_file_mapping() - returns a mapping, and mapping_mapped on that mapping returns %false, - just mark the architecture private page flag bit. Later, in - update_mmu_cache(), a check is made of this flag bit, and if set the - flush is done and the flag bit is cleared. + The idea is, first at flush_dcache_page() time, if + page_file_mapping() returns a mapping, and mapping_mapped on that + mapping returns %false, just mark the architecture private page + flag bit. Later, in update_mmu_cache_range(), a check is made + of this flag bit, and if set the flush is done and the flag bit + is cleared. .. important:: @@ -369,7 +374,7 @@ maps this page at its virtual address. ``void flush_icache_page(struct vm_area_struct *vma, struct page *page)`` All the functionality of flush_icache_page can be implemented in - flush_dcache_page and update_mmu_cache. In the future, the hope + flush_dcache_page and update_mmu_cache_range. In the future, the hope is to remove this interface completely. 
The final category of APIs is for I/O to deliberately aliased address diff --git a/include/asm-generic/cacheflush.h b/include/asm-generic/cacheflush.h index f46258d1a080f4..09d51a68076595 100644 --- a/include/asm-generic/cacheflush.h +++ b/include/asm-generic/cacheflush.h @@ -78,6 +78,11 @@ static inline void flush_icache_range(unsigned long start, unsigned long end) #endif #ifndef flush_icache_page +static inline void flush_icache_pages(struct vm_area_struct *vma, + struct page *page, unsigned int nr) +{ +} + static inline void flush_icache_page(struct vm_area_struct *vma, struct page *page) { From bc60abfbe687e886f1c38d49ef2a00e90b4b49cf Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:32 +0100 Subject: [PATCH 433/489] mm: add folio_flush_mapping() This is the folio equivalent of page_mapping_file(), but rename it to make it clear that it's very different from page_file_mapping(). Theoretically, there's nothing flush-only about it, but there are no other users today, and I doubt there will be; it's almost always more useful to know the swapfile's mapping or the swapcache's mapping. Link: https://lkml.kernel.org/r/20230802151406.3735276-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 437e4526028c22..88d161887cf22d 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -389,6 +389,26 @@ static inline struct address_space *folio_file_mapping(struct folio *folio) return folio->mapping; } +/** + * folio_flush_mapping - Find the file mapping this folio belongs to. + * @folio: The folio. + * + * For folios which are in the page cache, return the mapping that this + * page belongs to. 
Anonymous folios return NULL, even if they're in + * the swap cache. Other kinds of folio also return NULL. + * + * This is ONLY used by architecture cache flushing code. If you aren't + * writing cache flushing code, you want either folio_mapping() or + * folio_file_mapping(). + */ +static inline struct address_space *folio_flush_mapping(struct folio *folio) +{ + if (unlikely(folio_test_swapcache(folio))) + return NULL; + + return folio_mapping(folio); +} + static inline struct address_space *page_file_mapping(struct page *page) { return folio_file_mapping(page_folio(page)); @@ -399,11 +419,7 @@ static inline struct address_space *page_file_mapping(struct page *page) */ static inline struct address_space *page_mapping_file(struct page *page) { - struct folio *folio = page_folio(page); - - if (unlikely(folio_test_swapcache(folio))) - return NULL; - return folio_mapping(folio); + return folio_flush_mapping(page_folio(page)); } /** From 29d26f1215de14721188988a59b1426abb85b7be Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:33 +0100 Subject: [PATCH 434/489] mm: remove ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO Current best practice is to reuse the name of the function as a define to indicate that the function is implemented by the architecture. Link: https://lkml.kernel.org/r/20230802151406.3735276-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton --- Documentation/core-api/cachetlb.rst | 24 +++++++++--------------- include/linux/cacheflush.h | 4 ++-- mm/util.c | 2 +- 3 files changed, 12 insertions(+), 18 deletions(-) diff --git a/Documentation/core-api/cachetlb.rst b/Documentation/core-api/cachetlb.rst index b645947954fbbe..889fc84ccd1bd0 100644 --- a/Documentation/core-api/cachetlb.rst +++ b/Documentation/core-api/cachetlb.rst @@ -273,7 +273,7 @@ maps this page at its virtual address. 
If D-cache aliasing is not an issue, these two routines may simply call memcpy/memset directly and do nothing more. - ``void flush_dcache_page(struct page *page)`` + ``void flush_dcache_folio(struct folio *folio)`` This routines must be called when: @@ -281,7 +281,7 @@ maps this page at its virtual address. and / or in high memory b) the kernel is about to read from a page cache page and user space shared/writable mappings of this page potentially exist. Note - that {get,pin}_user_pages{_fast} already call flush_dcache_page + that {get,pin}_user_pages{_fast} already call flush_dcache_folio on any page found in the user address space and thus driver code rarely needs to take this into account. @@ -295,7 +295,7 @@ maps this page at its virtual address. The phrase "kernel writes to a page cache page" means, specifically, that the kernel executes store instructions that dirty data in that - page at the page->virtual mapping of that page. It is important to + page at the kernel virtual mapping of that page. It is important to flush here to handle D-cache aliasing, to make sure these kernel stores are visible to user space mappings of that page. @@ -306,18 +306,18 @@ maps this page at its virtual address. If D-cache aliasing is not an issue, this routine may simply be defined as a nop on that architecture. - There is a bit set aside in page->flags (PG_arch_1) as "architecture + There is a bit set aside in folio->flags (PG_arch_1) as "architecture private". The kernel guarantees that, for pagecache pages, it will clear this bit when such a page first enters the pagecache. This allows these interfaces to be implemented much more efficiently. It allows one to "defer" (perhaps indefinitely) the actual flush if there are currently no user processes mapping this - page. See sparc64's flush_dcache_page and update_mmu_cache_range + page. See sparc64's flush_dcache_folio and update_mmu_cache_range implementations for an example of how to go about doing this. 
- The idea is, first at flush_dcache_page() time, if - page_file_mapping() returns a mapping, and mapping_mapped on that + The idea is, first at flush_dcache_folio() time, if + folio_flush_mapping() returns a mapping, and mapping_mapped() on that mapping returns %false, just mark the architecture private page flag bit. Later, in update_mmu_cache_range(), a check is made of this flag bit, and if set the flush is done and the flag bit @@ -331,12 +331,6 @@ maps this page at its virtual address. dirty. Again, see sparc64 for examples of how to deal with this. - ``void flush_dcache_folio(struct folio *folio)`` - This function is called under the same circumstances as - flush_dcache_page(). It allows the architecture to - optimise for flushing the entire folio of pages instead - of flushing one page at a time. - ``void copy_to_user_page(struct vm_area_struct *vma, struct page *page, unsigned long user_vaddr, void *dst, void *src, int len)`` ``void copy_from_user_page(struct vm_area_struct *vma, struct page *page, @@ -357,7 +351,7 @@ maps this page at its virtual address. When the kernel needs to access the contents of an anonymous page, it calls this function (currently only - get_user_pages()). Note: flush_dcache_page() deliberately + get_user_pages()). Note: flush_dcache_folio() deliberately doesn't work for an anonymous page. The default implementation is a nop (and should remain so for all coherent architectures). For incoherent architectures, it should flush @@ -374,7 +368,7 @@ maps this page at its virtual address. ``void flush_icache_page(struct vm_area_struct *vma, struct page *page)`` All the functionality of flush_icache_page can be implemented in - flush_dcache_page and update_mmu_cache_range. In the future, the hope + flush_dcache_folio and update_mmu_cache_range. In the future, the hope is to remove this interface completely. 
The final category of APIs is for I/O to deliberately aliased address diff --git a/include/linux/cacheflush.h b/include/linux/cacheflush.h index a6189d21f2ba1a..82136f3fcf54a1 100644 --- a/include/linux/cacheflush.h +++ b/include/linux/cacheflush.h @@ -7,14 +7,14 @@ struct folio; #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE -#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO +#ifndef flush_dcache_folio void flush_dcache_folio(struct folio *folio); #endif #else static inline void flush_dcache_folio(struct folio *folio) { } -#define ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO 0 +#define flush_dcache_folio flush_dcache_folio #endif /* ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE */ #endif /* _LINUX_CACHEFLUSH_H */ diff --git a/mm/util.c b/mm/util.c index 5e9305189c3fdc..cde229b05eb351 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1119,7 +1119,7 @@ void page_offline_end(void) } EXPORT_SYMBOL(page_offline_end); -#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO +#ifndef flush_dcache_folio void flush_dcache_folio(struct folio *folio) { long i, nr = folio_nr_pages(folio); From bcc6cc832573a99d1f935c89a28e2c71fd1aaf0c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:34 +0100 Subject: [PATCH 435/489] mm: add default definition of set_ptes() Most architectures can just define set_pte() and PFN_PTE_SHIFT to use this definition. It's also a handy spot to document the guarantees provided by the MM. 
Link: https://lkml.kernel.org/r/20230802151406.3735276-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Suggested-by: Mike Rapoport (IBM) Reviewed-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 81 ++++++++++++++++++++++++++++++----------- 1 file changed, 60 insertions(+), 21 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 6064f454c8e3fb..81c3f7decb1c64 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -182,6 +182,66 @@ static inline int pmd_young(pmd_t pmd) } #endif +/* + * A facility to provide lazy MMU batching. This allows PTE updates and + * page invalidations to be delayed until a call to leave lazy MMU mode + * is issued. Some architectures may benefit from doing this, and it is + * beneficial for both shadow and direct mode hypervisors, which may batch + * the PTE updates which happen during this window. Note that using this + * interface requires that read hazards be removed from the code. A read + * hazard could result in the direct mode hypervisor case, since the actual + * write to the page tables may not yet have taken place, so reads though + * a raw PTE pointer after it has been modified are not guaranteed to be + * up to date. This mode can only be entered and left under the protection of + * the page table locks for all page tables which may be modified. In the UP + * case, this is required so that preemption is disabled, and in the SMP case, + * it must synchronize the delayed page table writes properly on other CPUs. + */ +#ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE +#define arch_enter_lazy_mmu_mode() do {} while (0) +#define arch_leave_lazy_mmu_mode() do {} while (0) +#define arch_flush_lazy_mmu_mode() do {} while (0) +#endif + +#ifndef set_ptes +#ifdef PFN_PTE_SHIFT +/** + * set_ptes - Map consecutive pages to a contiguous range of addresses. + * @mm: Address space to map the pages into. + * @addr: Address to map the first page at. 
+ * @ptep: Page table pointer for the first entry. + * @pte: Page table entry for the first page. + * @nr: Number of pages to map. + * + * May be overridden by the architecture, or the architecture can define + * set_pte() and PFN_PTE_SHIFT. + * + * Context: The caller holds the page table lock. The pages all belong + * to the same folio. The PTEs are all in the same PMD. + */ +static inline void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr) +{ + page_table_check_ptes_set(mm, ptep, pte, nr); + + arch_enter_lazy_mmu_mode(); + for (;;) { + set_pte(ptep, pte); + if (--nr == 0) + break; + ptep++; + pte = __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT)); + } + arch_leave_lazy_mmu_mode(); +} +#ifndef set_pte_at +#define set_pte_at(mm, addr, ptep, pte) set_ptes(mm, addr, ptep, pte, 1) +#endif +#endif +#else +#define set_pte_at(mm, addr, ptep, pte) set_ptes(mm, addr, ptep, pte, 1) +#endif + #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, @@ -1051,27 +1111,6 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define pgprot_decrypted(prot) (prot) #endif -/* - * A facility to provide lazy MMU batching. This allows PTE updates and - * page invalidations to be delayed until a call to leave lazy MMU mode - * is issued. Some architectures may benefit from doing this, and it is - * beneficial for both shadow and direct mode hypervisors, which may batch - * the PTE updates which happen during this window. Note that using this - * interface requires that read hazards be removed from the code. A read - * hazard could result in the direct mode hypervisor case, since the actual - * write to the page tables may not yet have taken place, so reads though - * a raw PTE pointer after it has been modified are not guaranteed to be - * up to date. 
This mode can only be entered and left under the protection of - * the page table locks for all page tables which may be modified. In the UP - * case, this is required so that preemption is disabled, and in the SMP case, - * it must synchronize the delayed page table writes properly on other CPUs. - */ -#ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE -#define arch_enter_lazy_mmu_mode() do {} while (0) -#define arch_leave_lazy_mmu_mode() do {} while (0) -#define arch_flush_lazy_mmu_mode() do {} while (0) -#endif - /* * A facility to provide batching of the reload of page tables and * other process state with the actual context switch code for From 63497b716be30fb268b2358836efb4bb9e615f15 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:35 +0100 Subject: [PATCH 436/489] alpha: implement the new page table range API Add PFN_PTE_SHIFT, update_mmu_cache_range() and flush_icache_pages(). Link: https://lkml.kernel.org/r/20230802151406.3735276-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Signed-off-by: Andrew Morton --- arch/alpha/include/asm/cacheflush.h | 10 ++++++++++ arch/alpha/include/asm/pgtable.h | 10 ++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/arch/alpha/include/asm/cacheflush.h b/arch/alpha/include/asm/cacheflush.h index 9945ff483eaf7e..3956460e69e277 100644 --- a/arch/alpha/include/asm/cacheflush.h +++ b/arch/alpha/include/asm/cacheflush.h @@ -57,6 +57,16 @@ extern void flush_icache_user_page(struct vm_area_struct *vma, #define flush_icache_page(vma, page) \ flush_icache_user_page((vma), (page), 0, 0) +/* + * Both implementations of flush_icache_user_page flush the entire + * address space, so one call, no matter how many pages. 
+ */ +static inline void flush_icache_pages(struct vm_area_struct *vma, + struct page *page, unsigned int nr) +{ + flush_icache_user_page(vma, page, 0, 0); +} + #include #endif /* _ALPHA_CACHEFLUSH_H */ diff --git a/arch/alpha/include/asm/pgtable.h b/arch/alpha/include/asm/pgtable.h index ba43cb841d19ca..747b5f706c47ff 100644 --- a/arch/alpha/include/asm/pgtable.h +++ b/arch/alpha/include/asm/pgtable.h @@ -26,7 +26,6 @@ struct vm_area_struct; * hook is made available. */ #define set_pte(pteptr, pteval) ((*(pteptr)) = (pteval)) -#define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval) /* PMD_SHIFT determines the size of the area a second-level page table can map */ #define PMD_SHIFT (PAGE_SHIFT + (PAGE_SHIFT-3)) @@ -189,7 +188,8 @@ extern unsigned long __zero_page(void); * and a page entry and page directory to the page they refer to. */ #define page_to_pa(page) (page_to_pfn(page) << PAGE_SHIFT) -#define pte_pfn(pte) (pte_val(pte) >> 32) +#define PFN_PTE_SHIFT 32 +#define pte_pfn(pte) (pte_val(pte) >> PFN_PTE_SHIFT) #define pte_page(pte) pfn_to_page(pte_pfn(pte)) #define mk_pte(page, pgprot) \ @@ -303,6 +303,12 @@ extern inline void update_mmu_cache(struct vm_area_struct * vma, { } +static inline void update_mmu_cache_range(struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long address, + pte_t *ptep, unsigned int nr) +{ +} + /* * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that * are !pte_none() && !pte_present(). From ac4cfaccedac891d29560ddfb64cb5c1e710e1e1 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:36 +0100 Subject: [PATCH 437/489] arc: implement the new page table range API Add PFN_PTE_SHIFT, update_mmu_cache_range(), flush_dcache_folio() and flush_icache_pages(). Change the PG_dc_clean flag from being per-page to per-folio (which means it cannot always be set as we don't know that all pages in this folio were cleaned). 
Enhance the internal flush routines to take the number of pages to flush. Link: https://lkml.kernel.org/r/20230802151406.3735276-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Vineet Gupta Signed-off-by: Andrew Morton --- arch/arc/include/asm/cacheflush.h | 7 ++- arch/arc/include/asm/pgtable-bits-arcv2.h | 12 ++--- arch/arc/include/asm/pgtable-levels.h | 1 + arch/arc/mm/cache.c | 61 ++++++++++++++--------- arch/arc/mm/tlb.c | 18 ++++--- 5 files changed, 59 insertions(+), 40 deletions(-) diff --git a/arch/arc/include/asm/cacheflush.h b/arch/arc/include/asm/cacheflush.h index e201b4b1655afe..04f65f58851070 100644 --- a/arch/arc/include/asm/cacheflush.h +++ b/arch/arc/include/asm/cacheflush.h @@ -25,17 +25,20 @@ * in update_mmu_cache() */ #define flush_icache_page(vma, page) +#define flush_icache_pages(vma, page, nr) void flush_cache_all(void); void flush_icache_range(unsigned long kstart, unsigned long kend); void __sync_icache_dcache(phys_addr_t paddr, unsigned long vaddr, int len); -void __inv_icache_page(phys_addr_t paddr, unsigned long vaddr); -void __flush_dcache_page(phys_addr_t paddr, unsigned long vaddr); +void __inv_icache_pages(phys_addr_t paddr, unsigned long vaddr, unsigned nr); +void __flush_dcache_pages(phys_addr_t paddr, unsigned long vaddr, unsigned nr); #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 void flush_dcache_page(struct page *page); +void flush_dcache_folio(struct folio *folio); +#define flush_dcache_folio flush_dcache_folio void dma_cache_wback_inv(phys_addr_t start, unsigned long sz); void dma_cache_inv(phys_addr_t start, unsigned long sz); diff --git a/arch/arc/include/asm/pgtable-bits-arcv2.h b/arch/arc/include/asm/pgtable-bits-arcv2.h index 6e9f8ca6d6a16c..ee78ab30958df7 100644 --- a/arch/arc/include/asm/pgtable-bits-arcv2.h +++ b/arch/arc/include/asm/pgtable-bits-arcv2.h @@ -100,14 +100,12 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) return __pte((pte_val(pte) & 
_PAGE_CHG_MASK) | pgprot_val(newprot)); } -static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval) -{ - set_pte(ptep, pteval); -} +struct vm_fault; +void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, unsigned int nr); -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, - pte_t *ptep); +#define update_mmu_cache(vma, addr, ptep) \ + update_mmu_cache_range(NULL, vma, addr, ptep, 1) /* * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that diff --git a/arch/arc/include/asm/pgtable-levels.h b/arch/arc/include/asm/pgtable-levels.h index ef68758b69f7e6..fc417c75c24db3 100644 --- a/arch/arc/include/asm/pgtable-levels.h +++ b/arch/arc/include/asm/pgtable-levels.h @@ -169,6 +169,7 @@ #define pte_ERROR(e) \ pr_crit("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e)) +#define PFN_PTE_SHIFT PAGE_SHIFT #define pte_none(x) (!pte_val(x)) #define pte_present(x) (pte_val(x) & _PAGE_PRESENT) #define pte_clear(mm,addr,ptep) set_pte_at(mm, addr, ptep, __pte(0)) diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c index 55c6de138eae03..3c16ee942a5c5a 100644 --- a/arch/arc/mm/cache.c +++ b/arch/arc/mm/cache.c @@ -752,17 +752,17 @@ static inline void arc_slc_enable(void) * There's a corollary case, where kernel READs from a userspace mapped page. * If the U-mapping is not congruent to K-mapping, former needs flushing. 
*/ -void flush_dcache_page(struct page *page) +void flush_dcache_folio(struct folio *folio) { struct address_space *mapping; if (!cache_is_vipt_aliasing()) { - clear_bit(PG_dc_clean, &page->flags); + clear_bit(PG_dc_clean, &folio->flags); return; } /* don't handle anon pages here */ - mapping = page_mapping_file(page); + mapping = folio_flush_mapping(folio); if (!mapping) return; @@ -771,17 +771,27 @@ void flush_dcache_page(struct page *page) * Make a note that K-mapping is dirty */ if (!mapping_mapped(mapping)) { - clear_bit(PG_dc_clean, &page->flags); - } else if (page_mapcount(page)) { - + clear_bit(PG_dc_clean, &folio->flags); + } else if (folio_mapped(folio)) { /* kernel reading from page with U-mapping */ - phys_addr_t paddr = (unsigned long)page_address(page); - unsigned long vaddr = page->index << PAGE_SHIFT; + phys_addr_t paddr = (unsigned long)folio_address(folio); + unsigned long vaddr = folio_pos(folio); + /* + * vaddr is not actually the virtual address, but is + * congruent to every user mapping. 
+ */ if (addr_not_cache_congruent(paddr, vaddr)) - __flush_dcache_page(paddr, vaddr); + __flush_dcache_pages(paddr, vaddr, + folio_nr_pages(folio)); } } +EXPORT_SYMBOL(flush_dcache_folio); + +void flush_dcache_page(struct page *page) +{ + return flush_dcache_folio(page_folio(page)); +} EXPORT_SYMBOL(flush_dcache_page); /* @@ -921,18 +931,18 @@ void __sync_icache_dcache(phys_addr_t paddr, unsigned long vaddr, int len) } /* wrapper to compile time eliminate alignment checks in flush loop */ -void __inv_icache_page(phys_addr_t paddr, unsigned long vaddr) +void __inv_icache_pages(phys_addr_t paddr, unsigned long vaddr, unsigned nr) { - __ic_line_inv_vaddr(paddr, vaddr, PAGE_SIZE); + __ic_line_inv_vaddr(paddr, vaddr, nr * PAGE_SIZE); } /* * wrapper to clearout kernel or userspace mappings of a page * For kernel mappings @vaddr == @paddr */ -void __flush_dcache_page(phys_addr_t paddr, unsigned long vaddr) +void __flush_dcache_pages(phys_addr_t paddr, unsigned long vaddr, unsigned nr) { - __dc_line_op(paddr, vaddr & PAGE_MASK, PAGE_SIZE, OP_FLUSH_N_INV); + __dc_line_op(paddr, vaddr & PAGE_MASK, nr * PAGE_SIZE, OP_FLUSH_N_INV); } noinline void flush_cache_all(void) @@ -962,10 +972,10 @@ void flush_cache_page(struct vm_area_struct *vma, unsigned long u_vaddr, u_vaddr &= PAGE_MASK; - __flush_dcache_page(paddr, u_vaddr); + __flush_dcache_pages(paddr, u_vaddr, 1); if (vma->vm_flags & VM_EXEC) - __inv_icache_page(paddr, u_vaddr); + __inv_icache_pages(paddr, u_vaddr, 1); } void flush_cache_range(struct vm_area_struct *vma, unsigned long start, @@ -978,9 +988,9 @@ void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long u_vaddr) { /* TBD: do we really need to clear the kernel mapping */ - __flush_dcache_page((phys_addr_t)page_address(page), u_vaddr); - __flush_dcache_page((phys_addr_t)page_address(page), - (phys_addr_t)page_address(page)); + __flush_dcache_pages((phys_addr_t)page_address(page), u_vaddr, 1); + 
__flush_dcache_pages((phys_addr_t)page_address(page), + (phys_addr_t)page_address(page), 1); } @@ -989,6 +999,8 @@ void flush_anon_page(struct vm_area_struct *vma, struct page *page, void copy_user_highpage(struct page *to, struct page *from, unsigned long u_vaddr, struct vm_area_struct *vma) { + struct folio *src = page_folio(from); + struct folio *dst = page_folio(to); void *kfrom = kmap_atomic(from); void *kto = kmap_atomic(to); int clean_src_k_mappings = 0; @@ -1005,7 +1017,7 @@ void copy_user_highpage(struct page *to, struct page *from, * addr_not_cache_congruent() is 0 */ if (page_mapcount(from) && addr_not_cache_congruent(kfrom, u_vaddr)) { - __flush_dcache_page((unsigned long)kfrom, u_vaddr); + __flush_dcache_pages((unsigned long)kfrom, u_vaddr, 1); clean_src_k_mappings = 1; } @@ -1019,17 +1031,17 @@ void copy_user_highpage(struct page *to, struct page *from, * non copied user pages (e.g. read faults which wire in pagecache page * directly). */ - clear_bit(PG_dc_clean, &to->flags); + clear_bit(PG_dc_clean, &dst->flags); /* * if SRC was already usermapped and non-congruent to kernel mapping * sync the kernel mapping back to physical page */ if (clean_src_k_mappings) { - __flush_dcache_page((unsigned long)kfrom, (unsigned long)kfrom); - set_bit(PG_dc_clean, &from->flags); + __flush_dcache_pages((unsigned long)kfrom, + (unsigned long)kfrom, 1); } else { - clear_bit(PG_dc_clean, &from->flags); + clear_bit(PG_dc_clean, &src->flags); } kunmap_atomic(kto); @@ -1038,8 +1050,9 @@ void copy_user_highpage(struct page *to, struct page *from, void clear_user_page(void *to, unsigned long u_vaddr, struct page *page) { + struct folio *folio = page_folio(page); clear_page(to); - clear_bit(PG_dc_clean, &page->flags); + clear_bit(PG_dc_clean, &folio->flags); } EXPORT_SYMBOL(clear_user_page); diff --git a/arch/arc/mm/tlb.c b/arch/arc/mm/tlb.c index 5f71445f26bded..6f40f37e655010 100644 --- a/arch/arc/mm/tlb.c +++ b/arch/arc/mm/tlb.c @@ -467,8 +467,8 @@ void create_tlb(struct 
vm_area_struct *vma, unsigned long vaddr, pte_t *ptep) * Note that flush (when done) involves both WBACK - so physical page is * in sync as well as INV - so any non-congruent aliases don't remain */ -void update_mmu_cache(struct vm_area_struct *vma, unsigned long vaddr_unaligned, - pte_t *ptep) +void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, + unsigned long vaddr_unaligned, pte_t *ptep, unsigned int nr) { unsigned long vaddr = vaddr_unaligned & PAGE_MASK; phys_addr_t paddr = pte_val(*ptep) & PAGE_MASK_PHYS; @@ -491,15 +491,19 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long vaddr_unaligned, */ if ((vma->vm_flags & VM_EXEC) || addr_not_cache_congruent(paddr, vaddr)) { - - int dirty = !test_and_set_bit(PG_dc_clean, &page->flags); + struct folio *folio = page_folio(page); + int dirty = !test_and_set_bit(PG_dc_clean, &folio->flags); if (dirty) { + unsigned long offset = offset_in_folio(folio, paddr); + nr = folio_nr_pages(folio); + paddr -= offset; + vaddr -= offset; /* wback + inv dcache lines (K-mapping) */ - __flush_dcache_page(paddr, paddr); + __flush_dcache_pages(paddr, paddr, nr); /* invalidate any existing icache lines (U-mapping) */ if (vma->vm_flags & VM_EXEC) - __inv_icache_page(paddr, vaddr); + __inv_icache_pages(paddr, vaddr, nr); } } } @@ -531,7 +535,7 @@ void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd) { pte_t pte = __pte(pmd_val(*pmd)); - update_mmu_cache(vma, addr, &pte); + update_mmu_cache_range(NULL, vma, addr, &pte, HPAGE_PMD_NR); } void local_flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start, From 8b5989f3333717273d02ab87ba8781f72a6783ab Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:37 +0100 Subject: [PATCH 438/489] arm: implement the new page table range API Add set_ptes(), update_mmu_cache_range(), flush_dcache_folio() and flush_icache_pages(). 
Change the PG_dcache_clean flag from being per-page to per-folio which makes __dma_page_dev_to_cpu() a bit more exciting. Also add flush_cache_pages(), even though this isn't used by generic code (yet?) [m.szyprowski@samsung.com: fix potential endless loop in __dma_page_dev_to_cpu()] Link: https://lkml.kernel.org/r/20230809172737.3574190-1-m.szyprowski@samsung.com [willy@infradead.org: fix folio conversion in __dma_page_dev_to_cpu()] Link: https://lkml.kernel.org/r/20230823191852.1556561-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230802151406.3735276-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Marek Szyprowski Acked-by: Mike Rapoport (IBM) Reviewed-by: Russell King (Oracle) Signed-off-by: Andrew Morton --- arch/arm/include/asm/cacheflush.h | 24 +++++--- arch/arm/include/asm/pgtable.h | 5 +- arch/arm/include/asm/tlbflush.h | 14 +++-- arch/arm/mm/copypage-v4mc.c | 5 +- arch/arm/mm/copypage-v6.c | 5 +- arch/arm/mm/copypage-xscale.c | 5 +- arch/arm/mm/dma-mapping.c | 28 +++++---- arch/arm/mm/fault-armv.c | 16 ++--- arch/arm/mm/flush.c | 99 +++++++++++++++++++------------ arch/arm/mm/mm.h | 2 +- arch/arm/mm/mmu.c | 14 +++-- arch/arm/mm/nommu.c | 6 ++ 12 files changed, 136 insertions(+), 87 deletions(-) diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index a094f964c86929..841e268d23747c 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -231,14 +231,15 @@ vivt_flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned vma->vm_flags); } -static inline void -vivt_flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr, unsigned long pfn) +static inline void vivt_flush_cache_pages(struct vm_area_struct *vma, + unsigned long user_addr, unsigned long pfn, unsigned int nr) { struct mm_struct *mm = vma->vm_mm; if (!mm || cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm))) { unsigned long addr = user_addr & PAGE_MASK; -
__cpuc_flush_user_range(addr, addr + PAGE_SIZE, vma->vm_flags); + __cpuc_flush_user_range(addr, addr + nr * PAGE_SIZE, + vma->vm_flags); } } @@ -247,15 +248,17 @@ vivt_flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr, unsig vivt_flush_cache_mm(mm) #define flush_cache_range(vma,start,end) \ vivt_flush_cache_range(vma,start,end) -#define flush_cache_page(vma,addr,pfn) \ - vivt_flush_cache_page(vma,addr,pfn) +#define flush_cache_pages(vma, addr, pfn, nr) \ + vivt_flush_cache_pages(vma, addr, pfn, nr) #else -extern void flush_cache_mm(struct mm_struct *mm); -extern void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); -extern void flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr, unsigned long pfn); +void flush_cache_mm(struct mm_struct *mm); +void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); +void flush_cache_pages(struct vm_area_struct *vma, unsigned long user_addr, + unsigned long pfn, unsigned int nr); #endif #define flush_cache_dup_mm(mm) flush_cache_mm(mm) +#define flush_cache_page(vma, addr, pfn) flush_cache_pages(vma, addr, pfn, 1) /* * flush_icache_user_range is used when we want to ensure that the @@ -289,7 +292,9 @@ extern void flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr * See update_mmu_cache for the user space part. */ #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 -extern void flush_dcache_page(struct page *); +void flush_dcache_page(struct page *); +void flush_dcache_folio(struct folio *folio); +#define flush_dcache_folio flush_dcache_folio #define ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE 1 static inline void flush_kernel_vmap_range(void *addr, int size) @@ -321,6 +326,7 @@ static inline void flush_anon_page(struct vm_area_struct *vma, * duplicate cache flushing elsewhere performed by flush_dcache_page(). 
*/ #define flush_icache_page(vma,page) do { } while (0) +#define flush_icache_pages(vma, page, nr) do { } while (0) /* * flush_cache_vmap() is used when creating mappings (eg, via vmap, diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index 34662a9d4cabf7..ba573f22d7cc07 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -207,8 +207,9 @@ static inline void __sync_icache_dcache(pte_t pteval) extern void __sync_icache_dcache(pte_t pteval); #endif -void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval); +void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval, unsigned int nr); +#define set_ptes set_ptes static inline pte_t clear_pte_bit(pte_t pte, pgprot_t prot) { diff --git a/arch/arm/include/asm/tlbflush.h b/arch/arm/include/asm/tlbflush.h index 0ccc985b90af0e..38c6e4a2a0b60f 100644 --- a/arch/arm/include/asm/tlbflush.h +++ b/arch/arm/include/asm/tlbflush.h @@ -619,18 +619,22 @@ extern void flush_bp_all(void); * If PG_dcache_clean is not set for the page, we need to ensure that any * cache entries for the kernels virtual memory range are written * back to the page. On ARMv6 and later, the cache coherency is handled via - * the set_pte_at() function. + * the set_ptes() function. 
*/ #if __LINUX_ARM_ARCH__ < 6 -extern void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, - pte_t *ptep); +void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, unsigned int nr); #else -static inline void update_mmu_cache(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +static inline void update_mmu_cache_range(struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, + unsigned int nr) { } #endif +#define update_mmu_cache(vma, addr, ptep) \ + update_mmu_cache_range(NULL, vma, addr, ptep, 1) + #define update_mmu_cache_pmd(vma, address, pmd) do { } while (0) #endif diff --git a/arch/arm/mm/copypage-v4mc.c b/arch/arm/mm/copypage-v4mc.c index f1da3b439b968a..7ddd82b9fe8b2f 100644 --- a/arch/arm/mm/copypage-v4mc.c +++ b/arch/arm/mm/copypage-v4mc.c @@ -64,10 +64,11 @@ static void mc_copy_user_page(void *from, void *to) void v4_mc_copy_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { + struct folio *src = page_folio(from); void *kto = kmap_atomic(to); - if (!test_and_set_bit(PG_dcache_clean, &from->flags)) - __flush_dcache_page(page_mapping_file(from), from); + if (!test_and_set_bit(PG_dcache_clean, &src->flags)) + __flush_dcache_folio(folio_flush_mapping(src), src); raw_spin_lock(&minicache_lock); diff --git a/arch/arm/mm/copypage-v6.c b/arch/arm/mm/copypage-v6.c index d8a115de5507ae..a1a71f36d85027 100644 --- a/arch/arm/mm/copypage-v6.c +++ b/arch/arm/mm/copypage-v6.c @@ -69,11 +69,12 @@ static void discard_old_kernel_data(void *kto) static void v6_copy_user_highpage_aliasing(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { + struct folio *src = page_folio(from); unsigned int offset = CACHE_COLOUR(vaddr); unsigned long kfrom, kto; - if (!test_and_set_bit(PG_dcache_clean, &from->flags)) - __flush_dcache_page(page_mapping_file(from), from); + if 
(!test_and_set_bit(PG_dcache_clean, &src->flags)) + __flush_dcache_folio(folio_flush_mapping(src), src); /* FIXME: not highmem safe */ discard_old_kernel_data(page_address(to)); diff --git a/arch/arm/mm/copypage-xscale.c b/arch/arm/mm/copypage-xscale.c index bcb485620a05f0..f1e29d3e81930d 100644 --- a/arch/arm/mm/copypage-xscale.c +++ b/arch/arm/mm/copypage-xscale.c @@ -84,10 +84,11 @@ static void mc_copy_user_page(void *from, void *to) void xscale_mc_copy_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { + struct folio *src = page_folio(from); void *kto = kmap_atomic(to); - if (!test_and_set_bit(PG_dcache_clean, &from->flags)) - __flush_dcache_page(page_mapping_file(from), from); + if (!test_and_set_bit(PG_dcache_clean, &src->flags)) + __flush_dcache_folio(folio_flush_mapping(src), src); raw_spin_lock(&minicache_lock); diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 033a1bce2b175e..5409225b4abc06 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -709,19 +709,21 @@ static void __dma_page_dev_to_cpu(struct page *page, unsigned long off, * Mark the D-cache clean for these pages to avoid extra flushing. 
*/ if (dir != DMA_TO_DEVICE && size >= PAGE_SIZE) { - unsigned long pfn; - size_t left = size; - - pfn = page_to_pfn(page) + off / PAGE_SIZE; - off %= PAGE_SIZE; - if (off) { - pfn++; - left -= PAGE_SIZE - off; - } - while (left >= PAGE_SIZE) { - page = pfn_to_page(pfn++); - set_bit(PG_dcache_clean, &page->flags); - left -= PAGE_SIZE; + struct folio *folio = pfn_folio(paddr / PAGE_SIZE); + size_t offset = offset_in_folio(folio, paddr); + + for (;;) { + size_t sz = folio_size(folio) - offset; + + if (size < sz) + break; + if (!offset) + set_bit(PG_dcache_clean, &folio->flags); + offset = 0; + size -= sz; + if (!size) + break; + folio = folio_next(folio); } } } diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c index 7cb12549797621..2286c2ea60ec48 100644 --- a/arch/arm/mm/fault-armv.c +++ b/arch/arm/mm/fault-armv.c @@ -180,12 +180,12 @@ make_coherent(struct address_space *mapping, struct vm_area_struct *vma, * * Note that the pte lock will be held. */ -void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, - pte_t *ptep) +void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, unsigned int nr) { unsigned long pfn = pte_pfn(*ptep); struct address_space *mapping; - struct page *page; + struct folio *folio; if (!pfn_valid(pfn)) return; @@ -194,13 +194,13 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, * The zero page is never written to, so never has any dirty * cache lines, and therefore never needs to be flushed. 
*/ - page = pfn_to_page(pfn); - if (page == ZERO_PAGE(0)) + if (is_zero_pfn(pfn)) return; - mapping = page_mapping_file(page); - if (!test_and_set_bit(PG_dcache_clean, &page->flags)) - __flush_dcache_page(mapping, page); + folio = page_folio(pfn_to_page(pfn)); + mapping = folio_flush_mapping(folio); + if (!test_and_set_bit(PG_dcache_clean, &folio->flags)) + __flush_dcache_folio(mapping, folio); if (mapping) { if (cache_is_vivt()) make_coherent(mapping, vma, addr, ptep, pfn); diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c index 2508be91b7a006..d19d140a10c7d5 100644 --- a/arch/arm/mm/flush.c +++ b/arch/arm/mm/flush.c @@ -95,10 +95,10 @@ void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned __flush_icache_all(); } -void flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr, unsigned long pfn) +void flush_cache_pages(struct vm_area_struct *vma, unsigned long user_addr, unsigned long pfn, unsigned int nr) { if (cache_is_vivt()) { - vivt_flush_cache_page(vma, user_addr, pfn); + vivt_flush_cache_pages(vma, user_addr, pfn, nr); return; } @@ -196,29 +196,31 @@ void copy_to_user_page(struct vm_area_struct *vma, struct page *page, #endif } -void __flush_dcache_page(struct address_space *mapping, struct page *page) +void __flush_dcache_folio(struct address_space *mapping, struct folio *folio) { /* * Writeback any data associated with the kernel mapping of this * page. This ensures that data in the physical page is mutually * coherent with the kernels mapping. 
*/ - if (!PageHighMem(page)) { - __cpuc_flush_dcache_area(page_address(page), page_size(page)); + if (!folio_test_highmem(folio)) { + __cpuc_flush_dcache_area(folio_address(folio), + folio_size(folio)); } else { unsigned long i; if (cache_is_vipt_nonaliasing()) { - for (i = 0; i < compound_nr(page); i++) { - void *addr = kmap_atomic(page + i); + for (i = 0; i < folio_nr_pages(folio); i++) { + void *addr = kmap_local_folio(folio, + i * PAGE_SIZE); __cpuc_flush_dcache_area(addr, PAGE_SIZE); - kunmap_atomic(addr); + kunmap_local(addr); } } else { - for (i = 0; i < compound_nr(page); i++) { - void *addr = kmap_high_get(page + i); + for (i = 0; i < folio_nr_pages(folio); i++) { + void *addr = kmap_high_get(folio_page(folio, i)); if (addr) { __cpuc_flush_dcache_area(addr, PAGE_SIZE); - kunmap_high(page + i); + kunmap_high(folio_page(folio, i)); } } } @@ -230,15 +232,14 @@ void __flush_dcache_page(struct address_space *mapping, struct page *page) * userspace colour, which is congruent with page->index. */ if (mapping && cache_is_vipt_aliasing()) - flush_pfn_alias(page_to_pfn(page), - page->index << PAGE_SHIFT); + flush_pfn_alias(folio_pfn(folio), folio_pos(folio)); } -static void __flush_dcache_aliases(struct address_space *mapping, struct page *page) +static void __flush_dcache_aliases(struct address_space *mapping, struct folio *folio) { struct mm_struct *mm = current->active_mm; - struct vm_area_struct *mpnt; - pgoff_t pgoff; + struct vm_area_struct *vma; + pgoff_t pgoff, pgoff_end; /* * There are possible user space mappings of this page: @@ -246,21 +247,36 @@ static void __flush_dcache_aliases(struct address_space *mapping, struct page *p * data in the current VM view associated with this page. * - aliasing VIPT: we only need to find one mapping of this page. 
*/ - pgoff = page->index; + pgoff = folio->index; + pgoff_end = pgoff + folio_nr_pages(folio) - 1; flush_dcache_mmap_lock(mapping); - vma_interval_tree_foreach(mpnt, &mapping->i_mmap, pgoff, pgoff) { - unsigned long offset; + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff_end) { + unsigned long start, offset, pfn; + unsigned int nr; /* * If this VMA is not in our MM, we can ignore it. */ - if (mpnt->vm_mm != mm) + if (vma->vm_mm != mm) continue; - if (!(mpnt->vm_flags & VM_MAYSHARE)) + if (!(vma->vm_flags & VM_MAYSHARE)) continue; - offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT; - flush_cache_page(mpnt, mpnt->vm_start + offset, page_to_pfn(page)); + + start = vma->vm_start; + pfn = folio_pfn(folio); + nr = folio_nr_pages(folio); + offset = pgoff - vma->vm_pgoff; + if (offset > -nr) { + pfn -= offset; + nr += offset; + } else { + start += offset * PAGE_SIZE; + } + if (start + nr * PAGE_SIZE > vma->vm_end) + nr = (vma->vm_end - start) / PAGE_SIZE; + + flush_cache_pages(vma, start, pfn, nr); } flush_dcache_mmap_unlock(mapping); } @@ -269,7 +285,7 @@ static void __flush_dcache_aliases(struct address_space *mapping, struct page *p void __sync_icache_dcache(pte_t pteval) { unsigned long pfn; - struct page *page; + struct folio *folio; struct address_space *mapping; if (cache_is_vipt_nonaliasing() && !pte_exec(pteval)) @@ -279,14 +295,14 @@ void __sync_icache_dcache(pte_t pteval) if (!pfn_valid(pfn)) return; - page = pfn_to_page(pfn); + folio = page_folio(pfn_to_page(pfn)); if (cache_is_vipt_aliasing()) - mapping = page_mapping_file(page); + mapping = folio_flush_mapping(folio); else mapping = NULL; - if (!test_and_set_bit(PG_dcache_clean, &page->flags)) - __flush_dcache_page(mapping, page); + if (!test_and_set_bit(PG_dcache_clean, &folio->flags)) + __flush_dcache_folio(mapping, folio); if (pte_exec(pteval)) __flush_icache_all(); @@ -312,7 +328,7 @@ void __sync_icache_dcache(pte_t pteval) * Note that we disable the lazy flush for SMP configurations where 
* the cache maintenance operations are not automatically broadcasted. */ -void flush_dcache_page(struct page *page) +void flush_dcache_folio(struct folio *folio) { struct address_space *mapping; @@ -320,31 +336,36 @@ void flush_dcache_page(struct page *page) * The zero page is never written to, so never has any dirty * cache lines, and therefore never needs to be flushed. */ - if (page == ZERO_PAGE(0)) + if (is_zero_pfn(folio_pfn(folio))) return; if (!cache_ops_need_broadcast() && cache_is_vipt_nonaliasing()) { - if (test_bit(PG_dcache_clean, &page->flags)) - clear_bit(PG_dcache_clean, &page->flags); + if (test_bit(PG_dcache_clean, &folio->flags)) + clear_bit(PG_dcache_clean, &folio->flags); return; } - mapping = page_mapping_file(page); + mapping = folio_flush_mapping(folio); if (!cache_ops_need_broadcast() && - mapping && !page_mapcount(page)) - clear_bit(PG_dcache_clean, &page->flags); + mapping && !folio_mapped(folio)) + clear_bit(PG_dcache_clean, &folio->flags); else { - __flush_dcache_page(mapping, page); + __flush_dcache_folio(mapping, folio); if (mapping && cache_is_vivt()) - __flush_dcache_aliases(mapping, page); + __flush_dcache_aliases(mapping, folio); else if (mapping) __flush_icache_all(); - set_bit(PG_dcache_clean, &page->flags); + set_bit(PG_dcache_clean, &folio->flags); } } -EXPORT_SYMBOL(flush_dcache_page); +EXPORT_SYMBOL(flush_dcache_folio); +void flush_dcache_page(struct page *page) +{ + flush_dcache_folio(page_folio(page)); +} +EXPORT_SYMBOL(flush_dcache_page); /* * Flush an anonymous page so that users of get_user_pages() * can safely access the data. 
The expected sequence is: diff --git a/arch/arm/mm/mm.h b/arch/arm/mm/mm.h index d7ffccb7fea7d6..41931631671161 100644 --- a/arch/arm/mm/mm.h +++ b/arch/arm/mm/mm.h @@ -45,7 +45,7 @@ struct mem_type { const struct mem_type *get_mem_type(unsigned int type); -extern void __flush_dcache_page(struct address_space *mapping, struct page *page); +void __flush_dcache_folio(struct address_space *mapping, struct folio *folio); /* * ARM specific vm_struct->flags bits. diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index fdeaee30d167bf..674ed71573a84c 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -1789,7 +1789,7 @@ void __init paging_init(const struct machine_desc *mdesc) bootmem_init(); empty_zero_page = virt_to_page(zero_page); - __flush_dcache_page(NULL, empty_zero_page); + __flush_dcache_folio(NULL, page_folio(empty_zero_page)); } void __init early_mm_init(const struct machine_desc *mdesc) @@ -1798,8 +1798,8 @@ void __init early_mm_init(const struct machine_desc *mdesc) early_paging_init(mdesc); } -void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval) +void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval, unsigned int nr) { unsigned long ext = 0; @@ -1809,5 +1809,11 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, ext |= PTE_EXT_NG; } - set_pte_ext(ptep, pteval, ext); + for (;;) { + set_pte_ext(ptep, pteval, ext); + if (--nr == 0) + break; + ptep++; + pte_val(pteval) += PAGE_SIZE; + } } diff --git a/arch/arm/mm/nommu.c b/arch/arm/mm/nommu.c index 43cfd06bbebad2..c415f3859b20e0 100644 --- a/arch/arm/mm/nommu.c +++ b/arch/arm/mm/nommu.c @@ -180,6 +180,12 @@ void setup_mm_for_reboot(void) { } +void flush_dcache_folio(struct folio *folio) +{ + __cpuc_flush_dcache_area(folio_address(folio), folio_size(folio)); +} +EXPORT_SYMBOL(flush_dcache_folio); + void flush_dcache_page(struct page *page) { __cpuc_flush_dcache_area(page_address(page), PAGE_SIZE); From 
4a169d61c2ede9fdf27103e1f454d4a0401d9025 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:38 +0100 Subject: [PATCH 439/489] arm64: implement the new page table range API Add set_ptes(), update_mmu_cache_range() and flush_dcache_folio(). Change the PG_dcache_clean flag from being per-page to per-folio. Link: https://lkml.kernel.org/r/20230802151406.3735276-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Catalin Marinas Acked-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- arch/arm64/include/asm/cacheflush.h | 4 +++- arch/arm64/include/asm/pgtable.h | 26 +++++++++++++++------ arch/arm64/mm/flush.c | 36 +++++++++++------------------ 3 files changed, 36 insertions(+), 30 deletions(-) diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 37185e978aeb20..d115451ed263d9 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -114,7 +114,7 @@ extern void copy_to_user_page(struct vm_area_struct *, struct page *, #define copy_to_user_page copy_to_user_page /* - * flush_dcache_page is used when the kernel has written to the page + * flush_dcache_folio is used when the kernel has written to the page * cache page at virtual address page->virtual. 
* * If this page isn't mapped (ie, page_mapping == NULL), or it might @@ -127,6 +127,8 @@ extern void copy_to_user_page(struct vm_area_struct *, struct page *, */ #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 extern void flush_dcache_page(struct page *); +void flush_dcache_folio(struct folio *); +#define flush_dcache_folio flush_dcache_folio static __always_inline void icache_inval_all_pou(void) { diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 445b18d7a47c46..76bba654b5d776 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -345,12 +345,21 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, set_pte(ptep, pte); } -static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - page_table_check_ptes_set(mm, ptep, pte, 1); - return __set_pte_at(mm, addr, ptep, pte); +static inline void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr) +{ + page_table_check_ptes_set(mm, ptep, pte, nr); + + for (;;) { + __set_pte_at(mm, addr, ptep, pte); + if (--nr == 0) + break; + ptep++; + addr += PAGE_SIZE; + pte_val(pte) += PAGE_SIZE; + } } +#define set_ptes set_ptes /* * Huge pte definitions. @@ -1049,8 +1058,9 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) /* * On AArch64, the cache coherency is handled via the set_pte_at() function. 
*/ -static inline void update_mmu_cache(struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep) +static inline void update_mmu_cache_range(struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, + unsigned int nr) { /* * We don't do anything here, so there's a very small chance of @@ -1059,6 +1069,8 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, */ } +#define update_mmu_cache(vma, addr, ptep) \ + update_mmu_cache_range(NULL, vma, addr, ptep, 1) #define update_mmu_cache_pmd(vma, address, pmd) do { } while (0) #ifdef CONFIG_ARM64_PA_BITS_52 diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index 4e647609495222..013eead9b69506 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -51,20 +51,13 @@ void copy_to_user_page(struct vm_area_struct *vma, struct page *page, void __sync_icache_dcache(pte_t pte) { - struct page *page = pte_page(pte); + struct folio *folio = page_folio(pte_page(pte)); - /* - * HugeTLB pages are always fully mapped, so only setting head page's - * PG_dcache_clean flag is enough. - */ - if (PageHuge(page)) - page = compound_head(page); - - if (!test_bit(PG_dcache_clean, &page->flags)) { - sync_icache_aliases((unsigned long)page_address(page), - (unsigned long)page_address(page) + - page_size(page)); - set_bit(PG_dcache_clean, &page->flags); + if (!test_bit(PG_dcache_clean, &folio->flags)) { + sync_icache_aliases((unsigned long)folio_address(folio), + (unsigned long)folio_address(folio) + + folio_size(folio)); + set_bit(PG_dcache_clean, &folio->flags); } } EXPORT_SYMBOL_GPL(__sync_icache_dcache); @@ -74,17 +67,16 @@ EXPORT_SYMBOL_GPL(__sync_icache_dcache); * it as dirty for later flushing when mapped in user space (if executable, * see __sync_icache_dcache). 
*/ -void flush_dcache_page(struct page *page) +void flush_dcache_folio(struct folio *folio) { - /* - * HugeTLB pages are always fully mapped and only head page will be - * set PG_dcache_clean (see comments in __sync_icache_dcache()). - */ - if (PageHuge(page)) - page = compound_head(page); + if (test_bit(PG_dcache_clean, &folio->flags)) + clear_bit(PG_dcache_clean, &folio->flags); +} +EXPORT_SYMBOL(flush_dcache_folio); - if (test_bit(PG_dcache_clean, &page->flags)) - clear_bit(PG_dcache_clean, &page->flags); +void flush_dcache_page(struct page *page) +{ + flush_dcache_folio(page_folio(page)); } EXPORT_SYMBOL(flush_dcache_page); From e724e7aaf9ca794670a4d4931af7a7e24e37fec3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:39 +0100 Subject: [PATCH 440/489] csky: implement the new page table range API Add PFN_PTE_SHIFT, update_mmu_cache_range() and flush_dcache_folio(). Change the PG_dcache_clean flag from being per-page to per-folio. Link: https://lkml.kernel.org/r/20230802151406.3735276-12-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Guo Ren Acked-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- arch/csky/abiv1/cacheflush.c | 32 +++++++++++++++++----------- arch/csky/abiv1/inc/abi/cacheflush.h | 2 ++ arch/csky/abiv2/cacheflush.c | 32 ++++++++++++++-------------- arch/csky/abiv2/inc/abi/cacheflush.h | 10 +++++++-- arch/csky/include/asm/pgtable.h | 8 ++++--- 5 files changed, 50 insertions(+), 34 deletions(-) diff --git a/arch/csky/abiv1/cacheflush.c b/arch/csky/abiv1/cacheflush.c index 94fbc03cbe703d..171e8fb32285db 100644 --- a/arch/csky/abiv1/cacheflush.c +++ b/arch/csky/abiv1/cacheflush.c @@ -15,45 +15,51 @@ #define PG_dcache_clean PG_arch_1 -void flush_dcache_page(struct page *page) +void flush_dcache_folio(struct folio *folio) { struct address_space *mapping; - if (page == ZERO_PAGE(0)) + if (is_zero_pfn(folio_pfn(folio))) return; - mapping = page_mapping_file(page); + mapping = 
folio_flush_mapping(folio); - if (mapping && !page_mapcount(page)) - clear_bit(PG_dcache_clean, &page->flags); + if (mapping && !folio_mapped(folio)) + clear_bit(PG_dcache_clean, &folio->flags); else { dcache_wbinv_all(); if (mapping) icache_inv_all(); - set_bit(PG_dcache_clean, &page->flags); + set_bit(PG_dcache_clean, &folio->flags); } } +EXPORT_SYMBOL(flush_dcache_folio); + +void flush_dcache_page(struct page *page) +{ + flush_dcache_folio(page_folio(page)); +} EXPORT_SYMBOL(flush_dcache_page); -void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, - pte_t *ptep) +void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, unsigned int nr) { unsigned long pfn = pte_pfn(*ptep); - struct page *page; + struct folio *folio; flush_tlb_page(vma, addr); if (!pfn_valid(pfn)) return; - page = pfn_to_page(pfn); - if (page == ZERO_PAGE(0)) + if (is_zero_pfn(pfn)) return; - if (!test_and_set_bit(PG_dcache_clean, &page->flags)) + folio = page_folio(pfn_to_page(pfn)); + if (!test_and_set_bit(PG_dcache_clean, &folio->flags)) dcache_wbinv_all(); - if (page_mapping_file(page)) { + if (folio_flush_mapping(folio)) { if (vma->vm_flags & VM_EXEC) icache_inv_all(); } diff --git a/arch/csky/abiv1/inc/abi/cacheflush.h b/arch/csky/abiv1/inc/abi/cacheflush.h index ed62e2066ba761..0d6cb65624c43b 100644 --- a/arch/csky/abiv1/inc/abi/cacheflush.h +++ b/arch/csky/abiv1/inc/abi/cacheflush.h @@ -9,6 +9,8 @@ #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 extern void flush_dcache_page(struct page *); +void flush_dcache_folio(struct folio *); +#define flush_dcache_folio flush_dcache_folio #define flush_cache_mm(mm) dcache_wbinv_all() #define flush_cache_page(vma, page, pfn) cache_wbinv_all() diff --git a/arch/csky/abiv2/cacheflush.c b/arch/csky/abiv2/cacheflush.c index 9923cd24db5832..d05a551af5d532 100644 --- a/arch/csky/abiv2/cacheflush.c +++ b/arch/csky/abiv2/cacheflush.c @@ -7,32 +7,32 @@ #include #include -void 
update_mmu_cache(struct vm_area_struct *vma, unsigned long address, - pte_t *pte) +void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, + unsigned long address, pte_t *pte, unsigned int nr) { - unsigned long addr; - struct page *page; + unsigned long pfn = pte_pfn(*pte); + struct folio *folio; + unsigned int i; flush_tlb_page(vma, address); - if (!pfn_valid(pte_pfn(*pte))) + if (!pfn_valid(pfn)) return; - page = pfn_to_page(pte_pfn(*pte)); - if (page == ZERO_PAGE(0)) - return; + folio = page_folio(pfn_to_page(pfn)); - if (test_and_set_bit(PG_dcache_clean, &page->flags)) + if (test_and_set_bit(PG_dcache_clean, &folio->flags)) return; - addr = (unsigned long) kmap_atomic(page); - - dcache_wb_range(addr, addr + PAGE_SIZE); + for (i = 0; i < folio_nr_pages(folio); i++) { + unsigned long addr = (unsigned long) kmap_local_folio(folio, + i * PAGE_SIZE); - if (vma->vm_flags & VM_EXEC) - icache_inv_range(addr, addr + PAGE_SIZE); - - kunmap_atomic((void *) addr); + dcache_wb_range(addr, addr + PAGE_SIZE); + if (vma->vm_flags & VM_EXEC) + icache_inv_range(addr, addr + PAGE_SIZE); + kunmap_local((void *) addr); + } } void flush_icache_deferred(struct mm_struct *mm) diff --git a/arch/csky/abiv2/inc/abi/cacheflush.h b/arch/csky/abiv2/inc/abi/cacheflush.h index a565e00c3f70b2..9c728933a7764d 100644 --- a/arch/csky/abiv2/inc/abi/cacheflush.h +++ b/arch/csky/abiv2/inc/abi/cacheflush.h @@ -18,11 +18,17 @@ #define PG_dcache_clean PG_arch_1 +static inline void flush_dcache_folio(struct folio *folio) +{ + if (test_bit(PG_dcache_clean, &folio->flags)) + clear_bit(PG_dcache_clean, &folio->flags); +} +#define flush_dcache_folio flush_dcache_folio + #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 static inline void flush_dcache_page(struct page *page) { - if (test_bit(PG_dcache_clean, &page->flags)) - clear_bit(PG_dcache_clean, &page->flags); + flush_dcache_folio(page_folio(page)); } #define flush_dcache_mmap_lock(mapping) do { } while (0) diff --git 
a/arch/csky/include/asm/pgtable.h b/arch/csky/include/asm/pgtable.h index d4042495febc06..42405037c87122 100644 --- a/arch/csky/include/asm/pgtable.h +++ b/arch/csky/include/asm/pgtable.h @@ -28,6 +28,7 @@ #define pgd_ERROR(e) \ pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) +#define PFN_PTE_SHIFT PAGE_SHIFT #define pmd_pfn(pmd) (pmd_phys(pmd) >> PAGE_SHIFT) #define pmd_page(pmd) (pfn_to_page(pmd_phys(pmd) >> PAGE_SHIFT)) #define pte_clear(mm, addr, ptep) set_pte((ptep), \ @@ -90,7 +91,6 @@ static inline void set_pte(pte_t *p, pte_t pte) /* prevent out of order excution */ smp_mb(); } -#define set_pte_at(mm, addr, ptep, pteval) set_pte(ptep, pteval) static inline pte_t *pmd_page_vaddr(pmd_t pmd) { @@ -263,8 +263,10 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; extern void paging_init(void); -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, - pte_t *pte); +void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, + unsigned long address, pte_t *pte, unsigned int nr); +#define update_mmu_cache(vma, addr, ptep) \ + update_mmu_cache_range(NULL, vma, addr, ptep, 1) #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ remap_pfn_range(vma, vaddr, pfn, size, prot) From 9ff633944165d11c53c088d9596db3da66e90396 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:40 +0100 Subject: [PATCH 441/489] hexagon: implement the new page table range API Add PFN_PTE_SHIFT and update_mmu_cache_range(). 
Link: https://lkml.kernel.org/r/20230802151406.3735276-13-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Brian Cain Acked-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- arch/hexagon/include/asm/cacheflush.h | 8 ++++++-- arch/hexagon/include/asm/pgtable.h | 9 +-------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/arch/hexagon/include/asm/cacheflush.h b/arch/hexagon/include/asm/cacheflush.h index 6eff0730e6efd3..dc3f500a5a01bd 100644 --- a/arch/hexagon/include/asm/cacheflush.h +++ b/arch/hexagon/include/asm/cacheflush.h @@ -58,12 +58,16 @@ extern void flush_cache_all_hexagon(void); * clean the cache when the PTE is set. * */ -static inline void update_mmu_cache(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep) +static inline void update_mmu_cache_range(struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long address, + pte_t *ptep, unsigned int nr) { /* generic_ptrace_pokedata doesn't wind up here, does it? */ } +#define update_mmu_cache(vma, addr, ptep) \ + update_mmu_cache_range(NULL, vma, addr, ptep, 1) + void copy_to_user_page(struct vm_area_struct *vma, struct page *page, unsigned long vaddr, void *dst, void *src, int len); #define copy_to_user_page copy_to_user_page diff --git a/arch/hexagon/include/asm/pgtable.h b/arch/hexagon/include/asm/pgtable.h index 59393613d0862e..dd05dd71b8ec31 100644 --- a/arch/hexagon/include/asm/pgtable.h +++ b/arch/hexagon/include/asm/pgtable.h @@ -338,6 +338,7 @@ static inline int pte_exec(pte_t pte) /* __swp_entry_to_pte - extract PTE from swap entry */ #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) +#define PFN_PTE_SHIFT PAGE_SHIFT /* pfn_pte - convert page number and protection value to page table entry */ #define pfn_pte(pfn, pgprot) __pte((pfn << PAGE_SHIFT) | pgprot_val(pgprot)) @@ -345,14 +346,6 @@ static inline int pte_exec(pte_t pte) #define pte_pfn(pte) (pte_val(pte) >> PAGE_SHIFT) #define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval)) -/* - * 
set_pte_at - update page table and do whatever magic may be - * necessary to make the underlying hardware/firmware take note. - * - * VM may require a virtual instruction to alert the MMU. - */ -#define set_pte_at(mm, addr, ptep, pte) set_pte(ptep, pte) - static inline unsigned long pmd_page_vaddr(pmd_t pmd) { return (unsigned long)__va(pmd_val(pmd) & PAGE_MASK); From 876397837d582ce72f977ac3e635ce74eebcecc9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:41 +0100 Subject: [PATCH 442/489] ia64: implement the new page table range API Add PFN_PTE_SHIFT, update_mmu_cache_range() and flush_dcache_folio(). Change the PG_arch_1 (aka PG_dcache_clean) flag from being per-page to per-folio, which makes arch_dma_mark_clean() and mark_clean() a little more exciting. [willy@infradead.org: fix folio_size() handling] Link: https://lkml.kernel.org/r/ZNPlOCe8F+nrzPxr@casper.infradead.org Link: https://lkml.kernel.org/r/20230802151406.3735276-14-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- arch/ia64/hp/common/sba_iommu.c | 30 ++++++++++++++++++---------- arch/ia64/include/asm/cacheflush.h | 14 +++++++++---- arch/ia64/include/asm/pgtable.h | 4 ++-- arch/ia64/mm/init.c | 32 +++++++++++++++++++++--------- 4 files changed, 54 insertions(+), 26 deletions(-) diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c index 8ad6946521d88d..c4d477e8bcd422 100644 --- a/arch/ia64/hp/common/sba_iommu.c +++ b/arch/ia64/hp/common/sba_iommu.c @@ -798,22 +798,30 @@ sba_io_pdir_entry(u64 *pdir_ptr, unsigned long vba) #endif #ifdef ENABLE_MARK_CLEAN -/** +/* * Since DMA is i-cache coherent, any (complete) pages that were written via * DMA can be marked as "clean" so that lazy_mmu_prot_update() doesn't have to * flush them when they get mapped into an executable vm-area. 
*/ -static void -mark_clean (void *addr, size_t size) +static void mark_clean(void *addr, size_t size) { - unsigned long pg_addr, end; - - pg_addr = PAGE_ALIGN((unsigned long) addr); - end = (unsigned long) addr + size; - while (pg_addr + PAGE_SIZE <= end) { - struct page *page = virt_to_page((void *)pg_addr); - set_bit(PG_arch_1, &page->flags); - pg_addr += PAGE_SIZE; + struct folio *folio = virt_to_folio(addr); + ssize_t left = size; + size_t offset = offset_in_folio(folio, addr); + + if (offset) { + left -= folio_size(folio) - offset; + if (left <= 0) + return; + folio = folio_next(folio); + } + + while (left >= folio_size(folio)) { + left -= folio_size(folio); + set_bit(PG_arch_1, &folio->flags); + if (!left) + break; + folio = folio_next(folio); } } #endif diff --git a/arch/ia64/include/asm/cacheflush.h b/arch/ia64/include/asm/cacheflush.h index 708c0fa5d975e4..eac493fa9e0d7d 100644 --- a/arch/ia64/include/asm/cacheflush.h +++ b/arch/ia64/include/asm/cacheflush.h @@ -13,10 +13,16 @@ #include #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 -#define flush_dcache_page(page) \ -do { \ - clear_bit(PG_arch_1, &(page)->flags); \ -} while (0) +static inline void flush_dcache_folio(struct folio *folio) +{ + clear_bit(PG_arch_1, &folio->flags); +} +#define flush_dcache_folio flush_dcache_folio + +static inline void flush_dcache_page(struct page *page) +{ + flush_dcache_folio(page_folio(page)); +} extern void flush_icache_range(unsigned long start, unsigned long end); #define flush_icache_range flush_icache_range diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h index 21c97e31a28aeb..4e5dd800ce1f5c 100644 --- a/arch/ia64/include/asm/pgtable.h +++ b/arch/ia64/include/asm/pgtable.h @@ -206,6 +206,7 @@ ia64_phys_addr_valid (unsigned long addr) #define RGN_MAP_SHIFT (PGDIR_SHIFT + PTRS_PER_PGD_SHIFT - 3) #define RGN_MAP_LIMIT ((1UL << RGN_MAP_SHIFT) - PAGE_SIZE) /* per region addr limit */ +#define PFN_PTE_SHIFT PAGE_SHIFT /* * Conversion functions: 
convert page frame number (pfn) and a protection value to a page * table entry (pte). @@ -303,8 +304,6 @@ static inline void set_pte(pte_t *ptep, pte_t pteval) *ptep = pteval; } -#define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval) - /* * Make page protection values cacheable, uncacheable, or write- * combining. Note that "protection" is really a misnomer here as the @@ -396,6 +395,7 @@ pte_same (pte_t a, pte_t b) return pte_val(a) == pte_val(b); } +#define update_mmu_cache_range(vmf, vma, address, ptep, nr) do { } while (0) #define update_mmu_cache(vma, address, ptep) do { } while (0) extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 7f5353e28516fc..05b0f2f0c07391 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -50,30 +50,44 @@ void __ia64_sync_icache_dcache (pte_t pte) { unsigned long addr; - struct page *page; + struct folio *folio; - page = pte_page(pte); - addr = (unsigned long) page_address(page); + folio = page_folio(pte_page(pte)); + addr = (unsigned long)folio_address(folio); - if (test_bit(PG_arch_1, &page->flags)) + if (test_bit(PG_arch_1, &folio->flags)) return; /* i-cache is already coherent with d-cache */ - flush_icache_range(addr, addr + page_size(page)); - set_bit(PG_arch_1, &page->flags); /* mark page as clean */ + flush_icache_range(addr, addr + folio_size(folio)); + set_bit(PG_arch_1, &folio->flags); /* mark page as clean */ } /* - * Since DMA is i-cache coherent, any (complete) pages that were written via + * Since DMA is i-cache coherent, any (complete) folios that were written via * DMA can be marked as "clean" so that lazy_mmu_prot_update() doesn't have to * flush them when they get mapped into an executable vm-area. 
*/ void arch_dma_mark_clean(phys_addr_t paddr, size_t size) { unsigned long pfn = PHYS_PFN(paddr); + struct folio *folio = page_folio(pfn_to_page(pfn)); + ssize_t left = size; + size_t offset = offset_in_folio(folio, paddr); - do { + if (offset) { + left -= folio_size(folio) - offset; + if (left <= 0) + return; + folio = folio_next(folio); + } + + while (left >= (ssize_t)folio_size(folio)) { + left -= folio_size(folio); - set_bit(PG_arch_1, &pfn_to_page(pfn)->flags); - } while (++pfn <= PHYS_PFN(paddr + size - 1)); + set_bit(PG_arch_1, &folio->flags); + if (!left) + break; + folio = folio_next(folio); + } } inline void From a6d01af08b2e40772cf97e700b699850f6862886 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:42 +0100 Subject: [PATCH 443/489] loongarch: implement the new page table range API Add update_mmu_cache_range() and change _PFN_SHIFT to PFN_PTE_SHIFT. It would probably be more efficient to implement __update_tlb() by flushing the entire folio instead of calling __update_tlb() N times, but I'll leave that for someone who understands the architecture better. 
Link: https://lkml.kernel.org/r/20230802151406.3735276-15-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Huacai Chen Cc: WANG Xuerui Signed-off-by: Andrew Morton --- arch/loongarch/include/asm/cacheflush.h | 1 + arch/loongarch/include/asm/pgtable-bits.h | 4 +-- arch/loongarch/include/asm/pgtable.h | 33 ++++++++++++----------- arch/loongarch/mm/pgtable.c | 2 +- arch/loongarch/mm/tlb.c | 2 +- 5 files changed, 23 insertions(+), 19 deletions(-) diff --git a/arch/loongarch/include/asm/cacheflush.h b/arch/loongarch/include/asm/cacheflush.h index 0681788eb474aa..88a44da50a3b17 100644 --- a/arch/loongarch/include/asm/cacheflush.h +++ b/arch/loongarch/include/asm/cacheflush.h @@ -47,6 +47,7 @@ void local_flush_icache_range(unsigned long start, unsigned long end); #define flush_cache_vmap(start, end) do { } while (0) #define flush_cache_vunmap(start, end) do { } while (0) #define flush_icache_page(vma, page) do { } while (0) +#define flush_icache_pages(vma, page) do { } while (0) #define flush_icache_user_page(vma, page, addr, len) do { } while (0) #define flush_dcache_page(page) do { } while (0) #define flush_dcache_mmap_lock(mapping) do { } while (0) diff --git a/arch/loongarch/include/asm/pgtable-bits.h b/arch/loongarch/include/asm/pgtable-bits.h index de46a6b1e9f11c..35348d4c4209ad 100644 --- a/arch/loongarch/include/asm/pgtable-bits.h +++ b/arch/loongarch/include/asm/pgtable-bits.h @@ -50,12 +50,12 @@ #define _PAGE_NO_EXEC (_ULCAST_(1) << _PAGE_NO_EXEC_SHIFT) #define _PAGE_RPLV (_ULCAST_(1) << _PAGE_RPLV_SHIFT) #define _CACHE_MASK (_ULCAST_(3) << _CACHE_SHIFT) -#define _PFN_SHIFT (PAGE_SHIFT - 12 + _PAGE_PFN_SHIFT) +#define PFN_PTE_SHIFT (PAGE_SHIFT - 12 + _PAGE_PFN_SHIFT) #define _PAGE_USER (PLV_USER << _PAGE_PLV_SHIFT) #define _PAGE_KERN (PLV_KERN << _PAGE_PLV_SHIFT) -#define _PFN_MASK (~((_ULCAST_(1) << (_PFN_SHIFT)) - 1) & \ +#define _PFN_MASK (~((_ULCAST_(1) << (PFN_PTE_SHIFT)) - 1) & \ ((_ULCAST_(1) << 
(_PAGE_PFN_END_SHIFT)) - 1)) /* diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index 38afeb7dd58b6b..e7cf25e452c0ff 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -237,9 +237,9 @@ extern pmd_t mk_pmd(struct page *page, pgprot_t prot); extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd); #define pte_page(x) pfn_to_page(pte_pfn(x)) -#define pte_pfn(x) ((unsigned long)(((x).pte & _PFN_MASK) >> _PFN_SHIFT)) -#define pfn_pte(pfn, prot) __pte(((pfn) << _PFN_SHIFT) | pgprot_val(prot)) -#define pfn_pmd(pfn, prot) __pmd(((pfn) << _PFN_SHIFT) | pgprot_val(prot)) +#define pte_pfn(x) ((unsigned long)(((x).pte & _PFN_MASK) >> PFN_PTE_SHIFT)) +#define pfn_pte(pfn, prot) __pte(((pfn) << PFN_PTE_SHIFT) | pgprot_val(prot)) +#define pfn_pmd(pfn, prot) __pmd(((pfn) << PFN_PTE_SHIFT) | pgprot_val(prot)) /* * Initialize a new pgd / pud / pmd table with invalid pointers. @@ -334,19 +334,13 @@ static inline void set_pte(pte_t *ptep, pte_t pteval) } } -static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval) -{ - set_pte(ptep, pteval); -} - static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { /* Preserve global status for the pair */ if (pte_val(*ptep_buddy(ptep)) & _PAGE_GLOBAL) - set_pte_at(mm, addr, ptep, __pte(_PAGE_GLOBAL)); + set_pte(ptep, __pte(_PAGE_GLOBAL)); else - set_pte_at(mm, addr, ptep, __pte(0)); + set_pte(ptep, __pte(0)); } #define PGD_T_LOG2 (__builtin_ffs(sizeof(pgd_t)) - 1) @@ -445,11 +439,20 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) extern void __update_tlb(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); -static inline void update_mmu_cache(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep) +static inline void update_mmu_cache_range(struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long address, + pte_t 
*ptep, unsigned int nr) { - __update_tlb(vma, address, ptep); + for (;;) { + __update_tlb(vma, address, ptep); + if (--nr == 0) + break; + address += PAGE_SIZE; + ptep++; + } } +#define update_mmu_cache(vma, addr, ptep) \ + update_mmu_cache_range(NULL, vma, addr, ptep, 1) #define __HAVE_ARCH_UPDATE_MMU_TLB #define update_mmu_tlb update_mmu_cache @@ -462,7 +465,7 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, static inline unsigned long pmd_pfn(pmd_t pmd) { - return (pmd_val(pmd) & _PFN_MASK) >> _PFN_SHIFT; + return (pmd_val(pmd) & _PFN_MASK) >> PFN_PTE_SHIFT; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/arch/loongarch/mm/pgtable.c b/arch/loongarch/mm/pgtable.c index 5bd102b51f7c81..b14343e211b63f 100644 --- a/arch/loongarch/mm/pgtable.c +++ b/arch/loongarch/mm/pgtable.c @@ -108,7 +108,7 @@ pmd_t mk_pmd(struct page *page, pgprot_t prot) { pmd_t pmd; - pmd_val(pmd) = (page_to_pfn(page) << _PFN_SHIFT) | pgprot_val(prot); + pmd_val(pmd) = (page_to_pfn(page) << PFN_PTE_SHIFT) | pgprot_val(prot); return pmd; } diff --git a/arch/loongarch/mm/tlb.c b/arch/loongarch/mm/tlb.c index 00bb563e3c8947..eb8572e201ea07 100644 --- a/arch/loongarch/mm/tlb.c +++ b/arch/loongarch/mm/tlb.c @@ -252,7 +252,7 @@ static void output_pgtable_bits_defines(void) pr_define("_PAGE_WRITE_SHIFT %d\n", _PAGE_WRITE_SHIFT); pr_define("_PAGE_NO_READ_SHIFT %d\n", _PAGE_NO_READ_SHIFT); pr_define("_PAGE_NO_EXEC_SHIFT %d\n", _PAGE_NO_EXEC_SHIFT); - pr_define("_PFN_SHIFT %d\n", _PFN_SHIFT); + pr_define("PFN_PTE_SHIFT %d\n", PFN_PTE_SHIFT); pr_debug("\n"); } From 5553b15a4bbba8039e1f31b63642048286f540dc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:43 +0100 Subject: [PATCH 444/489] m68k: implement the new page table range API Add PFN_PTE_SHIFT, update_mmu_cache_range(), flush_icache_pages() and flush_dcache_folio(). 
Link: https://lkml.kernel.org/r/20230802151406.3735276-16-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Geert Uytterhoeven Acked-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- arch/m68k/include/asm/cacheflush_mm.h | 27 ++++++++++++++++-------- arch/m68k/include/asm/mcf_pgtable.h | 1 + arch/m68k/include/asm/motorola_pgtable.h | 1 + arch/m68k/include/asm/pgtable_mm.h | 10 +++++---- arch/m68k/include/asm/sun3_pgtable.h | 1 + arch/m68k/mm/motorola.c | 2 +- 6 files changed, 28 insertions(+), 14 deletions(-) diff --git a/arch/m68k/include/asm/cacheflush_mm.h b/arch/m68k/include/asm/cacheflush_mm.h index 1ac55e7b47f01c..88eb85e81ef681 100644 --- a/arch/m68k/include/asm/cacheflush_mm.h +++ b/arch/m68k/include/asm/cacheflush_mm.h @@ -220,24 +220,29 @@ static inline void flush_cache_page(struct vm_area_struct *vma, unsigned long vm /* Push the page at kernel virtual address and clear the icache */ /* RZ: use cpush %bc instead of cpush %dc, cinv %ic */ -static inline void __flush_page_to_ram(void *vaddr) +static inline void __flush_pages_to_ram(void *vaddr, unsigned int nr) { if (CPU_IS_COLDFIRE) { unsigned long addr, start, end; addr = ((unsigned long) vaddr) & ~(PAGE_SIZE - 1); start = addr & ICACHE_SET_MASK; - end = (addr + PAGE_SIZE - 1) & ICACHE_SET_MASK; + end = (addr + nr * PAGE_SIZE - 1) & ICACHE_SET_MASK; if (start > end) { flush_cf_bcache(0, end); end = ICACHE_MAX_ADDR; } flush_cf_bcache(start, end); } else if (CPU_IS_040_OR_060) { - __asm__ __volatile__("nop\n\t" - ".chip 68040\n\t" - "cpushp %%bc,(%0)\n\t" - ".chip 68k" - : : "a" (__pa(vaddr))); + unsigned long paddr = __pa(vaddr); + + do { + __asm__ __volatile__("nop\n\t" + ".chip 68040\n\t" + "cpushp %%bc,(%0)\n\t" + ".chip 68k" + : : "a" (paddr)); + paddr += PAGE_SIZE; + } while (--nr); } else { unsigned long _tmp; __asm__ __volatile__("movec %%cacr,%0\n\t" @@ -249,10 +254,14 @@ static inline void __flush_page_to_ram(void *vaddr) } #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 
-#define flush_dcache_page(page) __flush_page_to_ram(page_address(page)) +#define flush_dcache_page(page) __flush_pages_to_ram(page_address(page), 1) +#define flush_dcache_folio(folio) \ + __flush_pages_to_ram(folio_address(folio), folio_nr_pages(folio)) #define flush_dcache_mmap_lock(mapping) do { } while (0) #define flush_dcache_mmap_unlock(mapping) do { } while (0) -#define flush_icache_page(vma, page) __flush_page_to_ram(page_address(page)) +#define flush_icache_pages(vma, page, nr) \ + __flush_pages_to_ram(page_address(page), nr) +#define flush_icache_page(vma, page) flush_icache_pages(vma, page, 1) extern void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len); diff --git a/arch/m68k/include/asm/mcf_pgtable.h b/arch/m68k/include/asm/mcf_pgtable.h index 43e8da8465f9e4..772b7e7b0654d6 100644 --- a/arch/m68k/include/asm/mcf_pgtable.h +++ b/arch/m68k/include/asm/mcf_pgtable.h @@ -291,6 +291,7 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) return pte; } +#define PFN_PTE_SHIFT PAGE_SHIFT #define pmd_pfn(pmd) (pmd_val(pmd) >> PAGE_SHIFT) #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) diff --git a/arch/m68k/include/asm/motorola_pgtable.h b/arch/m68k/include/asm/motorola_pgtable.h index ec0dc19ab8343d..38d5e5edc3e13c 100644 --- a/arch/m68k/include/asm/motorola_pgtable.h +++ b/arch/m68k/include/asm/motorola_pgtable.h @@ -112,6 +112,7 @@ static inline void pud_set(pud_t *pudp, pmd_t *pmdp) #define pte_present(pte) (pte_val(pte) & (_PAGE_PRESENT | _PAGE_PROTNONE)) #define pte_clear(mm,addr,ptep) ({ pte_val(*(ptep)) = 0; }) +#define PFN_PTE_SHIFT PAGE_SHIFT #define pte_page(pte) virt_to_page(__va(pte_val(pte))) #define pte_pfn(pte) (pte_val(pte) >> PAGE_SHIFT) #define pfn_pte(pfn, prot) __pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) diff --git a/arch/m68k/include/asm/pgtable_mm.h b/arch/m68k/include/asm/pgtable_mm.h index b93c41fe206786..dbdf1c2b2f66bc 100644 --- 
a/arch/m68k/include/asm/pgtable_mm.h +++ b/arch/m68k/include/asm/pgtable_mm.h @@ -31,8 +31,6 @@ do{ \ *(pteptr) = (pteval); \ } while(0) -#define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval) - /* PMD_SHIFT determines the size of the area a second-level page table can map */ #if CONFIG_PGTABLE_LEVELS == 3 @@ -138,11 +136,15 @@ extern void kernel_set_cachemode(void *addr, unsigned long size, int cmode); * tables contain all the necessary information. The Sun3 does, but * they are updated on demand. */ -static inline void update_mmu_cache(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep) +static inline void update_mmu_cache_range(struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long address, + pte_t *ptep, unsigned int nr) { } +#define update_mmu_cache(vma, addr, ptep) \ + update_mmu_cache_range(NULL, vma, addr, ptep, 1) + #endif /* !__ASSEMBLY__ */ /* MMU-specific headers */ diff --git a/arch/m68k/include/asm/sun3_pgtable.h b/arch/m68k/include/asm/sun3_pgtable.h index 9e7bf8a5f8f882..0cc39a88ce5580 100644 --- a/arch/m68k/include/asm/sun3_pgtable.h +++ b/arch/m68k/include/asm/sun3_pgtable.h @@ -105,6 +105,7 @@ static inline void pte_clear (struct mm_struct *mm, unsigned long addr, pte_t *p pte_val (*ptep) = 0; } +#define PFN_PTE_SHIFT 0 #define pte_pfn(pte) (pte_val(pte) & SUN3_PAGE_PGNUM_MASK) #define pfn_pte(pfn, pgprot) \ ({ pte_t __pte; pte_val(__pte) = pfn | pgprot_val(pgprot); __pte; }) diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c index 594575a0780c12..c1761d309fc612 100644 --- a/arch/m68k/mm/motorola.c +++ b/arch/m68k/mm/motorola.c @@ -81,7 +81,7 @@ static inline void cache_page(void *vaddr) void mmu_page_ctor(void *page) { - __flush_page_to_ram(page); + __flush_pages_to_ram(page, 1); flush_tlb_kernel_page(page); nocache_page(page); } From 27a8b944fe91503ba15241d9a8504a34af0009fa Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:44 +0100 Subject: [PATCH 445/489] microblaze: 
implement the new page table range API Rename PFN_SHIFT_OFFSET to PFN_PTE_SHIFT. Change the calling convention for set_pte() to be the same as other architectures. Add update_mmu_cache_range(), flush_icache_pages() and flush_dcache_folio(). [arnd@arndb.de: mark flush_dcache_folio() inline] Link: https://lkml.kernel.org/r/20230810141947.1236730-9-arnd@kernel.org Link: https://lkml.kernel.org/r/20230802151406.3735276-17-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Arnd Bergmann Acked-by: Mike Rapoport (IBM) Cc: Michal Simek Signed-off-by: Andrew Morton --- arch/microblaze/include/asm/cacheflush.h | 8 ++++++++ arch/microblaze/include/asm/pgtable.h | 15 ++++----------- arch/microblaze/include/asm/tlbflush.h | 4 +++- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/arch/microblaze/include/asm/cacheflush.h b/arch/microblaze/include/asm/cacheflush.h index 39f8fb6768d8b4..ffa2cf3893e4b5 100644 --- a/arch/microblaze/include/asm/cacheflush.h +++ b/arch/microblaze/include/asm/cacheflush.h @@ -74,6 +74,14 @@ do { \ flush_dcache_range((unsigned) (addr), (unsigned) (addr) + PAGE_SIZE); \ } while (0); +static inline void flush_dcache_folio(struct folio *folio) +{ + unsigned long addr = folio_pfn(folio) << PAGE_SHIFT; + + flush_dcache_range(addr, addr + folio_size(folio)); +} +#define flush_dcache_folio flush_dcache_folio + #define flush_cache_page(vma, vmaddr, pfn) \ flush_dcache_range(pfn << PAGE_SHIFT, (pfn << PAGE_SHIFT) + PAGE_SIZE); diff --git a/arch/microblaze/include/asm/pgtable.h b/arch/microblaze/include/asm/pgtable.h index d1b8272abcd9bf..6f9b99082518e1 100644 --- a/arch/microblaze/include/asm/pgtable.h +++ b/arch/microblaze/include/asm/pgtable.h @@ -230,12 +230,12 @@ extern unsigned long empty_zero_page[1024]; #define pte_page(x) (mem_map + (unsigned long) \ ((pte_val(x) - memory_start) >> PAGE_SHIFT)) -#define PFN_SHIFT_OFFSET (PAGE_SHIFT) +#define PFN_PTE_SHIFT PAGE_SHIFT -#define pte_pfn(x) (pte_val(x) >> PFN_SHIFT_OFFSET)
+#define pte_pfn(x) (pte_val(x) >> PFN_PTE_SHIFT) #define pfn_pte(pfn, prot) \ - __pte(((pte_basic_t)(pfn) << PFN_SHIFT_OFFSET) | pgprot_val(prot)) + __pte(((pte_basic_t)(pfn) << PFN_PTE_SHIFT) | pgprot_val(prot)) #ifndef __ASSEMBLY__ /* @@ -330,14 +330,7 @@ static inline unsigned long pte_update(pte_t *p, unsigned long clr, /* * set_pte stores a linux PTE into the linux page table. */ -static inline void set_pte(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - *ptep = pte; -} - -static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) +static inline void set_pte(pte_t *ptep, pte_t pte) { *ptep = pte; } diff --git a/arch/microblaze/include/asm/tlbflush.h b/arch/microblaze/include/asm/tlbflush.h index 2038168ed1289a..a31ae9d44083c5 100644 --- a/arch/microblaze/include/asm/tlbflush.h +++ b/arch/microblaze/include/asm/tlbflush.h @@ -33,7 +33,9 @@ static inline void local_flush_tlb_range(struct vm_area_struct *vma, #define flush_tlb_kernel_range(start, end) do { } while (0) -#define update_mmu_cache(vma, addr, ptep) do { } while (0) +#define update_mmu_cache_range(vmf, vma, addr, ptep, nr) do { } while (0) +#define update_mmu_cache(vma, addr, pte) \ + update_mmu_cache_range(NULL, vma, addr, ptep, 1) #define flush_tlb_all local_flush_tlb_all #define flush_tlb_mm local_flush_tlb_mm From 15fa3e8e32692a423209a1808ef098f7ec3174f5 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:45 +0100 Subject: [PATCH 446/489] mips: implement the new page table range API Rename _PFN_SHIFT to PFN_PTE_SHIFT. Convert a few places to call set_pte() instead of set_pte_at(). Add set_ptes(), update_mmu_cache_range(), flush_icache_pages() and flush_dcache_folio(). Change the PG_arch_1 (aka PG_dcache_dirty) flag from being per-page to per-folio. 
Link: https://lkml.kernel.org/r/20230802151406.3735276-18-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- arch/mips/bcm47xx/prom.c | 2 +- arch/mips/include/asm/cacheflush.h | 32 +++++++++----- arch/mips/include/asm/pgtable-32.h | 10 ++--- arch/mips/include/asm/pgtable-64.h | 6 +-- arch/mips/include/asm/pgtable-bits.h | 6 +-- arch/mips/include/asm/pgtable.h | 63 ++++++++++++++++++---------- arch/mips/mm/c-r4k.c | 5 ++- arch/mips/mm/cache.c | 56 ++++++++++++------------- arch/mips/mm/init.c | 21 ++++++---- arch/mips/mm/pgtable-32.c | 2 +- arch/mips/mm/pgtable-64.c | 2 +- arch/mips/mm/tlbex.c | 2 +- 12 files changed, 121 insertions(+), 86 deletions(-) diff --git a/arch/mips/bcm47xx/prom.c b/arch/mips/bcm47xx/prom.c index a9bea411d92827..99a1ba5394e02d 100644 --- a/arch/mips/bcm47xx/prom.c +++ b/arch/mips/bcm47xx/prom.c @@ -116,7 +116,7 @@ void __init prom_init(void) #if defined(CONFIG_BCM47XX_BCMA) && defined(CONFIG_HIGHMEM) #define EXTVBASE 0xc0000000 -#define ENTRYLO(x) ((pte_val(pfn_pte((x) >> _PFN_SHIFT, PAGE_KERNEL_UNCACHED)) >> 6) | 1) +#define ENTRYLO(x) ((pte_val(pfn_pte((x) >> PFN_PTE_SHIFT, PAGE_KERNEL_UNCACHED)) >> 6) | 1) #include diff --git a/arch/mips/include/asm/cacheflush.h b/arch/mips/include/asm/cacheflush.h index d8d3f80f9fc083..0f389bc7cb903a 100644 --- a/arch/mips/include/asm/cacheflush.h +++ b/arch/mips/include/asm/cacheflush.h @@ -36,12 +36,12 @@ */ #define PG_dcache_dirty PG_arch_1 -#define Page_dcache_dirty(page) \ - test_bit(PG_dcache_dirty, &(page)->flags) -#define SetPageDcacheDirty(page) \ - set_bit(PG_dcache_dirty, &(page)->flags) -#define ClearPageDcacheDirty(page) \ - clear_bit(PG_dcache_dirty, &(page)->flags) +#define folio_test_dcache_dirty(folio) \ + test_bit(PG_dcache_dirty, &(folio)->flags) +#define folio_set_dcache_dirty(folio) \ + set_bit(PG_dcache_dirty, &(folio)->flags) +#define folio_clear_dcache_dirty(folio) \ + 
clear_bit(PG_dcache_dirty, &(folio)->flags) extern void (*flush_cache_all)(void); extern void (*__flush_cache_all)(void); @@ -50,15 +50,24 @@ extern void (*flush_cache_mm)(struct mm_struct *mm); extern void (*flush_cache_range)(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void (*flush_cache_page)(struct vm_area_struct *vma, unsigned long page, unsigned long pfn); -extern void __flush_dcache_page(struct page *page); +extern void __flush_dcache_pages(struct page *page, unsigned int nr); #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 +static inline void flush_dcache_folio(struct folio *folio) +{ + if (cpu_has_dc_aliases) + __flush_dcache_pages(&folio->page, folio_nr_pages(folio)); + else if (!cpu_has_ic_fills_f_dc) + folio_set_dcache_dirty(folio); +} +#define flush_dcache_folio flush_dcache_folio + static inline void flush_dcache_page(struct page *page) { if (cpu_has_dc_aliases) - __flush_dcache_page(page); + __flush_dcache_pages(page, 1); else if (!cpu_has_ic_fills_f_dc) - SetPageDcacheDirty(page); + folio_set_dcache_dirty(page_folio(page)); } #define flush_dcache_mmap_lock(mapping) do { } while (0) @@ -73,10 +82,11 @@ static inline void flush_anon_page(struct vm_area_struct *vma, __flush_anon_page(page, vmaddr); } -static inline void flush_icache_page(struct vm_area_struct *vma, - struct page *page) +static inline void flush_icache_pages(struct vm_area_struct *vma, + struct page *page, unsigned int nr) { } +#define flush_icache_page(vma, page) flush_icache_pages(vma, page, 1) extern void (*flush_icache_range)(unsigned long start, unsigned long end); extern void (*local_flush_icache_range)(unsigned long start, unsigned long end); diff --git a/arch/mips/include/asm/pgtable-32.h b/arch/mips/include/asm/pgtable-32.h index ba0016709a1a12..0e196650f4f488 100644 --- a/arch/mips/include/asm/pgtable-32.h +++ b/arch/mips/include/asm/pgtable-32.h @@ -153,7 +153,7 @@ static inline void pmd_clear(pmd_t *pmdp) #if defined(CONFIG_XPA) #define 
MAX_POSSIBLE_PHYSMEM_BITS 40 -#define pte_pfn(x) (((unsigned long)((x).pte_high >> _PFN_SHIFT)) | (unsigned long)((x).pte_low << _PAGE_PRESENT_SHIFT)) +#define pte_pfn(x) (((unsigned long)((x).pte_high >> PFN_PTE_SHIFT)) | (unsigned long)((x).pte_low << _PAGE_PRESENT_SHIFT)) static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot) { @@ -161,7 +161,7 @@ pfn_pte(unsigned long pfn, pgprot_t prot) pte.pte_low = (pfn >> _PAGE_PRESENT_SHIFT) | (pgprot_val(prot) & ~_PFNX_MASK); - pte.pte_high = (pfn << _PFN_SHIFT) | + pte.pte_high = (pfn << PFN_PTE_SHIFT) | (pgprot_val(prot) & ~_PFN_MASK); return pte; } @@ -184,9 +184,9 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t prot) #else #define MAX_POSSIBLE_PHYSMEM_BITS 32 -#define pte_pfn(x) ((unsigned long)((x).pte >> _PFN_SHIFT)) -#define pfn_pte(pfn, prot) __pte(((unsigned long long)(pfn) << _PFN_SHIFT) | pgprot_val(prot)) -#define pfn_pmd(pfn, prot) __pmd(((unsigned long long)(pfn) << _PFN_SHIFT) | pgprot_val(prot)) +#define pte_pfn(x) ((unsigned long)((x).pte >> PFN_PTE_SHIFT)) +#define pfn_pte(pfn, prot) __pte(((unsigned long long)(pfn) << PFN_PTE_SHIFT) | pgprot_val(prot)) +#define pfn_pmd(pfn, prot) __pmd(((unsigned long long)(pfn) << PFN_PTE_SHIFT) | pgprot_val(prot)) #endif /* defined(CONFIG_PHYS_ADDR_T_64BIT) && defined(CONFIG_CPU_MIPS32) */ #define pte_page(x) pfn_to_page(pte_pfn(x)) diff --git a/arch/mips/include/asm/pgtable-64.h b/arch/mips/include/asm/pgtable-64.h index 98e24e3e7f2bab..20ca48c1b6063e 100644 --- a/arch/mips/include/asm/pgtable-64.h +++ b/arch/mips/include/asm/pgtable-64.h @@ -298,9 +298,9 @@ static inline void pud_clear(pud_t *pudp) #define pte_page(x) pfn_to_page(pte_pfn(x)) -#define pte_pfn(x) ((unsigned long)((x).pte >> _PFN_SHIFT)) -#define pfn_pte(pfn, prot) __pte(((pfn) << _PFN_SHIFT) | pgprot_val(prot)) -#define pfn_pmd(pfn, prot) __pmd(((pfn) << _PFN_SHIFT) | pgprot_val(prot)) +#define pte_pfn(x) ((unsigned long)((x).pte >> PFN_PTE_SHIFT)) +#define pfn_pte(pfn, prot) 
__pte(((pfn) << PFN_PTE_SHIFT) | pgprot_val(prot)) +#define pfn_pmd(pfn, prot) __pmd(((pfn) << PFN_PTE_SHIFT) | pgprot_val(prot)) #ifndef __PAGETABLE_PMD_FOLDED static inline pmd_t *pud_pgtable(pud_t pud) diff --git a/arch/mips/include/asm/pgtable-bits.h b/arch/mips/include/asm/pgtable-bits.h index 1c576679aa8768..421e78c30253cb 100644 --- a/arch/mips/include/asm/pgtable-bits.h +++ b/arch/mips/include/asm/pgtable-bits.h @@ -182,10 +182,10 @@ enum pgtable_bits { #if defined(CONFIG_CPU_R3K_TLB) # define _CACHE_UNCACHED (1 << _CACHE_UNCACHED_SHIFT) # define _CACHE_MASK _CACHE_UNCACHED -# define _PFN_SHIFT PAGE_SHIFT +# define PFN_PTE_SHIFT PAGE_SHIFT #else # define _CACHE_MASK (7 << _CACHE_SHIFT) -# define _PFN_SHIFT (PAGE_SHIFT - 12 + _CACHE_SHIFT + 3) +# define PFN_PTE_SHIFT (PAGE_SHIFT - 12 + _CACHE_SHIFT + 3) #endif #ifndef _PAGE_NO_EXEC @@ -195,7 +195,7 @@ enum pgtable_bits { #define _PAGE_SILENT_READ _PAGE_VALID #define _PAGE_SILENT_WRITE _PAGE_DIRTY -#define _PFN_MASK (~((1 << (_PFN_SHIFT)) - 1)) +#define _PFN_MASK (~((1 << (PFN_PTE_SHIFT)) - 1)) /* * The final layouts of the PTE bits are: diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index 574fa14ac8b257..cbb93a834f52d9 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -66,7 +66,7 @@ extern void paging_init(void); static inline unsigned long pmd_pfn(pmd_t pmd) { - return pmd_val(pmd) >> _PFN_SHIFT; + return pmd_val(pmd) >> PFN_PTE_SHIFT; } #ifndef CONFIG_MIPS_HUGE_TLB_SUPPORT @@ -105,9 +105,6 @@ do { \ } \ } while(0) -static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval); - #if defined(CONFIG_PHYS_ADDR_T_64BIT) && defined(CONFIG_CPU_MIPS32) #ifdef CONFIG_XPA @@ -157,7 +154,7 @@ static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *pt null.pte_low = null.pte_high = _PAGE_GLOBAL; } - set_pte_at(mm, addr, ptep, null); + set_pte(ptep, null); htw_start(); } #else @@ -196,28 
+193,41 @@ static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *pt #if !defined(CONFIG_CPU_R3K_TLB) /* Preserve global status for the pair */ if (pte_val(*ptep_buddy(ptep)) & _PAGE_GLOBAL) - set_pte_at(mm, addr, ptep, __pte(_PAGE_GLOBAL)); + set_pte(ptep, __pte(_PAGE_GLOBAL)); else #endif - set_pte_at(mm, addr, ptep, __pte(0)); + set_pte(ptep, __pte(0)); htw_start(); } #endif -static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval) +static inline void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr) { + unsigned int i; + bool do_sync = false; - if (!pte_present(pteval)) - goto cache_sync_done; + for (i = 0; i < nr; i++) { + if (!pte_present(pte)) + continue; + if (pte_present(ptep[i]) && + (pte_pfn(ptep[i]) == pte_pfn(pte))) + continue; + do_sync = true; + } - if (pte_present(*ptep) && (pte_pfn(*ptep) == pte_pfn(pteval))) - goto cache_sync_done; + if (do_sync) + __update_cache(addr, pte); - __update_cache(addr, pteval); -cache_sync_done: - set_pte(ptep, pteval); + for (;;) { + set_pte(ptep, pte); + if (--nr == 0) + break; + ptep++; + pte = __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT)); + } } +#define set_ptes set_ptes /* * (pmds are folded into puds so this doesn't get actually called, @@ -486,7 +496,7 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma, pte_t entry, int dirty) { if (!pte_same(*ptep, entry)) - set_pte_at(vma->vm_mm, address, ptep, entry); + set_pte(ptep, entry); /* * update_mmu_cache will unconditionally execute, handling both * the case that the PTE changed and the spurious fault case. 
@@ -568,12 +578,21 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) extern void __update_tlb(struct vm_area_struct *vma, unsigned long address, pte_t pte); -static inline void update_mmu_cache(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep) -{ - pte_t pte = *ptep; - __update_tlb(vma, address, pte); +static inline void update_mmu_cache_range(struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long address, + pte_t *ptep, unsigned int nr) +{ + for (;;) { + pte_t pte = *ptep; + __update_tlb(vma, address, pte); + if (--nr == 0) + break; + ptep++; + address += PAGE_SIZE; + } } +#define update_mmu_cache(vma, address, ptep) \ + update_mmu_cache_range(NULL, vma, address, ptep, 1) #define __HAVE_ARCH_UPDATE_MMU_TLB #define update_mmu_tlb update_mmu_cache diff --git a/arch/mips/mm/c-r4k.c b/arch/mips/mm/c-r4k.c index 4b6554b4892369..187d1c16361cfa 100644 --- a/arch/mips/mm/c-r4k.c +++ b/arch/mips/mm/c-r4k.c @@ -568,13 +568,14 @@ static inline void local_r4k_flush_cache_page(void *args) if ((mm == current->active_mm) && (pte_val(*ptep) & _PAGE_VALID)) vaddr = NULL; else { + struct folio *folio = page_folio(page); /* * Use kmap_coherent or kmap_atomic to do flushes for * another ASID than the current one. 
*/ map_coherent = (cpu_has_dc_aliases && - page_mapcount(page) && - !Page_dcache_dirty(page)); + folio_mapped(folio) && + !folio_test_dcache_dirty(folio)); if (map_coherent) vaddr = kmap_coherent(page, addr); else diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c index d21cf8c6cf6c55..02042100e26718 100644 --- a/arch/mips/mm/cache.c +++ b/arch/mips/mm/cache.c @@ -99,13 +99,15 @@ SYSCALL_DEFINE3(cacheflush, unsigned long, addr, unsigned long, bytes, return 0; } -void __flush_dcache_page(struct page *page) +void __flush_dcache_pages(struct page *page, unsigned int nr) { - struct address_space *mapping = page_mapping_file(page); + struct folio *folio = page_folio(page); + struct address_space *mapping = folio_flush_mapping(folio); unsigned long addr; + unsigned int i; if (mapping && !mapping_mapped(mapping)) { - SetPageDcacheDirty(page); + folio_set_dcache_dirty(folio); return; } @@ -114,25 +116,21 @@ void __flush_dcache_page(struct page *page) * case is for exec env/arg pages and those are %99 certainly going to * get faulted into the tlb (and thus flushed) anyways. 
*/ - if (PageHighMem(page)) - addr = (unsigned long)kmap_atomic(page); - else - addr = (unsigned long)page_address(page); - - flush_data_cache_page(addr); - - if (PageHighMem(page)) - kunmap_atomic((void *)addr); + for (i = 0; i < nr; i++) { + addr = (unsigned long)kmap_local_page(page + i); + flush_data_cache_page(addr); + kunmap_local((void *)addr); + } } - -EXPORT_SYMBOL(__flush_dcache_page); +EXPORT_SYMBOL(__flush_dcache_pages); void __flush_anon_page(struct page *page, unsigned long vmaddr) { unsigned long addr = (unsigned long) page_address(page); + struct folio *folio = page_folio(page); if (pages_do_alias(addr, vmaddr)) { - if (page_mapcount(page) && !Page_dcache_dirty(page)) { + if (folio_mapped(folio) && !folio_test_dcache_dirty(folio)) { void *kaddr; kaddr = kmap_coherent(page, vmaddr); @@ -147,27 +145,29 @@ EXPORT_SYMBOL(__flush_anon_page); void __update_cache(unsigned long address, pte_t pte) { - struct page *page; + struct folio *folio; unsigned long pfn, addr; int exec = !pte_no_exec(pte) && !cpu_has_ic_fills_f_dc; + unsigned int i; pfn = pte_pfn(pte); if (unlikely(!pfn_valid(pfn))) return; - page = pfn_to_page(pfn); - if (Page_dcache_dirty(page)) { - if (PageHighMem(page)) - addr = (unsigned long)kmap_atomic(page); - else - addr = (unsigned long)page_address(page); - - if (exec || pages_do_alias(addr, address & PAGE_MASK)) - flush_data_cache_page(addr); - if (PageHighMem(page)) - kunmap_atomic((void *)addr); + folio = page_folio(pfn_to_page(pfn)); + address &= PAGE_MASK; + address -= offset_in_folio(folio, pfn << PAGE_SHIFT); + + if (folio_test_dcache_dirty(folio)) { + for (i = 0; i < folio_nr_pages(folio); i++) { + addr = (unsigned long)kmap_local_folio(folio, i); - ClearPageDcacheDirty(page); + if (exec || pages_do_alias(addr, address)) + flush_data_cache_page(addr); + kunmap_local((void *)addr); + address += PAGE_SIZE; + } + folio_clear_dcache_dirty(folio); } } diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 
5a8002839550e1..5dcb525a899543 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -88,7 +88,7 @@ static void *__kmap_pgprot(struct page *page, unsigned long addr, pgprot_t prot) pte_t pte; int tlbidx; - BUG_ON(Page_dcache_dirty(page)); + BUG_ON(folio_test_dcache_dirty(page_folio(page))); preempt_disable(); pagefault_disable(); @@ -169,11 +169,12 @@ void kunmap_coherent(void) void copy_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { + struct folio *src = page_folio(from); void *vfrom, *vto; vto = kmap_atomic(to); if (cpu_has_dc_aliases && - page_mapcount(from) && !Page_dcache_dirty(from)) { + folio_mapped(src) && !folio_test_dcache_dirty(src)) { vfrom = kmap_coherent(from, vaddr); copy_page(vto, vfrom); kunmap_coherent(); @@ -194,15 +195,17 @@ void copy_to_user_page(struct vm_area_struct *vma, struct page *page, unsigned long vaddr, void *dst, const void *src, unsigned long len) { + struct folio *folio = page_folio(page); + if (cpu_has_dc_aliases && - page_mapcount(page) && !Page_dcache_dirty(page)) { + folio_mapped(folio) && !folio_test_dcache_dirty(folio)) { void *vto = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK); memcpy(vto, src, len); kunmap_coherent(); } else { memcpy(dst, src, len); if (cpu_has_dc_aliases) - SetPageDcacheDirty(page); + folio_set_dcache_dirty(folio); } if (vma->vm_flags & VM_EXEC) flush_cache_page(vma, vaddr, page_to_pfn(page)); @@ -212,15 +215,17 @@ void copy_from_user_page(struct vm_area_struct *vma, struct page *page, unsigned long vaddr, void *dst, const void *src, unsigned long len) { + struct folio *folio = page_folio(page); + if (cpu_has_dc_aliases && - page_mapcount(page) && !Page_dcache_dirty(page)) { + folio_mapped(folio) && !folio_test_dcache_dirty(folio)) { void *vfrom = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK); memcpy(dst, vfrom, len); kunmap_coherent(); } else { memcpy(dst, src, len); if (cpu_has_dc_aliases) - SetPageDcacheDirty(page); + 
folio_set_dcache_dirty(folio); } } EXPORT_SYMBOL_GPL(copy_from_user_page); @@ -448,10 +453,10 @@ static inline void __init mem_init_free_highmem(void) void __init mem_init(void) { /* - * When _PFN_SHIFT is greater than PAGE_SHIFT we won't have enough PTE + * When PFN_PTE_SHIFT is greater than PAGE_SHIFT we won't have enough PTE * bits to hold a full 32b physical address on MIPS32 systems. */ - BUILD_BUG_ON(IS_ENABLED(CONFIG_32BIT) && (_PFN_SHIFT > PAGE_SHIFT)); + BUILD_BUG_ON(IS_ENABLED(CONFIG_32BIT) && (PFN_PTE_SHIFT > PAGE_SHIFT)); #ifdef CONFIG_HIGHMEM max_mapnr = highend_pfn ? highend_pfn : max_low_pfn; diff --git a/arch/mips/mm/pgtable-32.c b/arch/mips/mm/pgtable-32.c index f57fb69472f847..84dd5136d53a6d 100644 --- a/arch/mips/mm/pgtable-32.c +++ b/arch/mips/mm/pgtable-32.c @@ -35,7 +35,7 @@ pmd_t mk_pmd(struct page *page, pgprot_t prot) { pmd_t pmd; - pmd_val(pmd) = (page_to_pfn(page) << _PFN_SHIFT) | pgprot_val(prot); + pmd_val(pmd) = (page_to_pfn(page) << PFN_PTE_SHIFT) | pgprot_val(prot); return pmd; } diff --git a/arch/mips/mm/pgtable-64.c b/arch/mips/mm/pgtable-64.c index b4386a0e2ef871..c76d21f7dffbb1 100644 --- a/arch/mips/mm/pgtable-64.c +++ b/arch/mips/mm/pgtable-64.c @@ -93,7 +93,7 @@ pmd_t mk_pmd(struct page *page, pgprot_t prot) { pmd_t pmd; - pmd_val(pmd) = (page_to_pfn(page) << _PFN_SHIFT) | pgprot_val(prot); + pmd_val(pmd) = (page_to_pfn(page) << PFN_PTE_SHIFT) | pgprot_val(prot); return pmd; } diff --git a/arch/mips/mm/tlbex.c b/arch/mips/mm/tlbex.c index 8d514a9082c623..b4e1c783e61779 100644 --- a/arch/mips/mm/tlbex.c +++ b/arch/mips/mm/tlbex.c @@ -253,7 +253,7 @@ static void output_pgtable_bits_defines(void) pr_define("_PAGE_GLOBAL_SHIFT %d\n", _PAGE_GLOBAL_SHIFT); pr_define("_PAGE_VALID_SHIFT %d\n", _PAGE_VALID_SHIFT); pr_define("_PAGE_DIRTY_SHIFT %d\n", _PAGE_DIRTY_SHIFT); - pr_define("_PFN_SHIFT %d\n", _PFN_SHIFT); + pr_define("PFN_PTE_SHIFT %d\n", PFN_PTE_SHIFT); pr_debug("\n"); } From 994209410919f2b84b7e4ab2e78785d9715308ad Mon Sep 17 
00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:46 +0100 Subject: [PATCH 447/489] nios2: implement the new page table range API Add set_ptes(), update_mmu_cache_range(), flush_icache_pages() and flush_dcache_folio(). Change the PG_arch_1 (aka PG_dcache_dirty) flag from being per-page to per-folio. Link: https://lkml.kernel.org/r/20230802151406.3735276-19-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Dinh Nguyen Signed-off-by: Andrew Morton --- arch/nios2/include/asm/cacheflush.h | 6 ++- arch/nios2/include/asm/pgtable.h | 28 ++++++---- arch/nios2/mm/cacheflush.c | 79 ++++++++++++++++------------- 3 files changed, 67 insertions(+), 46 deletions(-) diff --git a/arch/nios2/include/asm/cacheflush.h b/arch/nios2/include/asm/cacheflush.h index d0b71dd7128724..8624ca83cffe1b 100644 --- a/arch/nios2/include/asm/cacheflush.h +++ b/arch/nios2/include/asm/cacheflush.h @@ -29,9 +29,13 @@ extern void flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, unsigned long pfn); #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 void flush_dcache_page(struct page *page); +void flush_dcache_folio(struct folio *folio); +#define flush_dcache_folio flush_dcache_folio extern void flush_icache_range(unsigned long start, unsigned long end); -extern void flush_icache_page(struct vm_area_struct *vma, struct page *page); +void flush_icache_pages(struct vm_area_struct *vma, struct page *page, + unsigned int nr); +#define flush_icache_page(vma, page) flush_icache_pages(vma, page, 1); #define flush_cache_vmap(start, end) flush_dcache_range(start, end) #define flush_cache_vunmap(start, end) flush_dcache_range(start, end) diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index 0f5c2564e9f59e..be6bf3e0bd7a8f 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -178,14 +178,21 @@ static inline void set_pte(pte_t *ptep, pte_t pteval) *ptep = pteval; } -static 
inline void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval) +static inline void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr) { - unsigned long paddr = (unsigned long)page_to_virt(pte_page(pteval)); - - flush_dcache_range(paddr, paddr + PAGE_SIZE); - set_pte(ptep, pteval); + unsigned long paddr = (unsigned long)page_to_virt(pte_page(pte)); + + flush_dcache_range(paddr, paddr + nr * PAGE_SIZE); + for (;;) { + set_pte(ptep, pte); + if (--nr == 0) + break; + ptep++; + pte_val(pte) += 1; + } } +#define set_ptes set_ptes static inline int pmd_none(pmd_t pmd) { @@ -202,7 +209,7 @@ static inline void pte_clear(struct mm_struct *mm, pte_val(null) = (addr >> PAGE_SHIFT) & 0xf; - set_pte_at(mm, addr, ptep, null); + set_pte(ptep, null); } /* @@ -273,7 +280,10 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) extern void __init paging_init(void); extern void __init mmu_init(void); -extern void update_mmu_cache(struct vm_area_struct *vma, - unsigned long address, pte_t *pte); +void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, unsigned int nr); + +#define update_mmu_cache(vma, addr, ptep) \ + update_mmu_cache_range(NULL, vma, addr, ptep, 1) #endif /* _ASM_NIOS2_PGTABLE_H */ diff --git a/arch/nios2/mm/cacheflush.c b/arch/nios2/mm/cacheflush.c index 6aa9257c3ede42..28b805f465a8b7 100644 --- a/arch/nios2/mm/cacheflush.c +++ b/arch/nios2/mm/cacheflush.c @@ -71,26 +71,26 @@ static void __flush_icache(unsigned long start, unsigned long end) __asm__ __volatile(" flushp\n"); } -static void flush_aliases(struct address_space *mapping, struct page *page) +static void flush_aliases(struct address_space *mapping, struct folio *folio) { struct mm_struct *mm = current->active_mm; - struct vm_area_struct *mpnt; + struct vm_area_struct *vma; pgoff_t pgoff; + unsigned long nr = folio_nr_pages(folio); - pgoff = page->index; + pgoff = 
folio->index; flush_dcache_mmap_lock(mapping); - vma_interval_tree_foreach(mpnt, &mapping->i_mmap, pgoff, pgoff) { - unsigned long offset; + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff + nr - 1) { + unsigned long start; - if (mpnt->vm_mm != mm) + if (vma->vm_mm != mm) continue; - if (!(mpnt->vm_flags & VM_MAYSHARE)) + if (!(vma->vm_flags & VM_MAYSHARE)) continue; - offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT; - flush_cache_page(mpnt, mpnt->vm_start + offset, - page_to_pfn(page)); + start = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + flush_cache_range(vma, start, start + nr * PAGE_SIZE); } flush_dcache_mmap_unlock(mapping); } @@ -138,10 +138,11 @@ void flush_cache_range(struct vm_area_struct *vma, unsigned long start, __flush_icache(start, end); } -void flush_icache_page(struct vm_area_struct *vma, struct page *page) +void flush_icache_pages(struct vm_area_struct *vma, struct page *page, + unsigned int nr) { unsigned long start = (unsigned long) page_address(page); - unsigned long end = start + PAGE_SIZE; + unsigned long end = start + nr * PAGE_SIZE; __flush_dcache(start, end); __flush_icache(start, end); @@ -158,19 +159,19 @@ void flush_cache_page(struct vm_area_struct *vma, unsigned long vmaddr, __flush_icache(start, end); } -void __flush_dcache_page(struct address_space *mapping, struct page *page) +static void __flush_dcache_folio(struct folio *folio) { /* * Writeback any data associated with the kernel mapping of this * page. This ensures that data in the physical page is mutually * coherent with the kernels mapping. 
*/ - unsigned long start = (unsigned long)page_address(page); + unsigned long start = (unsigned long)folio_address(folio); - __flush_dcache(start, start + PAGE_SIZE); + __flush_dcache(start, start + folio_size(folio)); } -void flush_dcache_page(struct page *page) +void flush_dcache_folio(struct folio *folio) { struct address_space *mapping; @@ -178,32 +179,38 @@ void flush_dcache_page(struct page *page) * The zero page is never written to, so never has any dirty * cache lines, and therefore never needs to be flushed. */ - if (page == ZERO_PAGE(0)) + if (is_zero_pfn(folio_pfn(folio))) return; - mapping = page_mapping_file(page); + mapping = folio_flush_mapping(folio); /* Flush this page if there are aliases. */ if (mapping && !mapping_mapped(mapping)) { - clear_bit(PG_dcache_clean, &page->flags); + clear_bit(PG_dcache_clean, &folio->flags); } else { - __flush_dcache_page(mapping, page); + __flush_dcache_folio(folio); if (mapping) { - unsigned long start = (unsigned long)page_address(page); - flush_aliases(mapping, page); - flush_icache_range(start, start + PAGE_SIZE); + unsigned long start = (unsigned long)folio_address(folio); + flush_aliases(mapping, folio); + flush_icache_range(start, start + folio_size(folio)); } - set_bit(PG_dcache_clean, &page->flags); + set_bit(PG_dcache_clean, &folio->flags); } } +EXPORT_SYMBOL(flush_dcache_folio); + +void flush_dcache_page(struct page *page) +{ + flush_dcache_folio(page_folio(page)); +} EXPORT_SYMBOL(flush_dcache_page); -void update_mmu_cache(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep) +void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, unsigned int nr) { pte_t pte = *ptep; unsigned long pfn = pte_pfn(pte); - struct page *page; + struct folio *folio; struct address_space *mapping; reload_tlb_page(vma, address, pte); @@ -215,19 +222,19 @@ void update_mmu_cache(struct vm_area_struct *vma, * The zero page is never written to, so never has 
any dirty * cache lines, and therefore never needs to be flushed. */ - page = pfn_to_page(pfn); - if (page == ZERO_PAGE(0)) + if (is_zero_pfn(pfn)) return; - mapping = page_mapping_file(page); - if (!test_and_set_bit(PG_dcache_clean, &page->flags)) - __flush_dcache_page(mapping, page); + folio = page_folio(pfn_to_page(pfn)); + if (!test_and_set_bit(PG_dcache_clean, &folio->flags)) + __flush_dcache_folio(folio); - if(mapping) - { - flush_aliases(mapping, page); + mapping = folio_flush_mapping(folio); + if (mapping) { + flush_aliases(mapping, folio); if (vma->vm_flags & VM_EXEC) - flush_icache_page(vma, page); + flush_icache_pages(vma, &folio->page, + folio_nr_pages(folio)); } } From 063e409dcc37a5834fe94342b3cbcfe17d094eed Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:47 +0100 Subject: [PATCH 448/489] openrisc: implement the new page table range API Add PFN_PTE_SHIFT, update_mmu_cache_range() and flush_dcache_folio(). Change the PG_arch_1 (aka PG_dcache_dirty) flag from being per-page to per-folio. 
Link: https://lkml.kernel.org/r/20230802151406.3735276-20-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Jonas Bonn Cc: Stefan Kristiansson Cc: Stafford Horne Signed-off-by: Andrew Morton --- arch/openrisc/include/asm/cacheflush.h | 8 +++++++- arch/openrisc/include/asm/pgtable.h | 15 ++++++++++----- arch/openrisc/mm/cache.c | 12 ++++++++---- 3 files changed, 25 insertions(+), 10 deletions(-) diff --git a/arch/openrisc/include/asm/cacheflush.h b/arch/openrisc/include/asm/cacheflush.h index eeac40d4a8547f..984c331ff5f474 100644 --- a/arch/openrisc/include/asm/cacheflush.h +++ b/arch/openrisc/include/asm/cacheflush.h @@ -56,10 +56,16 @@ static inline void sync_icache_dcache(struct page *page) */ #define PG_dc_clean PG_arch_1 +static inline void flush_dcache_folio(struct folio *folio) +{ + clear_bit(PG_dc_clean, &folio->flags); +} +#define flush_dcache_folio flush_dcache_folio + #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 static inline void flush_dcache_page(struct page *page) { - clear_bit(PG_dc_clean, &page->flags); + flush_dcache_folio(page_folio(page)); } #define flush_icache_user_page(vma, page, addr, len) \ diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h index 3eb9b9555d0df6..7bdf1bb0d17743 100644 --- a/arch/openrisc/include/asm/pgtable.h +++ b/arch/openrisc/include/asm/pgtable.h @@ -46,7 +46,7 @@ extern void paging_init(void); * hook is made available. */ #define set_pte(pteptr, pteval) ((*(pteptr)) = (pteval)) -#define set_pte_at(mm, addr, ptep, pteval) set_pte(ptep, pteval) + /* * (pmds are folded into pgds so this doesn't get actually called, * but the define is needed for a generic inline function.) 
@@ -357,6 +357,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) #define __pmd_offset(address) \ (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) +#define PFN_PTE_SHIFT PAGE_SHIFT #define pte_pfn(x) ((unsigned long)(((x).pte)) >> PAGE_SHIFT) #define pfn_pte(pfn, prot) __pte((((pfn) << PAGE_SHIFT)) | pgprot_val(prot)) @@ -379,13 +380,17 @@ static inline void update_tlb(struct vm_area_struct *vma, extern void update_cache(struct vm_area_struct *vma, unsigned long address, pte_t *pte); -static inline void update_mmu_cache(struct vm_area_struct *vma, - unsigned long address, pte_t *pte) +static inline void update_mmu_cache_range(struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long address, + pte_t *ptep, unsigned int nr) { - update_tlb(vma, address, pte); - update_cache(vma, address, pte); + update_tlb(vma, address, ptep); + update_cache(vma, address, ptep); } +#define update_mmu_cache(vma, addr, ptep) \ + update_mmu_cache_range(NULL, vma, addr, ptep, 1) + /* __PHX__ FIXME, SWAP, this probably doesn't work */ /* diff --git a/arch/openrisc/mm/cache.c b/arch/openrisc/mm/cache.c index 534a52ec5e6678..eb43b73f385580 100644 --- a/arch/openrisc/mm/cache.c +++ b/arch/openrisc/mm/cache.c @@ -43,15 +43,19 @@ void update_cache(struct vm_area_struct *vma, unsigned long address, pte_t *pte) { unsigned long pfn = pte_val(*pte) >> PAGE_SHIFT; - struct page *page = pfn_to_page(pfn); - int dirty = !test_and_set_bit(PG_dc_clean, &page->flags); + struct folio *folio = page_folio(pfn_to_page(pfn)); + int dirty = !test_and_set_bit(PG_dc_clean, &folio->flags); /* * Since icaches do not snoop for updated data on OpenRISC, we * must write back and invalidate any dirty pages manually. We * can skip data pages, since they will not end up in icaches. 
*/ - if ((vma->vm_flags & VM_EXEC) && dirty) - sync_icache_dcache(page); + if ((vma->vm_flags & VM_EXEC) && dirty) { + unsigned int nr = folio_nr_pages(folio); + + while (nr--) + sync_icache_dcache(folio_page(folio, nr)); + } } From e70bbca607424dbb236cc641adba39c2cc0d65c5 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:48 +0100 Subject: [PATCH 449/489] parisc: implement the new page table range API Add set_ptes(), update_mmu_cache_range(), flush_dcache_folio() and flush_icache_pages(). Change the PG_arch_1 (aka PG_dcache_dirty) flag from being per-page to per-folio. Link: https://lkml.kernel.org/r/20230802151406.3735276-21-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Cc: "James E.J. Bottomley" Cc: Helge Deller Signed-off-by: Andrew Morton --- arch/parisc/include/asm/cacheflush.h | 14 ++-- arch/parisc/include/asm/pgtable.h | 37 +++++---- arch/parisc/kernel/cache.c | 107 ++++++++++++++++++--------- 3 files changed, 105 insertions(+), 53 deletions(-) diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index c8b6928cee1ee4..b77c3e0c37d381 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -43,8 +43,13 @@ void invalidate_kernel_vmap_range(void *vaddr, int size); #define flush_cache_vmap(start, end) flush_cache_all() #define flush_cache_vunmap(start, end) flush_cache_all() +void flush_dcache_folio(struct folio *folio); +#define flush_dcache_folio flush_dcache_folio #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 -void flush_dcache_page(struct page *page); +static inline void flush_dcache_page(struct page *page) +{ + flush_dcache_folio(page_folio(page)); +} #define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages) #define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) @@ -53,10 +58,9 @@ void flush_dcache_page(struct page *page); #define flush_dcache_mmap_unlock_irqrestore(mapping, 
flags) \ xa_unlock_irqrestore(&mapping->i_pages, flags) -#define flush_icache_page(vma,page) do { \ - flush_kernel_dcache_page_addr(page_address(page)); \ - flush_kernel_icache_page(page_address(page)); \ -} while (0) +void flush_icache_pages(struct vm_area_struct *vma, struct page *page, + unsigned int nr); +#define flush_icache_page(vma, page) flush_icache_pages(vma, page, 1) #define flush_icache_range(s,e) do { \ flush_kernel_dcache_range_asm(s,e); \ diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index 5656395c95eef1..ce38bb375b6029 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -73,15 +73,6 @@ extern void __update_cache(pte_t pte); mb(); \ } while(0) -#define set_pte_at(mm, addr, pteptr, pteval) \ - do { \ - if (pte_present(pteval) && \ - pte_user(pteval)) \ - __update_cache(pteval); \ - *(pteptr) = (pteval); \ - purge_tlb_entries(mm, addr); \ - } while (0) - #endif /* !__ASSEMBLY__ */ #define pte_ERROR(e) \ @@ -285,7 +276,7 @@ extern unsigned long *empty_zero_page; #define pte_none(x) (pte_val(x) == 0) #define pte_present(x) (pte_val(x) & _PAGE_PRESENT) #define pte_user(x) (pte_val(x) & _PAGE_USER) -#define pte_clear(mm, addr, xp) set_pte_at(mm, addr, xp, __pte(0)) +#define pte_clear(mm, addr, xp) set_pte(xp, __pte(0)) #define pmd_flag(x) (pmd_val(x) & PxD_FLAG_MASK) #define pmd_address(x) ((unsigned long)(pmd_val(x) &~ PxD_FLAG_MASK) << PxD_VALUE_SHIFT) @@ -391,11 +382,29 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) extern void paging_init (void); +static inline void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr) +{ + if (pte_present(pte) && pte_user(pte)) + __update_cache(pte); + for (;;) { + *ptep = pte; + purge_tlb_entries(mm, addr); + if (--nr == 0) + break; + ptep++; + pte_val(pte) += 1 << PFN_PTE_SHIFT; + addr += PAGE_SIZE; + } +} +#define set_ptes set_ptes + /* Used for deferring calls to flush_dcache_page() */ 
#define PG_dcache_dirty PG_arch_1 -#define update_mmu_cache(vms,addr,ptep) __update_cache(*ptep) +#define update_mmu_cache_range(vmf, vma, addr, ptep, nr) __update_cache(*ptep) +#define update_mmu_cache(vma, addr, ptep) __update_cache(*ptep) /* * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that @@ -450,7 +459,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned if (!pte_young(pte)) { return 0; } - set_pte_at(vma->vm_mm, addr, ptep, pte_mkold(pte)); + set_pte(ptep, pte_mkold(pte)); return 1; } @@ -460,14 +469,14 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t old_pte; old_pte = *ptep; - set_pte_at(mm, addr, ptep, __pte(0)); + set_pte(ptep, __pte(0)); return old_pte; } static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - set_pte_at(mm, addr, ptep, pte_wrprotect(*ptep)); + set_pte(ptep, pte_wrprotect(*ptep)); } #define pte_same(A,B) (pte_val(A) == pte_val(B)) diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index b55b35c89d6ac1..442109a489406f 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -94,11 +94,11 @@ static inline void flush_data_cache(void) /* Kernel virtual address of pfn. */ #define pfn_va(pfn) __va(PFN_PHYS(pfn)) -void -__update_cache(pte_t pte) +void __update_cache(pte_t pte) { unsigned long pfn = pte_pfn(pte); - struct page *page; + struct folio *folio; + unsigned int nr; /* We don't have pte special. As a result, we can be called with an invalid pfn and we don't need to flush the kernel dcache page. 
@@ -106,13 +106,17 @@ __update_cache(pte_t pte) if (!pfn_valid(pfn)) return; - page = pfn_to_page(pfn); - if (page_mapping_file(page) && - test_bit(PG_dcache_dirty, &page->flags)) { - flush_kernel_dcache_page_addr(pfn_va(pfn)); - clear_bit(PG_dcache_dirty, &page->flags); + folio = page_folio(pfn_to_page(pfn)); + pfn = folio_pfn(folio); + nr = folio_nr_pages(folio); + if (folio_flush_mapping(folio) && + test_bit(PG_dcache_dirty, &folio->flags)) { + while (nr--) + flush_kernel_dcache_page_addr(pfn_va(pfn + nr)); + clear_bit(PG_dcache_dirty, &folio->flags); } else if (parisc_requires_coherency()) - flush_kernel_dcache_page_addr(pfn_va(pfn)); + while (nr--) + flush_kernel_dcache_page_addr(pfn_va(pfn + nr)); } void @@ -366,6 +370,20 @@ static void flush_user_cache_page(struct vm_area_struct *vma, unsigned long vmad preempt_enable(); } +void flush_icache_pages(struct vm_area_struct *vma, struct page *page, + unsigned int nr) +{ + void *kaddr = page_address(page); + + for (;;) { + flush_kernel_dcache_page_addr(kaddr); + flush_kernel_icache_page(kaddr); + if (--nr == 0) + break; + kaddr += PAGE_SIZE; + } +} + static inline pte_t *get_ptep(struct mm_struct *mm, unsigned long addr) { pte_t *ptep = NULL; @@ -394,27 +412,30 @@ static inline bool pte_needs_flush(pte_t pte) == (_PAGE_PRESENT | _PAGE_ACCESSED); } -void flush_dcache_page(struct page *page) +void flush_dcache_folio(struct folio *folio) { - struct address_space *mapping = page_mapping_file(page); - struct vm_area_struct *mpnt; - unsigned long offset; + struct address_space *mapping = folio_flush_mapping(folio); + struct vm_area_struct *vma; unsigned long addr, old_addr = 0; + void *kaddr; unsigned long count = 0; - unsigned long flags; + unsigned long i, nr, flags; pgoff_t pgoff; if (mapping && !mapping_mapped(mapping)) { - set_bit(PG_dcache_dirty, &page->flags); + set_bit(PG_dcache_dirty, &folio->flags); return; } - flush_kernel_dcache_page_addr(page_address(page)); + nr = folio_nr_pages(folio); + kaddr = 
folio_address(folio); + for (i = 0; i < nr; i++) + flush_kernel_dcache_page_addr(kaddr + i * PAGE_SIZE); if (!mapping) return; - pgoff = page->index; + pgoff = folio->index; /* * We have carefully arranged in arch_get_unmapped_area() that @@ -424,20 +445,33 @@ void flush_dcache_page(struct page *page) * on machines that support equivalent aliasing */ flush_dcache_mmap_lock_irqsave(mapping, flags); - vma_interval_tree_foreach(mpnt, &mapping->i_mmap, pgoff, pgoff) { - offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT; - addr = mpnt->vm_start + offset; - if (parisc_requires_coherency()) { - bool needs_flush = false; - pte_t *ptep; + vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff + nr - 1) { + unsigned long offset = pgoff - vma->vm_pgoff; + unsigned long pfn = folio_pfn(folio); + + addr = vma->vm_start; + nr = folio_nr_pages(folio); + if (offset > -nr) { + pfn -= offset; + nr += offset; + } else { + addr += offset * PAGE_SIZE; + } + if (addr + nr * PAGE_SIZE > vma->vm_end) + nr = (vma->vm_end - addr) / PAGE_SIZE; - ptep = get_ptep(mpnt->vm_mm, addr); - if (ptep) { - needs_flush = pte_needs_flush(*ptep); + if (parisc_requires_coherency()) { + for (i = 0; i < nr; i++) { + pte_t *ptep = get_ptep(vma->vm_mm, + addr + i * PAGE_SIZE); + if (!ptep) + continue; + if (pte_needs_flush(*ptep)) + flush_user_cache_page(vma, + addr + i * PAGE_SIZE); + /* Optimise accesses to the same table? 
*/ pte_unmap(ptep); } - if (needs_flush) - flush_user_cache_page(mpnt, addr); } else { /* * The TLB is the engine of coherence on parisc: @@ -450,27 +484,32 @@ void flush_dcache_page(struct page *page) * in (until the user or kernel specifically * accesses it, of course) */ - flush_tlb_page(mpnt, addr); + for (i = 0; i < nr; i++) + flush_tlb_page(vma, addr + i * PAGE_SIZE); if (old_addr == 0 || (old_addr & (SHM_COLOUR - 1)) != (addr & (SHM_COLOUR - 1))) { - __flush_cache_page(mpnt, addr, page_to_phys(page)); + for (i = 0; i < nr; i++) + __flush_cache_page(vma, + addr + i * PAGE_SIZE, + (pfn + i) * PAGE_SIZE); /* * Software is allowed to have any number * of private mappings to a page. */ - if (!(mpnt->vm_flags & VM_SHARED)) + if (!(vma->vm_flags & VM_SHARED)) continue; if (old_addr) pr_err("INEQUIVALENT ALIASES 0x%lx and 0x%lx in file %pD\n", - old_addr, addr, mpnt->vm_file); - old_addr = addr; + old_addr, addr, vma->vm_file); + if (nr == folio_nr_pages(folio)) + old_addr = addr; } } WARN_ON(++count == 4096); } flush_dcache_mmap_unlock_irqrestore(mapping, flags); } -EXPORT_SYMBOL(flush_dcache_page); +EXPORT_SYMBOL(flush_dcache_folio); /* Defined in arch/parisc/kernel/pacache.S */ EXPORT_SYMBOL(flush_kernel_dcache_range_asm); From 9fee28baa601f4dbf869b1373183b312d2d5ef3d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:49 +0100 Subject: [PATCH 450/489] powerpc: implement the new page table range API Add set_ptes(), update_mmu_cache_range() and flush_dcache_folio(). Change the PG_arch_1 (aka PG_dcache_dirty) flag from being per-page to per-folio. 
[willy@infradead.org: re-export flush_dcache_icache_folio()] Link: https://lkml.kernel.org/r/ZMx1daYwvD9EM7Cv@casper.infradead.org Link: https://lkml.kernel.org/r/20230802151406.3735276-22-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Christophe Leroy Signed-off-by: Andrew Morton --- arch/powerpc/include/asm/book3s/32/pgtable.h | 5 -- arch/powerpc/include/asm/book3s/64/pgtable.h | 6 +-- arch/powerpc/include/asm/book3s/pgtable.h | 11 ++-- arch/powerpc/include/asm/cacheflush.h | 14 ++++-- arch/powerpc/include/asm/kvm_ppc.h | 10 ++-- arch/powerpc/include/asm/nohash/pgtable.h | 16 ++---- arch/powerpc/include/asm/pgtable.h | 12 +++++ arch/powerpc/mm/book3s64/hash_utils.c | 11 ++-- arch/powerpc/mm/cacheflush.c | 41 ++++++--------- arch/powerpc/mm/nohash/e500_hugetlbpage.c | 3 +- arch/powerpc/mm/pgtable.c | 53 ++++++++++++-------- 11 files changed, 89 insertions(+), 93 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 7bf1fe7297c638..5f12b938290987 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -462,11 +462,6 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot) pgprot_val(pgprot)); } -static inline unsigned long pte_pfn(pte_t pte) -{ - return pte_val(pte) >> PTE_RPN_SHIFT; -} - /* Generic modifiers for PTE bits */ static inline pte_t pte_wrprotect(pte_t pte) { diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index a8204566cfd0f3..8269b231c53392 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -104,6 +104,7 @@ * and every thing below PAGE_SHIFT; */ #define PTE_RPN_MASK (((1UL << _PAGE_PA_MAX) - 1) & (PAGE_MASK)) +#define PTE_RPN_SHIFT PAGE_SHIFT /* * set of bits not changed in pmd_modify. 
Even though we have hash specific bits * in here, on radix we expect them to be zero. @@ -569,11 +570,6 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot) return __pte(((pte_basic_t)pfn << PAGE_SHIFT) | pgprot_val(pgprot) | _PAGE_PTE); } -static inline unsigned long pte_pfn(pte_t pte) -{ - return (pte_val(pte) & PTE_RPN_MASK) >> PAGE_SHIFT; -} - /* Generic modifiers for PTE bits */ static inline pte_t pte_wrprotect(pte_t pte) { diff --git a/arch/powerpc/include/asm/book3s/pgtable.h b/arch/powerpc/include/asm/book3s/pgtable.h index d18b748ea3ae0f..3b7bd36a23210b 100644 --- a/arch/powerpc/include/asm/book3s/pgtable.h +++ b/arch/powerpc/include/asm/book3s/pgtable.h @@ -9,13 +9,6 @@ #endif #ifndef __ASSEMBLY__ -/* Insert a PTE, top-level function is out of line. It uses an inline - * low level function in the respective pgtable-* files - */ -extern void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, - pte_t pte); - - #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty); @@ -36,7 +29,9 @@ void __update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t * corresponding HPTE into the hash table ahead of time, instead of * waiting for the inevitable extra hash-table miss exception. 
*/ -static inline void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) +static inline void update_mmu_cache_range(struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long address, + pte_t *ptep, unsigned int nr) { if (IS_ENABLED(CONFIG_PPC32) && !mmu_has_feature(MMU_FTR_HPTE_TABLE)) return; diff --git a/arch/powerpc/include/asm/cacheflush.h b/arch/powerpc/include/asm/cacheflush.h index 7564dd4fd12b7e..ef7d2de33b89e5 100644 --- a/arch/powerpc/include/asm/cacheflush.h +++ b/arch/powerpc/include/asm/cacheflush.h @@ -35,13 +35,19 @@ static inline void flush_cache_vmap(unsigned long start, unsigned long end) * It just marks the page as not i-cache clean. We do the i-cache * flush later when the page is given to a user process, if necessary. */ -static inline void flush_dcache_page(struct page *page) +static inline void flush_dcache_folio(struct folio *folio) { if (cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) return; /* avoid an atomic op if possible */ - if (test_bit(PG_dcache_clean, &page->flags)) - clear_bit(PG_dcache_clean, &page->flags); + if (test_bit(PG_dcache_clean, &folio->flags)) + clear_bit(PG_dcache_clean, &folio->flags); +} +#define flush_dcache_folio flush_dcache_folio + +static inline void flush_dcache_page(struct page *page) +{ + flush_dcache_folio(page_folio(page)); } void flush_icache_range(unsigned long start, unsigned long stop); @@ -51,7 +57,7 @@ void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len); #define flush_icache_user_page flush_icache_user_page -void flush_dcache_icache_page(struct page *page); +void flush_dcache_icache_folio(struct folio *folio); /** * flush_dcache_range(): Write any modified data cache blocks out to memory and diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index d16d80ad2ae42f..b4da8514af439b 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -894,7 +894,7 
@@ void kvmppc_init_lpid(unsigned long nr_lpids); static inline void kvmppc_mmu_flush_icache(kvm_pfn_t pfn) { - struct page *page; + struct folio *folio; /* * We can only access pages that the kernel maps * as memory. Bail out for unmapped ones. @@ -903,10 +903,10 @@ static inline void kvmppc_mmu_flush_icache(kvm_pfn_t pfn) return; /* Clear i-cache for new pages */ - page = pfn_to_page(pfn); - if (!test_bit(PG_dcache_clean, &page->flags)) { - flush_dcache_icache_page(page); - set_bit(PG_dcache_clean, &page->flags); + folio = page_folio(pfn_to_page(pfn)); + if (!test_bit(PG_dcache_clean, &folio->flags)) { + flush_dcache_icache_folio(folio); + set_bit(PG_dcache_clean, &folio->flags); } } diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index a6caaaab6f9222..56ea48276356c6 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -101,8 +101,6 @@ static inline bool pte_access_permitted(pte_t pte, bool write) static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot) { return __pte(((pte_basic_t)(pfn) << PTE_RPN_SHIFT) | pgprot_val(pgprot)); } -static inline unsigned long pte_pfn(pte_t pte) { - return pte_val(pte) >> PTE_RPN_SHIFT; } /* Generic modifiers for PTE bits */ static inline pte_t pte_exprotect(pte_t pte) @@ -166,12 +164,6 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) return __pte(pte_val(pte) & ~_PAGE_SWP_EXCLUSIVE); } -/* Insert a PTE, top-level function is out of line. It uses an inline - * low level function in the respective pgtable-* files - */ -extern void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, - pte_t pte); - /* This low level function performs the actual PTE insertion * Setting the PTE depends on the MMU type and other factors. It's * an horrible mess that I'm not going to try to clean up now but @@ -282,10 +274,12 @@ static inline int pud_huge(pud_t pud) * for the page which has just been mapped in. 
*/ #if defined(CONFIG_PPC_E500) && defined(CONFIG_HUGETLB_PAGE) -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); +void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, unsigned int nr); #else -static inline -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) {} +static inline void update_mmu_cache_range(struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long address, + pte_t *ptep, unsigned int nr) {} #endif #endif /* __ASSEMBLY__ */ diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 33464e6d64315a..b2e9bc4a52c11b 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -41,6 +41,12 @@ struct mm_struct; #ifndef __ASSEMBLY__ +void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t pte, unsigned int nr); +#define set_ptes set_ptes +#define update_mmu_cache(vma, addr, ptep) \ + update_mmu_cache_range(NULL, vma, addr, ptep, 1) + #ifndef MAX_PTRS_PER_PGD #define MAX_PTRS_PER_PGD PTRS_PER_PGD #endif @@ -48,6 +54,12 @@ struct mm_struct; /* Keep these as a macros to avoid include dependency mess */ #define pte_page(x) pfn_to_page(pte_pfn(x)) #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) + +static inline unsigned long pte_pfn(pte_t pte) +{ + return (pte_val(pte) & PTE_RPN_MASK) >> PTE_RPN_SHIFT; +} + /* * Select all bits except the pfn */ diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index fedffe3ae13652..ad2afa08e62ed9 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -1307,18 +1307,19 @@ void hash__early_init_mmu_secondary(void) */ unsigned int hash_page_do_lazy_icache(unsigned int pp, pte_t pte, int trap) { - struct page *page; + struct folio *folio; if (!pfn_valid(pte_pfn(pte))) return pp; - page = pte_page(pte); + folio 
= page_folio(pte_page(pte)); /* page is dirty */ - if (!test_bit(PG_dcache_clean, &page->flags) && !PageReserved(page)) { + if (!test_bit(PG_dcache_clean, &folio->flags) && + !folio_test_reserved(folio)) { if (trap == INTERRUPT_INST_STORAGE) { - flush_dcache_icache_page(page); - set_bit(PG_dcache_clean, &page->flags); + flush_dcache_icache_folio(folio); + set_bit(PG_dcache_clean, &folio->flags); } else pp |= HPTE_R_N; } diff --git a/arch/powerpc/mm/cacheflush.c b/arch/powerpc/mm/cacheflush.c index 0e9b4879c0f9ea..15189592da09e1 100644 --- a/arch/powerpc/mm/cacheflush.c +++ b/arch/powerpc/mm/cacheflush.c @@ -148,44 +148,31 @@ static void __flush_dcache_icache(void *p) invalidate_icache_range(addr, addr + PAGE_SIZE); } -static void flush_dcache_icache_hugepage(struct page *page) +void flush_dcache_icache_folio(struct folio *folio) { - int i; - int nr = compound_nr(page); + unsigned int i, nr = folio_nr_pages(folio); - if (!PageHighMem(page)) { + if (flush_coherent_icache()) + return; + + if (!folio_test_highmem(folio)) { + void *addr = folio_address(folio); for (i = 0; i < nr; i++) - __flush_dcache_icache(lowmem_page_address(page + i)); - } else { + __flush_dcache_icache(addr + i * PAGE_SIZE); + } else if (IS_ENABLED(CONFIG_BOOKE) || sizeof(phys_addr_t) > sizeof(void *)) { for (i = 0; i < nr; i++) { - void *start = kmap_local_page(page + i); + void *start = kmap_local_folio(folio, i * PAGE_SIZE); __flush_dcache_icache(start); kunmap_local(start); } - } -} - -void flush_dcache_icache_page(struct page *page) -{ - if (flush_coherent_icache()) - return; - - if (PageCompound(page)) - return flush_dcache_icache_hugepage(page); - - if (!PageHighMem(page)) { - __flush_dcache_icache(lowmem_page_address(page)); - } else if (IS_ENABLED(CONFIG_BOOKE) || sizeof(phys_addr_t) > sizeof(void *)) { - void *start = kmap_local_page(page); - - __flush_dcache_icache(start); - kunmap_local(start); } else { - flush_dcache_icache_phys(page_to_phys(page)); + unsigned long pfn = 
folio_pfn(folio); + for (i = 0; i < nr; i++) + flush_dcache_icache_phys((pfn + i) * PAGE_SIZE); } } -EXPORT_SYMBOL(flush_dcache_icache_page); +EXPORT_SYMBOL(flush_dcache_icache_folio); void clear_user_page(void *page, unsigned long vaddr, struct page *pg) { diff --git a/arch/powerpc/mm/nohash/e500_hugetlbpage.c b/arch/powerpc/mm/nohash/e500_hugetlbpage.c index 58c8d9849cb116..6b30e40d45903b 100644 --- a/arch/powerpc/mm/nohash/e500_hugetlbpage.c +++ b/arch/powerpc/mm/nohash/e500_hugetlbpage.c @@ -178,7 +178,8 @@ book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, pte_t pte) * * This must always be called with the pte lock held. */ -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) +void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, unsigned int nr) { if (is_vm_hugetlb_page(vma)) book3e_hugetlb_preload(vma, address, *ptep); diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index a3dcdb2d5b4b6a..3f86fd217690be 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -58,7 +58,7 @@ static inline int pte_looks_normal(pte_t pte) return 0; } -static struct page *maybe_pte_to_page(pte_t pte) +static struct folio *maybe_pte_to_folio(pte_t pte) { unsigned long pfn = pte_pfn(pte); struct page *page; @@ -68,7 +68,7 @@ static struct page *maybe_pte_to_page(pte_t pte) page = pfn_to_page(pfn); if (PageReserved(page)) return NULL; - return page; + return page_folio(page); } #ifdef CONFIG_PPC_BOOK3S @@ -84,12 +84,12 @@ static pte_t set_pte_filter_hash(pte_t pte) pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) || cpu_has_feature(CPU_FTR_NOEXECUTE))) { - struct page *pg = maybe_pte_to_page(pte); - if (!pg) + struct folio *folio = maybe_pte_to_folio(pte); + if (!folio) return pte; - if (!test_bit(PG_dcache_clean, &pg->flags)) { - flush_dcache_icache_page(pg); - 
set_bit(PG_dcache_clean, &pg->flags); + if (!test_bit(PG_dcache_clean, &folio->flags)) { + flush_dcache_icache_folio(folio); + set_bit(PG_dcache_clean, &folio->flags); } } return pte; @@ -107,7 +107,7 @@ static pte_t set_pte_filter_hash(pte_t pte) { return pte; } */ static inline pte_t set_pte_filter(pte_t pte) { - struct page *pg; + struct folio *folio; if (radix_enabled()) return pte; @@ -120,18 +120,18 @@ static inline pte_t set_pte_filter(pte_t pte) return pte; /* If you set _PAGE_EXEC on weird pages you're on your own */ - pg = maybe_pte_to_page(pte); - if (unlikely(!pg)) + folio = maybe_pte_to_folio(pte); + if (unlikely(!folio)) return pte; /* If the page clean, we move on */ - if (test_bit(PG_dcache_clean, &pg->flags)) + if (test_bit(PG_dcache_clean, &folio->flags)) return pte; /* If it's an exec fault, we flush the cache and make it clean */ if (is_exec_fault()) { - flush_dcache_icache_page(pg); - set_bit(PG_dcache_clean, &pg->flags); + flush_dcache_icache_folio(folio); + set_bit(PG_dcache_clean, &folio->flags); return pte; } @@ -142,7 +142,7 @@ static inline pte_t set_pte_filter(pte_t pte) static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, int dirty) { - struct page *pg; + struct folio *folio; if (IS_ENABLED(CONFIG_PPC_BOOK3S_64)) return pte; @@ -168,17 +168,17 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, #endif /* CONFIG_DEBUG_VM */ /* If you set _PAGE_EXEC on weird pages you're on your own */ - pg = maybe_pte_to_page(pte); - if (unlikely(!pg)) + folio = maybe_pte_to_folio(pte); + if (unlikely(!folio)) goto bail; /* If the page is already clean, we move on */ - if (test_bit(PG_dcache_clean, &pg->flags)) + if (test_bit(PG_dcache_clean, &folio->flags)) goto bail; /* Clean the page and set PG_dcache_clean */ - flush_dcache_icache_page(pg); - set_bit(PG_dcache_clean, &pg->flags); + flush_dcache_icache_folio(folio); + set_bit(PG_dcache_clean, &folio->flags); bail: return pte_mkexec(pte); @@ -187,8 
+187,8 @@ static pte_t set_access_flags_filter(pte_t pte, struct vm_area_struct *vma, /* * set_pte stores a linux PTE into the linux page table. */ -void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, - pte_t pte) +void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, + pte_t pte, unsigned int nr) { /* * Make sure hardware valid bit is not set. We don't do @@ -203,7 +203,16 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte = set_pte_filter(pte); /* Perform the setting of the PTE */ - __set_pte_at(mm, addr, ptep, pte, 0); + arch_enter_lazy_mmu_mode(); + for (;;) { + __set_pte_at(mm, addr, ptep, pte, 0); + if (--nr == 0) + break; + ptep++; + pte = __pte(pte_val(pte) + (1UL << PTE_RPN_SHIFT)); + addr += PAGE_SIZE; + } + arch_leave_lazy_mmu_mode(); } void unmap_kernel_page(unsigned long va) From 864609c6a0b5f0464f6ec7869cb2a45a529c35d7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:50 +0100 Subject: [PATCH 451/489] riscv: implement the new page table range API Add set_ptes(), update_mmu_cache_range() and flush_dcache_folio(). Change the PG_dcache_clean flag from being per-page to per-folio. 
Link: https://lkml.kernel.org/r/20230802151406.3735276-23-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Alexandre Ghiti Acked-by: Mike Rapoport (IBM) Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Signed-off-by: Andrew Morton --- arch/riscv/include/asm/cacheflush.h | 19 +++++++-------- arch/riscv/include/asm/pgtable.h | 37 +++++++++++++++++++---------- arch/riscv/mm/cacheflush.c | 13 +++------- 3 files changed, 36 insertions(+), 33 deletions(-) diff --git a/arch/riscv/include/asm/cacheflush.h b/arch/riscv/include/asm/cacheflush.h index 8091b8bf4883fe..0d8c92c5dfb7ef 100644 --- a/arch/riscv/include/asm/cacheflush.h +++ b/arch/riscv/include/asm/cacheflush.h @@ -15,20 +15,19 @@ static inline void local_flush_icache_all(void) #define PG_dcache_clean PG_arch_1 -static inline void flush_dcache_page(struct page *page) +static inline void flush_dcache_folio(struct folio *folio) { - /* - * HugeTLB pages are always fully mapped and only head page will be - * set PG_dcache_clean (see comments in flush_icache_pte()). - */ - if (PageHuge(page)) - page = compound_head(page); - - if (test_bit(PG_dcache_clean, &page->flags)) - clear_bit(PG_dcache_clean, &page->flags); + if (test_bit(PG_dcache_clean, &folio->flags)) + clear_bit(PG_dcache_clean, &folio->flags); } +#define flush_dcache_folio flush_dcache_folio #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 +static inline void flush_dcache_page(struct page *page) +{ + flush_dcache_folio(page_folio(page)); +} + /* * RISC-V doesn't have an instruction to flush parts of the instruction cache, * so instead we just flush the whole thing. 
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 01e4aabc88984e..ac42e9121e52e5 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -445,8 +445,9 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) /* Commit new configuration to MMU hardware */ -static inline void update_mmu_cache(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep) +static inline void update_mmu_cache_range(struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long address, + pte_t *ptep, unsigned int nr) { /* * The kernel assumes that TLBs don't cache invalid entries, but @@ -455,8 +456,11 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, * Relying on flush_tlb_fix_spurious_fault would suffice, but * the extra traps reduce performance. So, eagerly SFENCE.VMA. */ - local_flush_tlb_page(address); + while (nr--) + local_flush_tlb_page(address + nr * PAGE_SIZE); } +#define update_mmu_cache(vma, addr, ptep) \ + update_mmu_cache_range(NULL, vma, addr, ptep, 1) #define __HAVE_ARCH_UPDATE_MMU_TLB #define update_mmu_tlb update_mmu_cache @@ -487,8 +491,7 @@ static inline void set_pte(pte_t *ptep, pte_t pteval) void flush_icache_pte(pte_t pte); -static inline void __set_pte_at(struct mm_struct *mm, - unsigned long addr, pte_t *ptep, pte_t pteval) +static inline void __set_pte_at(pte_t *ptep, pte_t pteval) { if (pte_present(pteval) && pte_exec(pteval)) flush_icache_pte(pteval); @@ -496,17 +499,25 @@ static inline void __set_pte_at(struct mm_struct *mm, set_pte(ptep, pteval); } -static inline void set_pte_at(struct mm_struct *mm, - unsigned long addr, pte_t *ptep, pte_t pteval) +static inline void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval, unsigned int nr) { - page_table_check_ptes_set(mm, ptep, pteval, 1); - __set_pte_at(mm, addr, ptep, pteval); + page_table_check_ptes_set(mm, ptep, pteval, nr); + + for (;;) { + __set_pte_at(ptep, pteval); + if (--nr 
== 0) + break; + ptep++; + pte_val(pteval) += 1 << _PAGE_PFN_SHIFT; + } } +#define set_ptes set_ptes static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - __set_pte_at(mm, addr, ptep, __pte(0)); + __set_pte_at(ptep, __pte(0)); } #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS @@ -515,7 +526,7 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma, pte_t entry, int dirty) { if (!pte_same(*ptep, entry)) - set_pte_at(vma->vm_mm, address, ptep, entry); + __set_pte_at(ptep, entry); /* * update_mmu_cache will unconditionally execute, handling both * the case that the PTE changed and the spurious fault case. @@ -688,14 +699,14 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { page_table_check_pmd_set(mm, pmdp, pmd); - return __set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd)); + return __set_pte_at((pte_t *)pmdp, pmd_pte(pmd)); } static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud) { page_table_check_pud_set(mm, pudp, pud); - return __set_pte_at(mm, addr, (pte_t *)pudp, pud_pte(pud)); + return __set_pte_at((pte_t *)pudp, pud_pte(pud)); } #ifdef CONFIG_PAGE_TABLE_CHECK diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c index fbc59b3f69f24e..f1387272a5512d 100644 --- a/arch/riscv/mm/cacheflush.c +++ b/arch/riscv/mm/cacheflush.c @@ -82,18 +82,11 @@ void flush_icache_mm(struct mm_struct *mm, bool local) #ifdef CONFIG_MMU void flush_icache_pte(pte_t pte) { - struct page *page = pte_page(pte); + struct folio *folio = page_folio(pte_page(pte)); - /* - * HugeTLB pages are always fully mapped, so only setting head page's - * PG_dcache_clean flag is enough. 
- */ - if (PageHuge(page)) - page = compound_head(page); - - if (!test_bit(PG_dcache_clean, &page->flags)) { + if (!test_bit(PG_dcache_clean, &folio->flags)) { flush_icache_all(); - set_bit(PG_dcache_clean, &page->flags); + set_bit(PG_dcache_clean, &folio->flags); } } #endif /* CONFIG_MMU */ From 843f9310e00ad4d6207cff88c05ce90b857625d0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:51 +0100 Subject: [PATCH 452/489] s390: implement the new page table range API Add set_ptes() and update_mmu_cache_range(). Link: https://lkml.kernel.org/r/20230802151406.3735276-24-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Gerald Schaefer Acked-by: Mike Rapoport (IBM) Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Sven Schnelle Signed-off-by: Andrew Morton --- arch/s390/include/asm/pgtable.h | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index c55f3c3365af8e..02973c740a5be7 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -47,6 +47,7 @@ static inline void update_page_count(int level, long count) * tables contain all the necessary information. */ #define update_mmu_cache(vma, address, ptep) do { } while (0) +#define update_mmu_cache_range(vmf, vma, addr, ptep, nr) do { } while (0) #define update_mmu_cache_pmd(vma, address, ptep) do { } while (0) /* @@ -1316,20 +1317,34 @@ pgprot_t pgprot_writecombine(pgprot_t prot); pgprot_t pgprot_writethrough(pgprot_t prot); /* - * Certain architectures need to do special things when PTEs - * within a page table are directly modified. Thus, the following - * hook is made available. + * Set multiple PTEs to consecutive pages with a single call. All PTEs + * are within the same folio, PMD and VMA. 
*/ -static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t entry) +static inline void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t entry, unsigned int nr) { if (pte_present(entry)) entry = clear_pte_bit(entry, __pgprot(_PAGE_UNUSED)); - if (mm_has_pgste(mm)) - ptep_set_pte_at(mm, addr, ptep, entry); - else - set_pte(ptep, entry); + if (mm_has_pgste(mm)) { + for (;;) { + ptep_set_pte_at(mm, addr, ptep, entry); + if (--nr == 0) + break; + ptep++; + entry = __pte(pte_val(entry) + PAGE_SIZE); + addr += PAGE_SIZE; + } + } else { + for (;;) { + set_pte(ptep, entry); + if (--nr == 0) + break; + ptep++; + entry = __pte(pte_val(entry) + PAGE_SIZE); + } + } } +#define set_ptes set_ptes /* * Conversion functions: convert a page and protection to a page entry, From 157efa290441bf7eb952f81717704afef09ae0d6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:52 +0100 Subject: [PATCH 453/489] sh: implement the new page table range API Add PFN_PTE_SHIFT, update_mmu_cache_range(), flush_dcache_folio() and flush_icache_pages(). Change the PG_dcache_clean flag from being per-page to per-folio. Flush the entire folio containing the pages in flush_icache_pages() for ease of implementation. 
Link: https://lkml.kernel.org/r/20230802151406.3735276-25-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Yoshinori Sato Cc: Rich Felker Cc: John Paul Adrian Glaubitz Signed-off-by: Andrew Morton --- arch/sh/include/asm/cacheflush.h | 21 ++++++++----- arch/sh/include/asm/pgtable.h | 7 +++-- arch/sh/include/asm/pgtable_32.h | 5 ++- arch/sh/mm/cache-j2.c | 4 +-- arch/sh/mm/cache-sh4.c | 26 +++++++++++----- arch/sh/mm/cache-sh7705.c | 26 ++++++++++------ arch/sh/mm/cache.c | 52 ++++++++++++++++++-------------- arch/sh/mm/kmap.c | 3 +- 8 files changed, 89 insertions(+), 55 deletions(-) diff --git a/arch/sh/include/asm/cacheflush.h b/arch/sh/include/asm/cacheflush.h index 481a664287e2e4..9fceef6f3e002b 100644 --- a/arch/sh/include/asm/cacheflush.h +++ b/arch/sh/include/asm/cacheflush.h @@ -13,9 +13,9 @@ * - flush_cache_page(mm, vmaddr, pfn) flushes a single page * - flush_cache_range(vma, start, end) flushes a range of pages * - * - flush_dcache_page(pg) flushes(wback&invalidates) a page for dcache + * - flush_dcache_folio(folio) flushes(wback&invalidates) a folio for dcache * - flush_icache_range(start, end) flushes(invalidates) a range for icache - * - flush_icache_page(vma, pg) flushes(invalidates) a page for icache + * - flush_icache_pages(vma, pg, nr) flushes(invalidates) pages for icache * - flush_cache_sigtramp(vaddr) flushes the signal trampoline */ extern void (*local_flush_cache_all)(void *args); @@ -23,9 +23,9 @@ extern void (*local_flush_cache_mm)(void *args); extern void (*local_flush_cache_dup_mm)(void *args); extern void (*local_flush_cache_page)(void *args); extern void (*local_flush_cache_range)(void *args); -extern void (*local_flush_dcache_page)(void *args); +extern void (*local_flush_dcache_folio)(void *args); extern void (*local_flush_icache_range)(void *args); -extern void (*local_flush_icache_page)(void *args); +extern void (*local_flush_icache_folio)(void *args); extern void 
(*local_flush_cache_sigtramp)(void *args); static inline void cache_noop(void *args) { } @@ -42,11 +42,18 @@ extern void flush_cache_page(struct vm_area_struct *vma, extern void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 -void flush_dcache_page(struct page *page); +void flush_dcache_folio(struct folio *folio); +#define flush_dcache_folio flush_dcache_folio +static inline void flush_dcache_page(struct page *page) +{ + flush_dcache_folio(page_folio(page)); +} + extern void flush_icache_range(unsigned long start, unsigned long end); #define flush_icache_user_range flush_icache_range -extern void flush_icache_page(struct vm_area_struct *vma, - struct page *page); +void flush_icache_pages(struct vm_area_struct *vma, struct page *page, + unsigned int nr); +#define flush_icache_page(vma, page) flush_icache_pages(vma, page, 1) extern void flush_cache_sigtramp(unsigned long address); struct flusher_data { diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h index 3ce30becf6dfa9..729f5c6225fbb4 100644 --- a/arch/sh/include/asm/pgtable.h +++ b/arch/sh/include/asm/pgtable.h @@ -102,13 +102,16 @@ extern void __update_cache(struct vm_area_struct *vma, extern void __update_tlb(struct vm_area_struct *vma, unsigned long address, pte_t pte); -static inline void -update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) +static inline void update_mmu_cache_range(struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long address, + pte_t *ptep, unsigned int nr) { pte_t pte = *ptep; __update_cache(vma, address, pte); __update_tlb(vma, address, pte); } +#define update_mmu_cache(vma, addr, ptep) \ + update_mmu_cache_range(NULL, vma, addr, ptep, 1) extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; extern void paging_init(void); diff --git a/arch/sh/include/asm/pgtable_32.h b/arch/sh/include/asm/pgtable_32.h index 21952b0946509f..676f3d4ef6ceac 100644 --- 
a/arch/sh/include/asm/pgtable_32.h +++ b/arch/sh/include/asm/pgtable_32.h @@ -307,14 +307,13 @@ static inline void set_pte(pte_t *ptep, pte_t pte) #define set_pte(pteptr, pteval) (*(pteptr) = pteval) #endif -#define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval) - /* * (pmds are folded into pgds so this doesn't get actually called, * but the define is needed for a generic inline function.) */ #define set_pmd(pmdptr, pmdval) (*(pmdptr) = pmdval) +#define PFN_PTE_SHIFT PAGE_SHIFT #define pfn_pte(pfn, prot) \ __pte(((unsigned long long)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) #define pfn_pmd(pfn, prot) \ @@ -323,7 +322,7 @@ static inline void set_pte(pte_t *ptep, pte_t pte) #define pte_none(x) (!pte_val(x)) #define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE)) -#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0) +#define pte_clear(mm, addr, ptep) set_pte(ptep, __pte(0)) #define pmd_none(x) (!pmd_val(x)) #define pmd_present(x) (pmd_val(x)) diff --git a/arch/sh/mm/cache-j2.c b/arch/sh/mm/cache-j2.c index f277862a11f530..9ac96021438086 100644 --- a/arch/sh/mm/cache-j2.c +++ b/arch/sh/mm/cache-j2.c @@ -55,9 +55,9 @@ void __init j2_cache_init(void) local_flush_cache_dup_mm = j2_flush_both; local_flush_cache_page = j2_flush_both; local_flush_cache_range = j2_flush_both; - local_flush_dcache_page = j2_flush_dcache; + local_flush_dcache_folio = j2_flush_dcache; local_flush_icache_range = j2_flush_icache; - local_flush_icache_page = j2_flush_icache; + local_flush_icache_folio = j2_flush_icache; local_flush_cache_sigtramp = j2_flush_icache; pr_info("Initial J2 CCR is %.8x\n", __raw_readl(j2_ccr_base)); diff --git a/arch/sh/mm/cache-sh4.c b/arch/sh/mm/cache-sh4.c index 72c2e1b46c0838..862046f26981b6 100644 --- a/arch/sh/mm/cache-sh4.c +++ b/arch/sh/mm/cache-sh4.c @@ -107,19 +107,29 @@ static inline void flush_cache_one(unsigned long start, unsigned long phys) * Write back & invalidate the D-cache of the page. 
* (To avoid "alias" issues) */ -static void sh4_flush_dcache_page(void *arg) +static void sh4_flush_dcache_folio(void *arg) { - struct page *page = arg; - unsigned long addr = (unsigned long)page_address(page); + struct folio *folio = arg; #ifndef CONFIG_SMP - struct address_space *mapping = page_mapping_file(page); + struct address_space *mapping = folio_flush_mapping(folio); if (mapping && !mapping_mapped(mapping)) - clear_bit(PG_dcache_clean, &page->flags); + clear_bit(PG_dcache_clean, &folio->flags); else #endif - flush_cache_one(CACHE_OC_ADDRESS_ARRAY | - (addr & shm_align_mask), page_to_phys(page)); + { + unsigned long pfn = folio_pfn(folio); + unsigned long addr = (unsigned long)folio_address(folio); + unsigned int i, nr = folio_nr_pages(folio); + + for (i = 0; i < nr; i++) { + flush_cache_one(CACHE_OC_ADDRESS_ARRAY | + (addr & shm_align_mask), + pfn * PAGE_SIZE); + addr += PAGE_SIZE; + pfn++; + } + } wmb(); } @@ -379,7 +389,7 @@ void __init sh4_cache_init(void) __raw_readl(CCN_PRR)); local_flush_icache_range = sh4_flush_icache_range; - local_flush_dcache_page = sh4_flush_dcache_page; + local_flush_dcache_folio = sh4_flush_dcache_folio; local_flush_cache_all = sh4_flush_cache_all; local_flush_cache_mm = sh4_flush_cache_mm; local_flush_cache_dup_mm = sh4_flush_cache_mm; diff --git a/arch/sh/mm/cache-sh7705.c b/arch/sh/mm/cache-sh7705.c index 9b63a53a5e46fe..b509a407588fa1 100644 --- a/arch/sh/mm/cache-sh7705.c +++ b/arch/sh/mm/cache-sh7705.c @@ -132,15 +132,20 @@ static void __flush_dcache_page(unsigned long phys) * Write back & invalidate the D-cache of the page. 
* (To avoid "alias" issues) */ -static void sh7705_flush_dcache_page(void *arg) +static void sh7705_flush_dcache_folio(void *arg) { - struct page *page = arg; - struct address_space *mapping = page_mapping_file(page); + struct folio *folio = arg; + struct address_space *mapping = folio_flush_mapping(folio); if (mapping && !mapping_mapped(mapping)) - clear_bit(PG_dcache_clean, &page->flags); - else - __flush_dcache_page(__pa(page_address(page))); + clear_bit(PG_dcache_clean, &folio->flags); + else { + unsigned long pfn = folio_pfn(folio); + unsigned int i, nr = folio_nr_pages(folio); + + for (i = 0; i < nr; i++) + __flush_dcache_page((pfn + i) * PAGE_SIZE); + } } static void sh7705_flush_cache_all(void *args) @@ -176,19 +181,20 @@ static void sh7705_flush_cache_page(void *args) * Not entirely sure why this is necessary on SH3 with 32K cache but * without it we get occasional "Memory fault" when loading a program. */ -static void sh7705_flush_icache_page(void *page) +static void sh7705_flush_icache_folio(void *arg) { - __flush_purge_region(page_address(page), PAGE_SIZE); + struct folio *folio = arg; + __flush_purge_region(folio_address(folio), folio_size(folio)); } void __init sh7705_cache_init(void) { local_flush_icache_range = sh7705_flush_icache_range; - local_flush_dcache_page = sh7705_flush_dcache_page; + local_flush_dcache_folio = sh7705_flush_dcache_folio; local_flush_cache_all = sh7705_flush_cache_all; local_flush_cache_mm = sh7705_flush_cache_all; local_flush_cache_dup_mm = sh7705_flush_cache_all; local_flush_cache_range = sh7705_flush_cache_all; local_flush_cache_page = sh7705_flush_cache_page; - local_flush_icache_page = sh7705_flush_icache_page; + local_flush_icache_folio = sh7705_flush_icache_folio; } diff --git a/arch/sh/mm/cache.c b/arch/sh/mm/cache.c index 3aef78ceb82090..9bcaa5619eabd1 100644 --- a/arch/sh/mm/cache.c +++ b/arch/sh/mm/cache.c @@ -20,9 +20,9 @@ void (*local_flush_cache_mm)(void *args) = cache_noop; void (*local_flush_cache_dup_mm)(void 
*args) = cache_noop; void (*local_flush_cache_page)(void *args) = cache_noop; void (*local_flush_cache_range)(void *args) = cache_noop; -void (*local_flush_dcache_page)(void *args) = cache_noop; +void (*local_flush_dcache_folio)(void *args) = cache_noop; void (*local_flush_icache_range)(void *args) = cache_noop; -void (*local_flush_icache_page)(void *args) = cache_noop; +void (*local_flush_icache_folio)(void *args) = cache_noop; void (*local_flush_cache_sigtramp)(void *args) = cache_noop; void (*__flush_wback_region)(void *start, int size); @@ -61,15 +61,17 @@ void copy_to_user_page(struct vm_area_struct *vma, struct page *page, unsigned long vaddr, void *dst, const void *src, unsigned long len) { - if (boot_cpu_data.dcache.n_aliases && page_mapcount(page) && - test_bit(PG_dcache_clean, &page->flags)) { + struct folio *folio = page_folio(page); + + if (boot_cpu_data.dcache.n_aliases && folio_mapped(folio) && + test_bit(PG_dcache_clean, &folio->flags)) { void *vto = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK); memcpy(vto, src, len); kunmap_coherent(vto); } else { memcpy(dst, src, len); if (boot_cpu_data.dcache.n_aliases) - clear_bit(PG_dcache_clean, &page->flags); + clear_bit(PG_dcache_clean, &folio->flags); } if (vma->vm_flags & VM_EXEC) @@ -80,27 +82,30 @@ void copy_from_user_page(struct vm_area_struct *vma, struct page *page, unsigned long vaddr, void *dst, const void *src, unsigned long len) { + struct folio *folio = page_folio(page); + if (boot_cpu_data.dcache.n_aliases && page_mapcount(page) && - test_bit(PG_dcache_clean, &page->flags)) { + test_bit(PG_dcache_clean, &folio->flags)) { void *vfrom = kmap_coherent(page, vaddr) + (vaddr & ~PAGE_MASK); memcpy(dst, vfrom, len); kunmap_coherent(vfrom); } else { memcpy(dst, src, len); if (boot_cpu_data.dcache.n_aliases) - clear_bit(PG_dcache_clean, &page->flags); + clear_bit(PG_dcache_clean, &folio->flags); } } void copy_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct 
*vma) { + struct folio *src = page_folio(from); void *vfrom, *vto; vto = kmap_atomic(to); - if (boot_cpu_data.dcache.n_aliases && page_mapcount(from) && - test_bit(PG_dcache_clean, &from->flags)) { + if (boot_cpu_data.dcache.n_aliases && folio_mapped(src) && + test_bit(PG_dcache_clean, &src->flags)) { vfrom = kmap_coherent(from, vaddr); copy_page(vto, vfrom); kunmap_coherent(vfrom); @@ -136,27 +141,28 @@ EXPORT_SYMBOL(clear_user_highpage); void __update_cache(struct vm_area_struct *vma, unsigned long address, pte_t pte) { - struct page *page; unsigned long pfn = pte_pfn(pte); if (!boot_cpu_data.dcache.n_aliases) return; - page = pfn_to_page(pfn); if (pfn_valid(pfn)) { - int dirty = !test_and_set_bit(PG_dcache_clean, &page->flags); + struct folio *folio = page_folio(pfn_to_page(pfn)); + int dirty = !test_and_set_bit(PG_dcache_clean, &folio->flags); if (dirty) - __flush_purge_region(page_address(page), PAGE_SIZE); + __flush_purge_region(folio_address(folio), + folio_size(folio)); } } void __flush_anon_page(struct page *page, unsigned long vmaddr) { + struct folio *folio = page_folio(page); unsigned long addr = (unsigned long) page_address(page); if (pages_do_alias(addr, vmaddr)) { - if (boot_cpu_data.dcache.n_aliases && page_mapcount(page) && - test_bit(PG_dcache_clean, &page->flags)) { + if (boot_cpu_data.dcache.n_aliases && folio_mapped(folio) && + test_bit(PG_dcache_clean, &folio->flags)) { void *kaddr; kaddr = kmap_coherent(page, vmaddr); @@ -164,7 +170,8 @@ void __flush_anon_page(struct page *page, unsigned long vmaddr) /* __flush_purge_region((void *)kaddr, PAGE_SIZE); */ kunmap_coherent(kaddr); } else - __flush_purge_region((void *)addr, PAGE_SIZE); + __flush_purge_region(folio_address(folio), + folio_size(folio)); } } @@ -215,11 +222,11 @@ void flush_cache_range(struct vm_area_struct *vma, unsigned long start, } EXPORT_SYMBOL(flush_cache_range); -void flush_dcache_page(struct page *page) +void flush_dcache_folio(struct folio *folio) { - 
cacheop_on_each_cpu(local_flush_dcache_page, page, 1); + cacheop_on_each_cpu(local_flush_dcache_folio, folio, 1); } -EXPORT_SYMBOL(flush_dcache_page); +EXPORT_SYMBOL(flush_dcache_folio); void flush_icache_range(unsigned long start, unsigned long end) { @@ -233,10 +240,11 @@ void flush_icache_range(unsigned long start, unsigned long end) } EXPORT_SYMBOL(flush_icache_range); -void flush_icache_page(struct vm_area_struct *vma, struct page *page) +void flush_icache_pages(struct vm_area_struct *vma, struct page *page, + unsigned int nr) { - /* Nothing uses the VMA, so just pass the struct page along */ - cacheop_on_each_cpu(local_flush_icache_page, page, 1); + /* Nothing uses the VMA, so just pass the folio along */ + cacheop_on_each_cpu(local_flush_icache_folio, page_folio(page), 1); } void flush_cache_sigtramp(unsigned long address) diff --git a/arch/sh/mm/kmap.c b/arch/sh/mm/kmap.c index 73fd7cc9943076..fa50e8f6e7a91b 100644 --- a/arch/sh/mm/kmap.c +++ b/arch/sh/mm/kmap.c @@ -27,10 +27,11 @@ void __init kmap_coherent_init(void) void *kmap_coherent(struct page *page, unsigned long addr) { + struct folio *folio = page_folio(page); enum fixed_addresses idx; unsigned long vaddr; - BUG_ON(!test_bit(PG_dcache_clean, &page->flags)); + BUG_ON(!test_bit(PG_dcache_clean, &folio->flags)); preempt_disable(); pagefault_disable(); From 665f640294540a941aabb81ae46dfc671aff5259 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:53 +0100 Subject: [PATCH 454/489] sparc32: implement the new page table range API Add PFN_PTE_SHIFT, update_mmu_cache_range(), flush_dcache_folio() and flush_icache_pages(). Link: https://lkml.kernel.org/r/20230802151406.3735276-26-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Cc: "David S. 
Miller" Signed-off-by: Andrew Morton --- arch/sparc/include/asm/cacheflush_32.h | 10 ++++++++-- arch/sparc/include/asm/pgtable_32.h | 8 ++++---- arch/sparc/mm/init_32.c | 13 +++++++++++-- 3 files changed, 23 insertions(+), 8 deletions(-) diff --git a/arch/sparc/include/asm/cacheflush_32.h b/arch/sparc/include/asm/cacheflush_32.h index adb6991d04554c..c8dd971f0e881b 100644 --- a/arch/sparc/include/asm/cacheflush_32.h +++ b/arch/sparc/include/asm/cacheflush_32.h @@ -2,6 +2,7 @@ #ifndef _SPARC_CACHEFLUSH_H #define _SPARC_CACHEFLUSH_H +#include <linux/page-flags.h> #include <asm/cachetlb_32.h> #define flush_cache_all() \ @@ -16,6 +17,7 @@ sparc32_cachetlb_ops->cache_page(vma, addr) #define flush_icache_range(start, end) do { } while (0) #define flush_icache_page(vma, pg) do { } while (0) +#define flush_icache_pages(vma, pg, nr) do { } while (0) #define copy_to_user_page(vma, page, vaddr, dst, src, len) \ do { \ @@ -35,11 +37,15 @@ #define flush_page_for_dma(addr) \ sparc32_cachetlb_ops->page_for_dma(addr) -struct page; void sparc_flush_page_to_ram(struct page *page); +void sparc_flush_folio_to_ram(struct folio *folio); #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 -#define flush_dcache_page(page) sparc_flush_page_to_ram(page) +#define flush_dcache_folio(folio) sparc_flush_folio_to_ram(folio) +static inline void flush_dcache_page(struct page *page) +{ + flush_dcache_folio(page_folio(page)); +} #define flush_dcache_mmap_lock(mapping) do { } while (0) #define flush_dcache_mmap_unlock(mapping) do { } while (0) diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index d4330e3c57a6e5..315d316614cae8 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -101,8 +101,6 @@ static inline void set_pte(pte_t *ptep, pte_t pteval) srmmu_swap((unsigned long *)ptep, pte_val(pteval)); } -#define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval) - static inline int srmmu_device_memory(unsigned long x) { return ((x & 0xF0000000) != 0); @@ -256,6 +254,7
@@ static inline pte_t pte_mkyoung(pte_t pte) return __pte(pte_val(pte) | SRMMU_REF); } +#define PFN_PTE_SHIFT (PAGE_SHIFT - 4) #define pfn_pte(pfn, prot) mk_pte(pfn_to_page(pfn), prot) static inline unsigned long pte_pfn(pte_t pte) @@ -268,7 +267,7 @@ static inline unsigned long pte_pfn(pte_t pte) */ return ~0UL; } - return (pte_val(pte) & SRMMU_PTE_PMASK) >> (PAGE_SHIFT-4); + return (pte_val(pte) & SRMMU_PTE_PMASK) >> PFN_PTE_SHIFT; } #define pte_page(pte) pfn_to_page(pte_pfn(pte)) @@ -318,6 +317,7 @@ void mmu_info(struct seq_file *m); #define FAULT_CODE_USER 0x4 #define update_mmu_cache(vma, address, ptep) do { } while (0) +#define update_mmu_cache_range(vmf, vma, address, ptep, nr) do { } while (0) void srmmu_mapiorange(unsigned int bus, unsigned long xpa, unsigned long xva, unsigned int len); @@ -422,7 +422,7 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma, ({ \ int __changed = !pte_same(*(__ptep), __entry); \ if (__changed) { \ - set_pte_at((__vma)->vm_mm, (__address), __ptep, __entry); \ + set_pte(__ptep, __entry); \ flush_tlb_page(__vma, __address); \ } \ __changed; \ diff --git a/arch/sparc/mm/init_32.c b/arch/sparc/mm/init_32.c index 9c0ea457bdf055..d96a14ffceebf5 100644 --- a/arch/sparc/mm/init_32.c +++ b/arch/sparc/mm/init_32.c @@ -297,11 +297,20 @@ void sparc_flush_page_to_ram(struct page *page) { unsigned long vaddr = (unsigned long)page_address(page); - if (vaddr) - __flush_page_to_ram(vaddr); + __flush_page_to_ram(vaddr); } EXPORT_SYMBOL(sparc_flush_page_to_ram); +void sparc_flush_folio_to_ram(struct folio *folio) +{ + unsigned long vaddr = (unsigned long)folio_address(folio); + unsigned int i, nr = folio_nr_pages(folio); + + for (i = 0; i < nr; i++) + __flush_page_to_ram(vaddr + i * PAGE_SIZE); +} +EXPORT_SYMBOL(sparc_flush_folio_to_ram); + static const pgprot_t protection_map[16] = { [VM_NONE] = PAGE_NONE, [VM_READ] = PAGE_READONLY, From 1a10a44dfc1d55ba84987da1f8377258a044499c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox 
(Oracle)" Date: Wed, 2 Aug 2023 16:13:54 +0100 Subject: [PATCH 455/489] sparc64: implement the new page table range API Add set_ptes(), update_mmu_cache_range(), flush_dcache_folio() and flush_icache_pages(). Convert the PG_dcache_dirty flag from being per-page to per-folio. Link: https://lkml.kernel.org/r/20230802151406.3735276-27-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Cc: "David S. Miller" Signed-off-by: Andrew Morton --- arch/sparc/include/asm/cacheflush_64.h | 18 ++++-- arch/sparc/include/asm/pgtable_64.h | 29 +++++++--- arch/sparc/kernel/smp_64.c | 56 +++++++++++------- arch/sparc/mm/init_64.c | 78 +++++++++++++++----------- arch/sparc/mm/tlb.c | 5 +- 5 files changed, 119 insertions(+), 67 deletions(-) diff --git a/arch/sparc/include/asm/cacheflush_64.h b/arch/sparc/include/asm/cacheflush_64.h index b9341836597ec5..a9a719f04d06b1 100644 --- a/arch/sparc/include/asm/cacheflush_64.h +++ b/arch/sparc/include/asm/cacheflush_64.h @@ -35,20 +35,26 @@ void flush_icache_range(unsigned long start, unsigned long end); void __flush_icache_page(unsigned long); void __flush_dcache_page(void *addr, int flush_icache); -void flush_dcache_page_impl(struct page *page); +void flush_dcache_folio_impl(struct folio *folio); #ifdef CONFIG_SMP -void smp_flush_dcache_page_impl(struct page *page, int cpu); -void flush_dcache_page_all(struct mm_struct *mm, struct page *page); +void smp_flush_dcache_folio_impl(struct folio *folio, int cpu); +void flush_dcache_folio_all(struct mm_struct *mm, struct folio *folio); #else -#define smp_flush_dcache_page_impl(page,cpu) flush_dcache_page_impl(page) -#define flush_dcache_page_all(mm,page) flush_dcache_page_impl(page) +#define smp_flush_dcache_folio_impl(folio, cpu) flush_dcache_folio_impl(folio) +#define flush_dcache_folio_all(mm, folio) flush_dcache_folio_impl(folio) #endif void __flush_dcache_range(unsigned long start, unsigned long end); #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 -void 
flush_dcache_page(struct page *page); +void flush_dcache_folio(struct folio *folio); +#define flush_dcache_folio flush_dcache_folio +static inline void flush_dcache_page(struct page *page) +{ + flush_dcache_folio(page_folio(page)); +} #define flush_icache_page(vma, pg) do { } while(0) +#define flush_icache_pages(vma, pg, nr) do { } while(0) void flush_ptrace_access(struct vm_area_struct *, struct page *, unsigned long uaddr, void *kaddr, diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 5563efa1a19f94..09aa37cc44690c 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -86,6 +86,7 @@ extern unsigned long VMALLOC_END; #define vmemmap ((struct page *)VMEMMAP_BASE) #include +#include bool kern_addr_valid(unsigned long addr); @@ -927,8 +928,21 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, maybe_tlb_batch_add(mm, addr, ptep, orig, fullmm, PAGE_SHIFT); } -#define set_pte_at(mm,addr,ptep,pte) \ - __set_pte_at((mm), (addr), (ptep), (pte), 0) +static inline void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr) +{ + arch_enter_lazy_mmu_mode(); + for (;;) { + __set_pte_at(mm, addr, ptep, pte, 0); + if (--nr == 0) + break; + ptep++; + pte_val(pte) += PAGE_SIZE; + addr += PAGE_SIZE; + } + arch_leave_lazy_mmu_mode(); +} +#define set_ptes set_ptes #define pte_clear(mm,addr,ptep) \ set_pte_at((mm), (addr), (ptep), __pte(0UL)) @@ -947,8 +961,8 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, \ if (pfn_valid(this_pfn) && \ (((old_addr) ^ (new_addr)) & (1 << 13))) \ - flush_dcache_page_all(current->mm, \ - pfn_to_page(this_pfn)); \ + flush_dcache_folio_all(current->mm, \ + page_folio(pfn_to_page(this_pfn))); \ } \ newpte; \ }) @@ -963,7 +977,10 @@ struct seq_file; void mmu_info(struct seq_file *); struct vm_area_struct; -void update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t *); +void 
update_mmu_cache_range(struct vm_fault *, struct vm_area_struct *, + unsigned long addr, pte_t *ptep, unsigned int nr); +#define update_mmu_cache(vma, addr, ptep) \ + update_mmu_cache_range(NULL, vma, addr, ptep, 1) #ifdef CONFIG_TRANSPARENT_HUGEPAGE void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd); @@ -1121,8 +1138,6 @@ static inline bool pte_access_permitted(pte_t pte, bool write) } #define pte_access_permitted pte_access_permitted -#include - /* We provide our own get_unmapped_area to cope with VA holes and * SHM area cache aliasing for userland. */ diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index e5964d1d8b37dc..f3969a3600dbfe 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -921,20 +921,26 @@ extern unsigned long xcall_flush_dcache_page_cheetah; #endif extern unsigned long xcall_flush_dcache_page_spitfire; -static inline void __local_flush_dcache_page(struct page *page) +static inline void __local_flush_dcache_folio(struct folio *folio) { + unsigned int i, nr = folio_nr_pages(folio); + #ifdef DCACHE_ALIASING_POSSIBLE - __flush_dcache_page(page_address(page), + for (i = 0; i < nr; i++) + __flush_dcache_page(folio_address(folio) + i * PAGE_SIZE, ((tlb_type == spitfire) && - page_mapping_file(page) != NULL)); + folio_flush_mapping(folio) != NULL)); #else - if (page_mapping_file(page) != NULL && - tlb_type == spitfire) - __flush_icache_page(__pa(page_address(page))); + if (folio_flush_mapping(folio) != NULL && + tlb_type == spitfire) { + unsigned long pfn = folio_pfn(folio) + for (i = 0; i < nr; i++) + __flush_icache_page((pfn + i) * PAGE_SIZE); + } #endif } -void smp_flush_dcache_page_impl(struct page *page, int cpu) +void smp_flush_dcache_folio_impl(struct folio *folio, int cpu) { int this_cpu; @@ -948,14 +954,14 @@ void smp_flush_dcache_page_impl(struct page *page, int cpu) this_cpu = get_cpu(); if (cpu == this_cpu) { - __local_flush_dcache_page(page); + 
__local_flush_dcache_folio(folio); } else if (cpu_online(cpu)) { - void *pg_addr = page_address(page); + void *pg_addr = folio_address(folio); u64 data0 = 0; if (tlb_type == spitfire) { data0 = ((u64)&xcall_flush_dcache_page_spitfire); - if (page_mapping_file(page) != NULL) + if (folio_flush_mapping(folio) != NULL) data0 |= ((u64)1 << 32); } else if (tlb_type == cheetah || tlb_type == cheetah_plus) { #ifdef DCACHE_ALIASING_POSSIBLE @@ -963,18 +969,23 @@ void smp_flush_dcache_page_impl(struct page *page, int cpu) #endif } if (data0) { - xcall_deliver(data0, __pa(pg_addr), - (u64) pg_addr, cpumask_of(cpu)); + unsigned int i, nr = folio_nr_pages(folio); + + for (i = 0; i < nr; i++) { + xcall_deliver(data0, __pa(pg_addr), + (u64) pg_addr, cpumask_of(cpu)); #ifdef CONFIG_DEBUG_DCFLUSH - atomic_inc(&dcpage_flushes_xcall); + atomic_inc(&dcpage_flushes_xcall); #endif + pg_addr += PAGE_SIZE; + } } } put_cpu(); } -void flush_dcache_page_all(struct mm_struct *mm, struct page *page) +void flush_dcache_folio_all(struct mm_struct *mm, struct folio *folio) { void *pg_addr; u64 data0; @@ -988,10 +999,10 @@ void flush_dcache_page_all(struct mm_struct *mm, struct page *page) atomic_inc(&dcpage_flushes); #endif data0 = 0; - pg_addr = page_address(page); + pg_addr = folio_address(folio); if (tlb_type == spitfire) { data0 = ((u64)&xcall_flush_dcache_page_spitfire); - if (page_mapping_file(page) != NULL) + if (folio_flush_mapping(folio) != NULL) data0 |= ((u64)1 << 32); } else if (tlb_type == cheetah || tlb_type == cheetah_plus) { #ifdef DCACHE_ALIASING_POSSIBLE @@ -999,13 +1010,18 @@ void flush_dcache_page_all(struct mm_struct *mm, struct page *page) #endif } if (data0) { - xcall_deliver(data0, __pa(pg_addr), - (u64) pg_addr, cpu_online_mask); + unsigned int i, nr = folio_nr_pages(folio); + + for (i = 0; i < nr; i++) { + xcall_deliver(data0, __pa(pg_addr), + (u64) pg_addr, cpu_online_mask); #ifdef CONFIG_DEBUG_DCFLUSH - atomic_inc(&dcpage_flushes_xcall); + 
atomic_inc(&dcpage_flushes_xcall); #endif + pg_addr += PAGE_SIZE; + } } - __local_flush_dcache_page(page); + __local_flush_dcache_folio(folio); preempt_enable(); } diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 9a63a3e08e40c1..f83017992eaaeb 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -195,21 +195,26 @@ atomic_t dcpage_flushes_xcall = ATOMIC_INIT(0); #endif #endif -inline void flush_dcache_page_impl(struct page *page) +inline void flush_dcache_folio_impl(struct folio *folio) { + unsigned int i, nr = folio_nr_pages(folio); + BUG_ON(tlb_type == hypervisor); #ifdef CONFIG_DEBUG_DCFLUSH atomic_inc(&dcpage_flushes); #endif #ifdef DCACHE_ALIASING_POSSIBLE - __flush_dcache_page(page_address(page), - ((tlb_type == spitfire) && - page_mapping_file(page) != NULL)); + for (i = 0; i < nr; i++) + __flush_dcache_page(folio_address(folio) + i * PAGE_SIZE, + ((tlb_type == spitfire) && + folio_flush_mapping(folio) != NULL)); #else - if (page_mapping_file(page) != NULL && - tlb_type == spitfire) - __flush_icache_page(__pa(page_address(page))); + if (folio_flush_mapping(folio) != NULL && + tlb_type == spitfire) { + for (i = 0; i < nr; i++) + __flush_icache_page((pfn + i) * PAGE_SIZE); + } #endif } @@ -218,10 +223,10 @@ inline void flush_dcache_page_impl(struct page *page) #define PG_dcache_cpu_mask \ ((1UL<flags >> PG_dcache_cpu_shift) & PG_dcache_cpu_mask) +#define dcache_dirty_cpu(folio) \ + (((folio)->flags >> PG_dcache_cpu_shift) & PG_dcache_cpu_mask) -static inline void set_dcache_dirty(struct page *page, int this_cpu) +static inline void set_dcache_dirty(struct folio *folio, int this_cpu) { unsigned long mask = this_cpu; unsigned long non_cpu_bits; @@ -238,11 +243,11 @@ static inline void set_dcache_dirty(struct page *page, int this_cpu) "bne,pn %%xcc, 1b\n\t" " nop" : /* no outputs */ - : "r" (mask), "r" (non_cpu_bits), "r" (&page->flags) + : "r" (mask), "r" (non_cpu_bits), "r" (&folio->flags) : "g1", "g7"); } -static inline void 
clear_dcache_dirty_cpu(struct page *page, unsigned long cpu) +static inline void clear_dcache_dirty_cpu(struct folio *folio, unsigned long cpu) { unsigned long mask = (1UL << PG_dcache_dirty); @@ -260,7 +265,7 @@ static inline void clear_dcache_dirty_cpu(struct page *page, unsigned long cpu) " nop\n" "2:" : /* no outputs */ - : "r" (cpu), "r" (mask), "r" (&page->flags), + : "r" (cpu), "r" (mask), "r" (&folio->flags), "i" (PG_dcache_cpu_mask), "i" (PG_dcache_cpu_shift) : "g1", "g7"); @@ -284,9 +289,10 @@ static void flush_dcache(unsigned long pfn) page = pfn_to_page(pfn); if (page) { + struct folio *folio = page_folio(page); unsigned long pg_flags; - pg_flags = page->flags; + pg_flags = folio->flags; if (pg_flags & (1UL << PG_dcache_dirty)) { int cpu = ((pg_flags >> PG_dcache_cpu_shift) & PG_dcache_cpu_mask); @@ -296,11 +302,11 @@ static void flush_dcache(unsigned long pfn) * in the SMP case. */ if (cpu == this_cpu) - flush_dcache_page_impl(page); + flush_dcache_folio_impl(folio); else - smp_flush_dcache_page_impl(page, cpu); + smp_flush_dcache_folio_impl(folio, cpu); - clear_dcache_dirty_cpu(page, cpu); + clear_dcache_dirty_cpu(folio, cpu); put_cpu(); } @@ -388,12 +394,14 @@ bool __init arch_hugetlb_valid_size(unsigned long size) } #endif /* CONFIG_HUGETLB_PAGE */ -void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) +void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, unsigned int nr) { struct mm_struct *mm; unsigned long flags; bool is_huge_tsb; pte_t pte = *ptep; + unsigned int i; if (tlb_type != hypervisor) { unsigned long pfn = pte_pfn(pte); @@ -440,15 +448,21 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t * } } #endif - if (!is_huge_tsb) - __update_mmu_tsb_insert(mm, MM_TSB_BASE, PAGE_SHIFT, - address, pte_val(pte)); + if (!is_huge_tsb) { + for (i = 0; i < nr; i++) { + __update_mmu_tsb_insert(mm, MM_TSB_BASE, PAGE_SHIFT, + 
address, pte_val(pte)); + address += PAGE_SIZE; + pte_val(pte) += PAGE_SIZE; + } + } spin_unlock_irqrestore(&mm->context.lock, flags); } -void flush_dcache_page(struct page *page) +void flush_dcache_folio(struct folio *folio) { + unsigned long pfn = folio_pfn(folio); struct address_space *mapping; int this_cpu; @@ -459,35 +473,35 @@ void flush_dcache_page(struct page *page) * is merely the zero page. The 'bigcore' testcase in GDB * causes this case to run millions of times. */ - if (page == ZERO_PAGE(0)) + if (is_zero_pfn(pfn)) return; this_cpu = get_cpu(); - mapping = page_mapping_file(page); + mapping = folio_flush_mapping(folio); if (mapping && !mapping_mapped(mapping)) { - int dirty = test_bit(PG_dcache_dirty, &page->flags); + bool dirty = test_bit(PG_dcache_dirty, &folio->flags); if (dirty) { - int dirty_cpu = dcache_dirty_cpu(page); + int dirty_cpu = dcache_dirty_cpu(folio); if (dirty_cpu == this_cpu) goto out; - smp_flush_dcache_page_impl(page, dirty_cpu); + smp_flush_dcache_folio_impl(folio, dirty_cpu); } - set_dcache_dirty(page, this_cpu); + set_dcache_dirty(folio, this_cpu); } else { /* We could delay the flush for the !page_mapping * case too. But that case is for exec env/arg * pages and those are %99 certainly going to get * faulted into the tlb (and thus flushed) anyways. */ - flush_dcache_page_impl(page); + flush_dcache_folio_impl(folio); } out: put_cpu(); } -EXPORT_SYMBOL(flush_dcache_page); +EXPORT_SYMBOL(flush_dcache_folio); void __kprobes flush_icache_range(unsigned long start, unsigned long end) { @@ -2280,10 +2294,10 @@ void __init paging_init(void) setup_page_offset(); /* These build time checkes make sure that the dcache_dirty_cpu() - * page->flags usage will work. + * folio->flags usage will work. * * When a page gets marked as dcache-dirty, we store the - * cpu number starting at bit 32 in the page->flags. Also, + * cpu number starting at bit 32 in the folio->flags. 
Also, * functions like clear_dcache_dirty_cpu use the cpu mask * in 13-bit signed-immediate instruction fields. */ diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c index 7ecf8556947abc..0d41c94ec3ac06 100644 --- a/arch/sparc/mm/tlb.c +++ b/arch/sparc/mm/tlb.c @@ -118,6 +118,7 @@ void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, unsigned long paddr, pfn = pte_pfn(orig); struct address_space *mapping; struct page *page; + struct folio *folio; if (!pfn_valid(pfn)) goto no_cache_flush; @@ -127,13 +128,13 @@ void tlb_batch_add(struct mm_struct *mm, unsigned long vaddr, goto no_cache_flush; /* A real file page? */ - mapping = page_mapping_file(page); + mapping = folio_flush_mapping(folio); if (!mapping) goto no_cache_flush; paddr = (unsigned long) page_address(page); if ((paddr ^ vaddr) & (1 << 13)) - flush_dcache_page_all(mm, page); + flush_dcache_folio_all(mm, folio); } no_cache_flush: From fd8132e6e9fdecb9ff7d1db98014d372e03f3c9d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:55 +0100 Subject: [PATCH 456/489] um: implement the new page table range API Add PFN_PTE_SHIFT and update_mmu_cache_range(). 
Link: https://lkml.kernel.org/r/20230802151406.3735276-28-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Richard Weinberger Cc: Anton Ivanov Cc: Johannes Berg Signed-off-by: Andrew Morton --- arch/um/include/asm/pgtable.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h index a70d1618eb35f2..44f6c76167d989 100644 --- a/arch/um/include/asm/pgtable.h +++ b/arch/um/include/asm/pgtable.h @@ -242,11 +242,7 @@ static inline void set_pte(pte_t *pteptr, pte_t pteval) if(pte_present(*pteptr)) *pteptr = pte_mknewprot(*pteptr); } -static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *pteptr, pte_t pteval) -{ - set_pte(pteptr, pteval); -} +#define PFN_PTE_SHIFT PAGE_SHIFT #define __HAVE_ARCH_PTE_SAME static inline int pte_same(pte_t pte_a, pte_t pte_b) @@ -290,6 +286,7 @@ struct mm_struct; extern pte_t *virt_to_pte(struct mm_struct *mm, unsigned long addr); #define update_mmu_cache(vma,address,ptep) do {} while (0) +#define update_mmu_cache_range(vmf, vma, address, ptep, nr) do {} while (0) /* * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that From a3e1c9372c9b95945aa58f318990cf3e4bf2881d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:56 +0100 Subject: [PATCH 457/489] x86: implement the new page table range API Add PFN_PTE_SHIFT and a noop update_mmu_cache_range(). Link: https://lkml.kernel.org/r/20230802151406.3735276-29-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. 
Peter Anvin" Signed-off-by: Andrew Morton --- arch/x86/include/asm/pgtable.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index cd0b6337d03cea..dbf8af70b7c2e1 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -185,6 +185,8 @@ static inline int pte_special(pte_t pte) static inline u64 protnone_mask(u64 val); +#define PFN_PTE_SHIFT PAGE_SHIFT + static inline unsigned long pte_pfn(pte_t pte) { phys_addr_t pfn = pte_val(pte); @@ -1020,13 +1022,6 @@ static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp) return res; } -static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) -{ - page_table_check_ptes_set(mm, ptep, pte, 1); - set_pte(ptep, pte); -} - static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { @@ -1292,6 +1287,11 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { } +static inline void update_mmu_cache_range(struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ +} static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd) { From 4fbb7e7f47dbc631a9f5bad3171ccbca171ed1d3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:57 +0100 Subject: [PATCH 458/489] xtensa: implement the new page table range API Add PFN_PTE_SHIFT, update_mmu_cache_range(), flush_dcache_folio() and flush_icache_pages(). 
Link: https://lkml.kernel.org/r/20230802151406.3735276-30-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Max Filippov Signed-off-by: Andrew Morton --- arch/xtensa/include/asm/cacheflush.h | 9 ++- arch/xtensa/include/asm/pgtable.h | 18 +++--- arch/xtensa/mm/cache.c | 83 ++++++++++++++++------------ 3 files changed, 63 insertions(+), 47 deletions(-) diff --git a/arch/xtensa/include/asm/cacheflush.h b/arch/xtensa/include/asm/cacheflush.h index 7b4359312c2576..35153f6725e4fa 100644 --- a/arch/xtensa/include/asm/cacheflush.h +++ b/arch/xtensa/include/asm/cacheflush.h @@ -119,8 +119,14 @@ void flush_cache_page(struct vm_area_struct*, #define flush_cache_vmap(start,end) flush_cache_all() #define flush_cache_vunmap(start,end) flush_cache_all() +void flush_dcache_folio(struct folio *folio); +#define flush_dcache_folio flush_dcache_folio + #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 -void flush_dcache_page(struct page *); +static inline void flush_dcache_page(struct page *page) +{ + flush_dcache_folio(page_folio(page)); +} void local_flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); @@ -156,6 +162,7 @@ void local_flush_cache_page(struct vm_area_struct *vma, /* This is not required, see Documentation/core-api/cachetlb.rst */ #define flush_icache_page(vma,page) do { } while (0) +#define flush_icache_pages(vma, page, nr) do { } while (0) #define flush_dcache_mmap_lock(mapping) do { } while (0) #define flush_dcache_mmap_unlock(mapping) do { } while (0) diff --git a/arch/xtensa/include/asm/pgtable.h b/arch/xtensa/include/asm/pgtable.h index fc7a14884c6c3b..ef79cb6c20dc33 100644 --- a/arch/xtensa/include/asm/pgtable.h +++ b/arch/xtensa/include/asm/pgtable.h @@ -274,6 +274,7 @@ static inline pte_t pte_mkwrite(pte_t pte) * and a page entry and page directory to the page they refer to. 
*/ +#define PFN_PTE_SHIFT PAGE_SHIFT #define pte_pfn(pte) (pte_val(pte) >> PAGE_SHIFT) #define pte_same(a,b) (pte_val(a) == pte_val(b)) #define pte_page(x) pfn_to_page(pte_pfn(x)) @@ -301,15 +302,9 @@ static inline void update_pte(pte_t *ptep, pte_t pteval) struct mm_struct; -static inline void -set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) -{ - update_pte(ptep, pteval); -} - -static inline void set_pte(pte_t *ptep, pte_t pteval) +static inline void set_pte(pte_t *ptep, pte_t pte) { - update_pte(ptep, pteval); + update_pte(ptep, pte); } static inline void @@ -407,8 +402,11 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) #else -extern void update_mmu_cache(struct vm_area_struct * vma, - unsigned long address, pte_t *ptep); +struct vm_fault; +void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, unsigned int nr); +#define update_mmu_cache(vma, address, ptep) \ + update_mmu_cache_range(NULL, vma, address, ptep, 1) typedef pte_t *pte_addr_t; diff --git a/arch/xtensa/mm/cache.c b/arch/xtensa/mm/cache.c index 19e5a478a7e8b0..7ec66a79f4723c 100644 --- a/arch/xtensa/mm/cache.c +++ b/arch/xtensa/mm/cache.c @@ -121,9 +121,9 @@ EXPORT_SYMBOL(copy_user_highpage); * */ -void flush_dcache_page(struct page *page) +void flush_dcache_folio(struct folio *folio) { - struct address_space *mapping = page_mapping_file(page); + struct address_space *mapping = folio_flush_mapping(folio); /* * If we have a mapping but the page is not mapped to user-space @@ -132,14 +132,14 @@ void flush_dcache_page(struct page *page) */ if (mapping && !mapping_mapped(mapping)) { - if (!test_bit(PG_arch_1, &page->flags)) - set_bit(PG_arch_1, &page->flags); + if (!test_bit(PG_arch_1, &folio->flags)) + set_bit(PG_arch_1, &folio->flags); return; } else { - - unsigned long phys = page_to_phys(page); - unsigned long temp = page->index << PAGE_SHIFT; + unsigned long phys = folio_pfn(folio) * PAGE_SIZE; + 
unsigned long temp = folio_pos(folio); + unsigned int i, nr = folio_nr_pages(folio); unsigned long alias = !(DCACHE_ALIAS_EQ(temp, phys)); unsigned long virt; @@ -154,22 +154,26 @@ void flush_dcache_page(struct page *page) return; preempt_disable(); - virt = TLBTEMP_BASE_1 + (phys & DCACHE_ALIAS_MASK); - __flush_invalidate_dcache_page_alias(virt, phys); + for (i = 0; i < nr; i++) { + virt = TLBTEMP_BASE_1 + (phys & DCACHE_ALIAS_MASK); + __flush_invalidate_dcache_page_alias(virt, phys); - virt = TLBTEMP_BASE_1 + (temp & DCACHE_ALIAS_MASK); + virt = TLBTEMP_BASE_1 + (temp & DCACHE_ALIAS_MASK); - if (alias) - __flush_invalidate_dcache_page_alias(virt, phys); + if (alias) + __flush_invalidate_dcache_page_alias(virt, phys); - if (mapping) - __invalidate_icache_page_alias(virt, phys); + if (mapping) + __invalidate_icache_page_alias(virt, phys); + phys += PAGE_SIZE; + temp += PAGE_SIZE; + } preempt_enable(); } /* There shouldn't be an entry in the cache for this page anymore. */ } -EXPORT_SYMBOL(flush_dcache_page); +EXPORT_SYMBOL(flush_dcache_folio); /* * For now, flush the whole cache. FIXME?? 
@@ -207,45 +211,52 @@ EXPORT_SYMBOL(local_flush_cache_page); #endif /* DCACHE_WAY_SIZE > PAGE_SIZE */ -void -update_mmu_cache(struct vm_area_struct * vma, unsigned long addr, pte_t *ptep) +void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, unsigned int nr) { unsigned long pfn = pte_pfn(*ptep); - struct page *page; + struct folio *folio; + unsigned int i; if (!pfn_valid(pfn)) return; - page = pfn_to_page(pfn); + folio = page_folio(pfn_to_page(pfn)); - /* Invalidate old entry in TLBs */ - - flush_tlb_page(vma, addr); + /* Invalidate old entries in TLBs */ + for (i = 0; i < nr; i++) + flush_tlb_page(vma, addr + i * PAGE_SIZE); + nr = folio_nr_pages(folio); #if (DCACHE_WAY_SIZE > PAGE_SIZE) - if (!PageReserved(page) && test_bit(PG_arch_1, &page->flags)) { - unsigned long phys = page_to_phys(page); + if (!folio_test_reserved(folio) && test_bit(PG_arch_1, &folio->flags)) { + unsigned long phys = folio_pfn(folio) * PAGE_SIZE; unsigned long tmp; preempt_disable(); - tmp = TLBTEMP_BASE_1 + (phys & DCACHE_ALIAS_MASK); - __flush_invalidate_dcache_page_alias(tmp, phys); - tmp = TLBTEMP_BASE_1 + (addr & DCACHE_ALIAS_MASK); - __flush_invalidate_dcache_page_alias(tmp, phys); - __invalidate_icache_page_alias(tmp, phys); + for (i = 0; i < nr; i++) { + tmp = TLBTEMP_BASE_1 + (phys & DCACHE_ALIAS_MASK); + __flush_invalidate_dcache_page_alias(tmp, phys); + tmp = TLBTEMP_BASE_1 + (addr & DCACHE_ALIAS_MASK); + __flush_invalidate_dcache_page_alias(tmp, phys); + __invalidate_icache_page_alias(tmp, phys); + phys += PAGE_SIZE; + } preempt_enable(); - clear_bit(PG_arch_1, &page->flags); + clear_bit(PG_arch_1, &folio->flags); } #else - if (!PageReserved(page) && !test_bit(PG_arch_1, &page->flags) + if (!folio_test_reserved(folio) && !test_bit(PG_arch_1, &folio->flags) && (vma->vm_flags & VM_EXEC) != 0) { - unsigned long paddr = (unsigned long)kmap_atomic(page); - __flush_dcache_page(paddr); - __invalidate_icache_page(paddr); - 
set_bit(PG_arch_1, &page->flags); - kunmap_atomic((void *)paddr); + for (i = 0; i < nr; i++) { + void *paddr = kmap_local_folio(folio, i * PAGE_SIZE); + __flush_dcache_page((unsigned long)paddr); + __invalidate_icache_page((unsigned long)paddr); + kunmap_local(paddr); + } + set_bit(PG_arch_1, &folio->flags); } #endif } From 29269ad90bed55a9ece007a7ece53f4505df3781 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:58 +0100 Subject: [PATCH 459/489] mm: remove page_mapping_file() This function has no more users. Link: https://lkml.kernel.org/r/20230802151406.3735276-31-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 88d161887cf22d..b5c4c8beefe20f 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -414,14 +414,6 @@ static inline struct address_space *page_file_mapping(struct page *page) return folio_file_mapping(page_folio(page)); } -/* - * For file cache pages, return the address_space, otherwise return NULL - */ -static inline struct address_space *page_mapping_file(struct page *page) -{ - return folio_flush_mapping(page_folio(page)); -} - /** * folio_inode - Get the host inode for this folio. * @folio: The folio. From 203b7b6aad6769a43987deb81c35456de8bb16c7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:13:59 +0100 Subject: [PATCH 460/489] mm: rationalise flush_icache_pages() and flush_icache_page() Move the default (no-op) implementation of flush_icache_pages() to <linux/cacheflush.h> from <asm-generic/cacheflush.h>. Remove the flush_icache_page() wrapper from each architecture into <linux/cacheflush.h>.
Link: https://lkml.kernel.org/r/20230802151406.3735276-32-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- arch/alpha/include/asm/cacheflush.h | 5 +---- arch/arc/include/asm/cacheflush.h | 9 --------- arch/arm/include/asm/cacheflush.h | 7 ------- arch/csky/abiv1/inc/abi/cacheflush.h | 1 - arch/csky/abiv2/inc/abi/cacheflush.h | 1 - arch/hexagon/include/asm/cacheflush.h | 2 +- arch/loongarch/include/asm/cacheflush.h | 2 -- arch/m68k/include/asm/cacheflush_mm.h | 1 - arch/mips/include/asm/cacheflush.h | 6 ------ arch/nios2/include/asm/cacheflush.h | 2 +- arch/parisc/include/asm/cacheflush.h | 2 +- arch/sh/include/asm/cacheflush.h | 2 +- arch/sparc/include/asm/cacheflush_32.h | 2 -- arch/sparc/include/asm/cacheflush_64.h | 3 --- arch/xtensa/include/asm/cacheflush.h | 4 ---- include/asm-generic/cacheflush.h | 12 ------------ include/linux/cacheflush.h | 9 +++++++++ 17 files changed, 14 insertions(+), 56 deletions(-) diff --git a/arch/alpha/include/asm/cacheflush.h b/arch/alpha/include/asm/cacheflush.h index 3956460e69e277..36a7e924c3b98a 100644 --- a/arch/alpha/include/asm/cacheflush.h +++ b/arch/alpha/include/asm/cacheflush.h @@ -53,10 +53,6 @@ extern void flush_icache_user_page(struct vm_area_struct *vma, #define flush_icache_user_page flush_icache_user_page #endif /* CONFIG_SMP */ -/* This is used only in __do_fault and do_swap_page. */ -#define flush_icache_page(vma, page) \ - flush_icache_user_page((vma), (page), 0, 0) - /* * Both implementations of flush_icache_user_page flush the entire * address space, so one call, no matter how many pages. 
@@ -66,6 +62,7 @@ static inline void flush_icache_pages(struct vm_area_struct *vma, { flush_icache_user_page(vma, page, 0, 0); } +#define flush_icache_pages flush_icache_pages #include diff --git a/arch/arc/include/asm/cacheflush.h b/arch/arc/include/asm/cacheflush.h index 04f65f58851070..bd5b1a9a054402 100644 --- a/arch/arc/include/asm/cacheflush.h +++ b/arch/arc/include/asm/cacheflush.h @@ -18,15 +18,6 @@ #include #include -/* - * Semantically we need this because icache doesn't snoop dcache/dma. - * However ARC Cache flush requires paddr as well as vaddr, latter not available - * in the flush_icache_page() API. So we no-op it but do the equivalent work - * in update_mmu_cache() - */ -#define flush_icache_page(vma, page) -#define flush_icache_pages(vma, page, nr) - void flush_cache_all(void); void flush_icache_range(unsigned long kstart, unsigned long kend); diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index 841e268d23747c..f6181f69577fe5 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -321,13 +321,6 @@ static inline void flush_anon_page(struct vm_area_struct *vma, #define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages) #define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) -/* - * We don't appear to need to do anything here. In fact, if we did, we'd - * duplicate cache flushing elsewhere performed by flush_dcache_page(). - */ -#define flush_icache_page(vma,page) do { } while (0) -#define flush_icache_pages(vma, page, nr) do { } while (0) - /* * flush_cache_vmap() is used when creating mappings (eg, via vmap, * vmalloc, ioremap etc) in kernel space for pages. 
On non-VIPT diff --git a/arch/csky/abiv1/inc/abi/cacheflush.h b/arch/csky/abiv1/inc/abi/cacheflush.h index 0d6cb65624c43b..908d8b0bc4fdc6 100644 --- a/arch/csky/abiv1/inc/abi/cacheflush.h +++ b/arch/csky/abiv1/inc/abi/cacheflush.h @@ -45,7 +45,6 @@ extern void flush_cache_range(struct vm_area_struct *vma, unsigned long start, u #define flush_cache_vmap(start, end) cache_wbinv_all() #define flush_cache_vunmap(start, end) cache_wbinv_all() -#define flush_icache_page(vma, page) do {} while (0); #define flush_icache_range(start, end) cache_wbinv_range(start, end) #define flush_icache_mm_range(mm, start, end) cache_wbinv_range(start, end) #define flush_icache_deferred(mm) do {} while (0); diff --git a/arch/csky/abiv2/inc/abi/cacheflush.h b/arch/csky/abiv2/inc/abi/cacheflush.h index 9c728933a7764d..40be16907267d6 100644 --- a/arch/csky/abiv2/inc/abi/cacheflush.h +++ b/arch/csky/abiv2/inc/abi/cacheflush.h @@ -33,7 +33,6 @@ static inline void flush_dcache_page(struct page *page) #define flush_dcache_mmap_lock(mapping) do { } while (0) #define flush_dcache_mmap_unlock(mapping) do { } while (0) -#define flush_icache_page(vma, page) do { } while (0) #define flush_icache_range(start, end) cache_wbinv_range(start, end) diff --git a/arch/hexagon/include/asm/cacheflush.h b/arch/hexagon/include/asm/cacheflush.h index dc3f500a5a01bd..bfff514a81c85e 100644 --- a/arch/hexagon/include/asm/cacheflush.h +++ b/arch/hexagon/include/asm/cacheflush.h @@ -18,7 +18,7 @@ * - flush_cache_range(vma, start, end) flushes a range of pages * - flush_icache_range(start, end) flush a range of instructions * - flush_dcache_page(pg) flushes(wback&invalidates) a page for dcache - * - flush_icache_page(vma, pg) flushes(invalidates) a page for icache + * - flush_icache_pages(vma, pg, nr) flushes(invalidates) nr pages for icache * * Need to doublecheck which one is really needed for ptrace stuff to work. 
*/ diff --git a/arch/loongarch/include/asm/cacheflush.h b/arch/loongarch/include/asm/cacheflush.h index 88a44da50a3b17..80bd74106985a9 100644 --- a/arch/loongarch/include/asm/cacheflush.h +++ b/arch/loongarch/include/asm/cacheflush.h @@ -46,8 +46,6 @@ void local_flush_icache_range(unsigned long start, unsigned long end); #define flush_cache_page(vma, vmaddr, pfn) do { } while (0) #define flush_cache_vmap(start, end) do { } while (0) #define flush_cache_vunmap(start, end) do { } while (0) -#define flush_icache_page(vma, page) do { } while (0) -#define flush_icache_pages(vma, page) do { } while (0) #define flush_icache_user_page(vma, page, addr, len) do { } while (0) #define flush_dcache_page(page) do { } while (0) #define flush_dcache_mmap_lock(mapping) do { } while (0) diff --git a/arch/m68k/include/asm/cacheflush_mm.h b/arch/m68k/include/asm/cacheflush_mm.h index 88eb85e81ef681..ed12358c4783b4 100644 --- a/arch/m68k/include/asm/cacheflush_mm.h +++ b/arch/m68k/include/asm/cacheflush_mm.h @@ -261,7 +261,6 @@ static inline void __flush_pages_to_ram(void *vaddr, unsigned int nr) #define flush_dcache_mmap_unlock(mapping) do { } while (0) #define flush_icache_pages(vma, page, nr) \ __flush_pages_to_ram(page_address(page), nr) -#define flush_icache_page(vma, page) flush_icache_pages(vma, page, 1) extern void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len); diff --git a/arch/mips/include/asm/cacheflush.h b/arch/mips/include/asm/cacheflush.h index 0f389bc7cb903a..f36c2519ed9768 100644 --- a/arch/mips/include/asm/cacheflush.h +++ b/arch/mips/include/asm/cacheflush.h @@ -82,12 +82,6 @@ static inline void flush_anon_page(struct vm_area_struct *vma, __flush_anon_page(page, vmaddr); } -static inline void flush_icache_pages(struct vm_area_struct *vma, - struct page *page, unsigned int nr) -{ -} -#define flush_icache_page(vma, page) flush_icache_pages(vma, page, 1) - extern void (*flush_icache_range)(unsigned long start, unsigned 
long end); extern void (*local_flush_icache_range)(unsigned long start, unsigned long end); extern void (*__flush_icache_user_range)(unsigned long start, diff --git a/arch/nios2/include/asm/cacheflush.h b/arch/nios2/include/asm/cacheflush.h index 8624ca83cffe1b..7c48c5213fb7a3 100644 --- a/arch/nios2/include/asm/cacheflush.h +++ b/arch/nios2/include/asm/cacheflush.h @@ -35,7 +35,7 @@ void flush_dcache_folio(struct folio *folio); extern void flush_icache_range(unsigned long start, unsigned long end); void flush_icache_pages(struct vm_area_struct *vma, struct page *page, unsigned int nr); -#define flush_icache_page(vma, page) flush_icache_pages(vma, page, 1); +#define flush_icache_pages flush_icache_pages #define flush_cache_vmap(start, end) flush_dcache_range(start, end) #define flush_cache_vunmap(start, end) flush_dcache_range(start, end) diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index b77c3e0c37d381..b4006f2a97052d 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -60,7 +60,7 @@ static inline void flush_dcache_page(struct page *page) void flush_icache_pages(struct vm_area_struct *vma, struct page *page, unsigned int nr); -#define flush_icache_page(vma, page) flush_icache_pages(vma, page, 1) +#define flush_icache_pages flush_icache_pages #define flush_icache_range(s,e) do { \ flush_kernel_dcache_range_asm(s,e); \ diff --git a/arch/sh/include/asm/cacheflush.h b/arch/sh/include/asm/cacheflush.h index 9fceef6f3e002b..878b6b551bd2d0 100644 --- a/arch/sh/include/asm/cacheflush.h +++ b/arch/sh/include/asm/cacheflush.h @@ -53,7 +53,7 @@ extern void flush_icache_range(unsigned long start, unsigned long end); #define flush_icache_user_range flush_icache_range void flush_icache_pages(struct vm_area_struct *vma, struct page *page, unsigned int nr); -#define flush_icache_page(vma, page) flush_icache_pages(vma, page, 1) +#define flush_icache_pages flush_icache_pages extern void 
flush_cache_sigtramp(unsigned long address); struct flusher_data { diff --git a/arch/sparc/include/asm/cacheflush_32.h b/arch/sparc/include/asm/cacheflush_32.h index c8dd971f0e881b..f3b7270bf71b26 100644 --- a/arch/sparc/include/asm/cacheflush_32.h +++ b/arch/sparc/include/asm/cacheflush_32.h @@ -16,8 +16,6 @@ #define flush_cache_page(vma,addr,pfn) \ sparc32_cachetlb_ops->cache_page(vma, addr) #define flush_icache_range(start, end) do { } while (0) -#define flush_icache_page(vma, pg) do { } while (0) -#define flush_icache_pages(vma, pg, nr) do { } while (0) #define copy_to_user_page(vma, page, vaddr, dst, src, len) \ do { \ diff --git a/arch/sparc/include/asm/cacheflush_64.h b/arch/sparc/include/asm/cacheflush_64.h index a9a719f04d06b1..0e879004efff16 100644 --- a/arch/sparc/include/asm/cacheflush_64.h +++ b/arch/sparc/include/asm/cacheflush_64.h @@ -53,9 +53,6 @@ static inline void flush_dcache_page(struct page *page) flush_dcache_folio(page_folio(page)); } -#define flush_icache_page(vma, pg) do { } while(0) -#define flush_icache_pages(vma, pg, nr) do { } while(0) - void flush_ptrace_access(struct vm_area_struct *, struct page *, unsigned long uaddr, void *kaddr, unsigned long len, int write); diff --git a/arch/xtensa/include/asm/cacheflush.h b/arch/xtensa/include/asm/cacheflush.h index 35153f6725e4fa..785a00ce83c11e 100644 --- a/arch/xtensa/include/asm/cacheflush.h +++ b/arch/xtensa/include/asm/cacheflush.h @@ -160,10 +160,6 @@ void local_flush_cache_page(struct vm_area_struct *vma, __invalidate_icache_range(start,(end) - (start)); \ } while (0) -/* This is not required, see Documentation/core-api/cachetlb.rst */ -#define flush_icache_page(vma,page) do { } while (0) -#define flush_icache_pages(vma, page, nr) do { } while (0) - #define flush_dcache_mmap_lock(mapping) do { } while (0) #define flush_dcache_mmap_unlock(mapping) do { } while (0) diff --git a/include/asm-generic/cacheflush.h b/include/asm-generic/cacheflush.h index 09d51a68076595..84ec53ccc45029 100644 
--- a/include/asm-generic/cacheflush.h +++ b/include/asm-generic/cacheflush.h @@ -77,18 +77,6 @@ static inline void flush_icache_range(unsigned long start, unsigned long end) #define flush_icache_user_range flush_icache_range #endif -#ifndef flush_icache_page -static inline void flush_icache_pages(struct vm_area_struct *vma, - struct page *page, unsigned int nr) -{ -} - -static inline void flush_icache_page(struct vm_area_struct *vma, - struct page *page) -{ -} -#endif - #ifndef flush_icache_user_page static inline void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, diff --git a/include/linux/cacheflush.h b/include/linux/cacheflush.h index 82136f3fcf54a1..55f297b2c23f49 100644 --- a/include/linux/cacheflush.h +++ b/include/linux/cacheflush.h @@ -17,4 +17,13 @@ static inline void flush_dcache_folio(struct folio *folio) #define flush_dcache_folio flush_dcache_folio #endif /* ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE */ +#ifndef flush_icache_pages +static inline void flush_icache_pages(struct vm_area_struct *vma, + struct page *page, unsigned int nr) +{ +} +#endif + +#define flush_icache_page(vma, page) flush_icache_pages(vma, page, 1) + #endif /* _LINUX_CACHEFLUSH_H */ From af4fcb0729329cc4b3f6977d7f75562a00174bd1 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:14:00 +0100 Subject: [PATCH 461/489] mm: tidy up set_ptes definition Now that all architectures are converted, we can remove the PFN_PTE_SHIFT ifdef and we can define set_pte_at() unconditionally. 
Link: https://lkml.kernel.org/r/20230802151406.3735276-33-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 81c3f7decb1c64..fc811c9b421ab8 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -204,7 +204,6 @@ static inline int pmd_young(pmd_t pmd) #endif #ifndef set_ptes -#ifdef PFN_PTE_SHIFT /** * set_ptes - Map consecutive pages to a contiguous range of addresses. * @mm: Address space to map the pages into. @@ -234,13 +233,8 @@ static inline void set_ptes(struct mm_struct *mm, unsigned long addr, } arch_leave_lazy_mmu_mode(); } -#ifndef set_pte_at -#define set_pte_at(mm, addr, ptep, pte) set_ptes(mm, addr, ptep, pte, 1) -#endif #endif -#else #define set_pte_at(mm, addr, ptep, pte) set_ptes(mm, addr, ptep, pte, 1) -#endif #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS extern int ptep_set_access_flags(struct vm_area_struct *vma, From 9f1f5b60e76d44fa85fef6970b7477f72d3999eb Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:14:01 +0100 Subject: [PATCH 462/489] mm: use flush_icache_pages() in do_set_pmd() Push the iteration over each page down to the architectures (many can flush the entire THP without iteration). 
Link: https://lkml.kernel.org/r/20230802151406.3735276-34-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton --- mm/memory.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 2c6f45d18b73f5..fbb7f066bfb6d1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4265,7 +4265,6 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) bool write = vmf->flags & FAULT_FLAG_WRITE; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; pmd_t entry; - int i; vm_fault_t ret = VM_FAULT_FALLBACK; if (!transhuge_vma_suitable(vma, haddr)) @@ -4298,8 +4297,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) if (unlikely(!pmd_none(*vmf->pmd))) goto out; - for (i = 0; i < HPAGE_PMD_NR; i++) - flush_icache_page(vma, page + i); + flush_icache_pages(vma, page, HPAGE_PMD_NR); entry = mk_huge_pmd(page, vma->vm_page_prot); if (write) From de74976eb65151a2f568e477fc2e0032df5b22b4 Mon Sep 17 00:00:00 2001 From: Yin Fengwei Date: Wed, 2 Aug 2023 16:14:02 +0100 Subject: [PATCH 463/489] filemap: add filemap_map_folio_range() filemap_map_folio_range() maps a partial or full folio. Compared to the original filemap_map_pages(), it updates the refcount once per folio instead of once per page, which gives a minor performance improvement for large folios. With a will-it-scale.page_fault3-like app (with file write fault testing changed to read fault testing; it is being upstreamed to will-it-scale at [1]), we got a 2% performance gain on a 48C/96T Cascade Lake test box with 96 processes running against xfs.
[1]: https://github.com/antonblanchard/will-it-scale/pull/37 Link: https://lkml.kernel.org/r/20230802151406.3735276-35-willy@infradead.org Signed-off-by: Yin Fengwei Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/filemap.c | 109 ++++++++++++++++++++++++++------------------------- 1 file changed, 55 insertions(+), 54 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 8040545954bc41..bdc1e0b811bf0b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2168,16 +2168,6 @@ unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start, } EXPORT_SYMBOL(filemap_get_folios); -static inline -bool folio_more_pages(struct folio *folio, pgoff_t index, pgoff_t max) -{ - if (!folio_test_large(folio) || folio_test_hugetlb(folio)) - return false; - if (index >= max) - return false; - return index < folio_next_index(folio) - 1; -} - /** * filemap_get_folios_contig - Get a batch of contiguous folios * @mapping: The address_space to search @@ -3436,10 +3426,10 @@ static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio, return false; } -static struct folio *next_uptodate_page(struct folio *folio, - struct address_space *mapping, - struct xa_state *xas, pgoff_t end_pgoff) +static struct folio *next_uptodate_folio(struct xa_state *xas, + struct address_space *mapping, pgoff_t end_pgoff) { + struct folio *folio = xas_next_entry(xas, end_pgoff); unsigned long max_idx; do { @@ -3477,20 +3467,51 @@ static struct folio *next_uptodate_page(struct folio *folio, return NULL; } -static inline struct folio *first_map_page(struct address_space *mapping, - struct xa_state *xas, - pgoff_t end_pgoff) +/* + * Map page range [start_page, start_page + nr_pages) of folio. 
+ * start_page is gotten from start by folio_page(folio, start) + */ +static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, + struct folio *folio, unsigned long start, + unsigned long addr, unsigned int nr_pages) { - return next_uptodate_page(xas_find(xas, end_pgoff), - mapping, xas, end_pgoff); -} + vm_fault_t ret = 0; + struct vm_area_struct *vma = vmf->vma; + struct file *file = vma->vm_file; + struct page *page = folio_page(folio, start); + unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss); + unsigned int ref_count = 0, count = 0; -static inline struct folio *next_map_page(struct address_space *mapping, - struct xa_state *xas, - pgoff_t end_pgoff) -{ - return next_uptodate_page(xas_next_entry(xas, end_pgoff), - mapping, xas, end_pgoff); + do { + if (PageHWPoison(page)) + continue; + + if (mmap_miss > 0) + mmap_miss--; + + /* + * NOTE: If there're PTE markers, we'll leave them to be + * handled in the specific fault path, and it'll prohibit the + * fault-around logic. 
+ */ + if (!pte_none(*vmf->pte)) + continue; + + if (vmf->address == addr) + ret = VM_FAULT_NOPAGE; + + ref_count++; + do_set_pte(vmf, page, addr); + update_mmu_cache(vma, addr, vmf->pte); + } while (vmf->pte++, page++, addr += PAGE_SIZE, ++count < nr_pages); + + /* Restore the vmf->pte */ + vmf->pte -= nr_pages; + + folio_ref_add(folio, ref_count); + WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss); + + return ret; } vm_fault_t filemap_map_pages(struct vm_fault *vmf, @@ -3503,12 +3524,11 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, unsigned long addr; XA_STATE(xas, &mapping->i_pages, start_pgoff); struct folio *folio; - struct page *page; - unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss); vm_fault_t ret = 0; + int nr_pages = 0; rcu_read_lock(); - folio = first_map_page(mapping, &xas, end_pgoff); + folio = next_uptodate_folio(&xas, mapping, end_pgoff); if (!folio) goto out; @@ -3525,17 +3545,13 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, goto out; } do { -again: - page = folio_file_page(folio, xas.xa_index); - if (PageHWPoison(page)) - goto unlock; - - if (mmap_miss > 0) - mmap_miss--; + unsigned long end; addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT; vmf->pte += xas.xa_index - last_pgoff; last_pgoff = xas.xa_index; + end = folio->index + folio_nr_pages(folio) - 1; + nr_pages = min(end, end_pgoff) - xas.xa_index + 1; /* * NOTE: If there're PTE markers, we'll leave them to be @@ -3545,32 +3561,17 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf, if (!pte_none(ptep_get(vmf->pte))) goto unlock; - /* We're about to handle the fault */ - if (vmf->address == addr) - ret = VM_FAULT_NOPAGE; + ret |= filemap_map_folio_range(vmf, folio, + xas.xa_index - folio->index, addr, nr_pages); - do_set_pte(vmf, page, addr); - /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, addr, vmf->pte); - if (folio_more_pages(folio, xas.xa_index, end_pgoff)) { - xas.xa_index++; - folio_ref_inc(folio); - goto again; - } - 
folio_unlock(folio); - continue; unlock: - if (folio_more_pages(folio, xas.xa_index, end_pgoff)) { - xas.xa_index++; - goto again; - } folio_unlock(folio); folio_put(folio); - } while ((folio = next_map_page(mapping, &xas, end_pgoff)) != NULL); + folio = next_uptodate_folio(&xas, mapping, end_pgoff); + } while (folio); pte_unmap_unlock(vmf->pte, vmf->ptl); out: rcu_read_unlock(); - WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss); return ret; } EXPORT_SYMBOL(filemap_map_pages); From 86f35f69db8e7d169c36472a349507ab0a461f49 Mon Sep 17 00:00:00 2001 From: Yin Fengwei Date: Wed, 2 Aug 2023 16:14:03 +0100 Subject: [PATCH 464/489] rmap: add folio_add_file_rmap_range() folio_add_file_rmap_range() allows to add pte mapping to a specific range of file folio. Comparing to page_add_file_rmap(), it batched updates __lruvec_stat for large folio. Link: https://lkml.kernel.org/r/20230802151406.3735276-36-willy@infradead.org Signed-off-by: Yin Fengwei Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/rmap.h | 2 ++ mm/rmap.c | 60 +++++++++++++++++++++++++++++++++----------- 2 files changed, 48 insertions(+), 14 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index b87d0166041273..a3825ce81102cf 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -198,6 +198,8 @@ void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *, unsigned long address); void page_add_file_rmap(struct page *, struct vm_area_struct *, bool compound); +void folio_add_file_rmap_range(struct folio *, struct page *, unsigned int nr, + struct vm_area_struct *, bool compound); void page_remove_rmap(struct page *, struct vm_area_struct *, bool compound); diff --git a/mm/rmap.c b/mm/rmap.c index 51ec8aa5e61f2d..1f04debdc87a03 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1294,31 +1294,39 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma, } /** - * page_add_file_rmap - add pte mapping to a file page - * @page: the 
page to add the mapping to + * folio_add_file_rmap_range - add pte mapping to page range of a folio + * @folio: The folio to add the mapping to + * @page: The first page to add + * @nr_pages: The number of pages which will be mapped * @vma: the vm area in which the mapping is added * @compound: charge the page as compound or small page * + * The page range of folio is defined by [first_page, first_page + nr_pages) + * * The caller needs to hold the pte lock. */ -void page_add_file_rmap(struct page *page, struct vm_area_struct *vma, - bool compound) +void folio_add_file_rmap_range(struct folio *folio, struct page *page, + unsigned int nr_pages, struct vm_area_struct *vma, + bool compound) { - struct folio *folio = page_folio(page); atomic_t *mapped = &folio->_nr_pages_mapped; - int nr = 0, nr_pmdmapped = 0; - bool first; + unsigned int nr_pmdmapped = 0, first; + int nr = 0; - VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page); + VM_WARN_ON_FOLIO(compound && !folio_test_pmd_mappable(folio), folio); /* Is page being mapped by PTE? Is this its first map to be added? 
*/ if (likely(!compound)) { - first = atomic_inc_and_test(&page->_mapcount); - nr = first; - if (first && folio_test_large(folio)) { - nr = atomic_inc_return_relaxed(mapped); - nr = (nr < COMPOUND_MAPPED); - } + do { + first = atomic_inc_and_test(&page->_mapcount); + if (first && folio_test_large(folio)) { + first = atomic_inc_return_relaxed(mapped); + first = (first < COMPOUND_MAPPED); + } + + if (first) + nr++; + } while (page++, --nr_pages > 0); } else if (folio_test_pmd_mappable(folio)) { /* That test is redundant: it's for safety or to optimize out */ @@ -1347,6 +1355,30 @@ void page_add_file_rmap(struct page *page, struct vm_area_struct *vma, mlock_vma_folio(folio, vma, compound); } +/** + * page_add_file_rmap - add pte mapping to a file page + * @page: the page to add the mapping to + * @vma: the vm area in which the mapping is added + * @compound: charge the page as compound or small page + * + * The caller needs to hold the pte lock. + */ +void page_add_file_rmap(struct page *page, struct vm_area_struct *vma, + bool compound) +{ + struct folio *folio = page_folio(page); + unsigned int nr_pages; + + VM_WARN_ON_ONCE_PAGE(compound && !PageTransHuge(page), page); + + if (likely(!compound)) + nr_pages = 1; + else + nr_pages = folio_nr_pages(folio); + + folio_add_file_rmap_range(folio, page, nr_pages, vma, compound); +} + /** * page_remove_rmap - take down pte mapping from a page * @page: page to remove mapping from From 3bd786f76de2e01745f462844fd1a206052ee8b8 Mon Sep 17 00:00:00 2001 From: Yin Fengwei Date: Wed, 2 Aug 2023 16:14:04 +0100 Subject: [PATCH 465/489] mm: convert do_set_pte() to set_pte_range() set_pte_range() allows to setup page table entries for a specific range. It takes advantage of batched rmap update for large folio. It now takes care of calling update_mmu_cache_range(). 
Link: https://lkml.kernel.org/r/20230802151406.3735276-37-willy@infradead.org Signed-off-by: Yin Fengwei Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- Documentation/filesystems/locking.rst | 2 +- include/linux/mm.h | 3 ++- mm/filemap.c | 3 +-- mm/memory.c | 37 +++++++++++++++++---------- 4 files changed, 28 insertions(+), 17 deletions(-) diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index ed148919e11ad5..211a0305399223 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -661,7 +661,7 @@ locked. The VM will unlock the page. Filesystem should find and map pages associated with offsets from "start_pgoff" till "end_pgoff". ->map_pages() is called with the RCU lock held and must not block. If it's not possible to reach a page without blocking, -filesystem should skip it. Filesystem should use do_set_pte() to setup +filesystem should skip it. Filesystem should use set_pte_range() to setup page table entry. Pointer to entry associated with the page is passed in "pte" field in vm_fault structure. Pointers to entries for other offsets should be calculated relative to "pte". 
diff --git a/include/linux/mm.h b/include/linux/mm.h index c1db400e83cb0f..ddb95967ba64c1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1322,7 +1322,8 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) } vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page); -void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr); +void set_pte_range(struct vm_fault *vmf, struct folio *folio, + struct page *page, unsigned int nr, unsigned long addr); vm_fault_t finish_fault(struct vm_fault *vmf); vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); diff --git a/mm/filemap.c b/mm/filemap.c index bdc1e0b811bf0b..c06e9d33141661 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3501,8 +3501,7 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, ret = VM_FAULT_NOPAGE; ref_count++; - do_set_pte(vmf, page, addr); - update_mmu_cache(vma, addr, vmf->pte); + set_pte_range(vmf, folio, page, 1, addr); } while (vmf->pte++, page++, addr += PAGE_SIZE, ++count < nr_pages); /* Restore the vmf->pte */ diff --git a/mm/memory.c b/mm/memory.c index fbb7f066bfb6d1..12b385eaf3534e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4330,15 +4330,24 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) } #endif -void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) +/** + * set_pte_range - Set a range of PTEs to point to pages in a folio. + * @vmf: Fault decription. + * @folio: The folio that contains @page. + * @page: The first page to create a PTE for. + * @nr: The number of PTEs to create. + * @addr: The first address to create a PTE for. 
+ */ +void set_pte_range(struct vm_fault *vmf, struct folio *folio, + struct page *page, unsigned int nr, unsigned long addr) { struct vm_area_struct *vma = vmf->vma; bool uffd_wp = vmf_orig_pte_uffd_wp(vmf); bool write = vmf->flags & FAULT_FLAG_WRITE; - bool prefault = vmf->address != addr; + bool prefault = in_range(vmf->address, addr, nr * PAGE_SIZE); pte_t entry; - flush_icache_page(vma, page); + flush_icache_pages(vma, page, nr); entry = mk_pte(page, vma->vm_page_prot); if (prefault && arch_wants_old_prefaulted_pte()) @@ -4352,14 +4361,18 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) entry = pte_mkuffd_wp(entry); /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { - inc_mm_counter(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, addr); - lru_cache_add_inactive_or_unevictable(page, vma); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr); + VM_BUG_ON_FOLIO(nr != 1, folio); + folio_add_new_anon_rmap(folio, vma, addr); + folio_add_lru_vma(folio, vma); } else { - inc_mm_counter(vma->vm_mm, mm_counter_file(page)); - page_add_file_rmap(page, vma, false); + add_mm_counter(vma->vm_mm, mm_counter_file(page), nr); + folio_add_file_rmap_range(folio, page, nr, vma, false); } - set_pte_at(vma->vm_mm, addr, vmf->pte, entry); + set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr); + + /* no need to invalidate: a not-present page won't be cached */ + update_mmu_cache_range(vmf, vma, addr, vmf->pte, nr); } static bool vmf_pte_changed(struct vm_fault *vmf) @@ -4427,11 +4440,9 @@ vm_fault_t finish_fault(struct vm_fault *vmf) /* Re-check under ptl */ if (likely(!vmf_pte_changed(vmf))) { - do_set_pte(vmf, page, vmf->address); - - /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, vmf->address, vmf->pte); + struct folio *folio = page_folio(page); + set_pte_range(vmf, folio, page, 1, vmf->address); ret = 0; } else { update_mmu_tlb(vma, vmf->address, vmf->pte); From 
617c28ecab22d98a3809370eb6cb50fa24b7bfe1 Mon Sep 17 00:00:00 2001 From: Yin Fengwei Date: Wed, 2 Aug 2023 16:14:05 +0100 Subject: [PATCH 466/489] filemap: batch PTE mappings Call set_pte_range() once per contiguous range of the folio instead of once per page. This batches the updates to mm counters and the rmap. With a will-it-scale.page_fault3-like app (with file write fault testing changed to read fault testing; it is being upstreamed to will-it-scale at [1]), we got a 15% performance gain on a 48C/96T Cascade Lake test box with 96 processes running against xfs. Perf data collected before/after the change: 18.73%--page_add_file_rmap | --11.60%--__mod_lruvec_page_state | |--7.40%--__mod_memcg_lruvec_state | | | --5.58%--cgroup_rstat_updated | --2.53%--__mod_lruvec_state | --1.48%--__mod_node_page_state 9.93%--page_add_file_rmap_range | --2.67%--__mod_lruvec_page_state | |--1.95%--__mod_memcg_lruvec_state | | | --1.57%--cgroup_rstat_updated | --0.61%--__mod_lruvec_state | --0.54%--__mod_node_page_state The running time of __mod_lruvec_page_state() is reduced by about 9%.
[1]: https://github.com/antonblanchard/will-it-scale/pull/37 Link: https://lkml.kernel.org/r/20230802151406.3735276-38-willy@infradead.org Signed-off-by: Yin Fengwei Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/filemap.c | 43 +++++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index c06e9d33141661..014b73eb96a1d1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3480,11 +3480,12 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, struct file *file = vma->vm_file; struct page *page = folio_page(folio, start); unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss); - unsigned int ref_count = 0, count = 0; + unsigned int count = 0; + pte_t *old_ptep = vmf->pte; do { - if (PageHWPoison(page)) - continue; + if (PageHWPoison(page + count)) + goto skip; if (mmap_miss > 0) mmap_miss--; @@ -3494,20 +3495,34 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf, * handled in the specific fault path, and it'll prohibit the * fault-around logic. 
*/ - if (!pte_none(*vmf->pte)) - continue; - - if (vmf->address == addr) - ret = VM_FAULT_NOPAGE; + if (!pte_none(vmf->pte[count])) + goto skip; - ref_count++; - set_pte_range(vmf, folio, page, 1, addr); - } while (vmf->pte++, page++, addr += PAGE_SIZE, ++count < nr_pages); + count++; + continue; +skip: + if (count) { + set_pte_range(vmf, folio, page, count, addr); + folio_ref_add(folio, count); + if (in_range(vmf->address, addr, count)) + ret = VM_FAULT_NOPAGE; + } - /* Restore the vmf->pte */ - vmf->pte -= nr_pages; + count++; + page += count; + vmf->pte += count; + addr += count * PAGE_SIZE; + count = 0; + } while (--nr_pages > 0); + + if (count) { + set_pte_range(vmf, folio, page, count, addr); + folio_ref_add(folio, count); + if (in_range(vmf->address, addr, count)) + ret = VM_FAULT_NOPAGE; + } - folio_ref_add(folio, ref_count); + vmf->pte = old_ptep; WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss); return ret; From 5003a2bdf6880dc9c301f555bece1154081158fe Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Aug 2023 16:14:06 +0100 Subject: [PATCH 467/489] mm: call update_mmu_cache_range() in more page fault handling paths Pass the vm_fault to the architecture to help it make smarter decisions about which PTEs to insert into the TLB. 
Link: https://lkml.kernel.org/r/20230802151406.3735276-39-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/memory.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 12b385eaf3534e..9d7fb721a680a0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2862,7 +2862,7 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src, entry = pte_mkyoung(vmf->orig_pte); if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0)) - update_mmu_cache(vma, addr, vmf->pte); + update_mmu_cache_range(vmf, vma, addr, vmf->pte, 1); } /* @@ -3039,7 +3039,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf) entry = pte_mkyoung(vmf->orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1)) - update_mmu_cache(vma, vmf->address, vmf->pte); + update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); pte_unmap_unlock(vmf->pte, vmf->ptl); count_vm_event(PGREUSE); } @@ -3163,7 +3163,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) */ BUG_ON(unshare && pte_write(entry)); set_pte_at_notify(mm, vmf->address, vmf->pte, entry); - update_mmu_cache(vma, vmf->address, vmf->pte); + update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); if (old_folio) { /* * Only after switching the pte to the new page may @@ -4046,7 +4046,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) } /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, vmf->address, vmf->pte); + update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); unlock: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -4170,7 +4170,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, vmf->address, vmf->pte); + update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); 
unlock: if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -4859,7 +4859,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) if (writable) pte = pte_mkwrite(pte); ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte); - update_mmu_cache(vma, vmf->address, vmf->pte); + update_mmu_cache_range(vmf, vma, vmf->address, vmf->pte, 1); pte_unmap_unlock(vmf->pte, vmf->ptl); goto out; } @@ -5030,7 +5030,8 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) entry = pte_mkyoung(entry); if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry, vmf->flags & FAULT_FLAG_WRITE)) { - update_mmu_cache(vmf->vma, vmf->address, vmf->pte); + update_mmu_cache_range(vmf, vmf->vma, vmf->address, + vmf->pte, 1); } else { /* Skip spurious TLB flush for retried page fault */ if (vmf->flags & FAULT_FLAG_TRIED) From 00de2c9f26b15f1a6f2af516dd8ec5f8d28189b7 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 10 Aug 2023 09:32:41 +0000 Subject: [PATCH 468/489] arm64: mm: use ptep_clear() instead of pte_clear() in clear_flush() In clear_flush(), the original pte may be a present entry, so we should use ptep_clear() to let page_table_check track the pte clearing operation; otherwise it may cause a false positive in a subsequent set_pte_at().
Link: https://lkml.kernel.org/r/20230810093241.1181142-1-qi.zheng@linux.dev Fixes: 42b2547137f5 ("arm64/mm: enable ARCH_SUPPORTS_PAGE_TABLE_CHECK") Signed-off-by: Qi Zheng Acked-by: Will Deacon Cc: Catalin Marinas Cc: Kefeng Wang Cc: Muchun Song Cc: Pasha Tatashin Cc: Qi Zheng Signed-off-by: Andrew Morton --- arch/arm64/mm/hugetlbpage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 21716c9406821e..9c52718ea7509a 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -236,7 +236,7 @@ static void clear_flush(struct mm_struct *mm, unsigned long i, saddr = addr; for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) - pte_clear(mm, addr, ptep); + ptep_clear(mm, addr, ptep); flush_tlb_range(&vma, saddr, addr); } From 004a9a38e20da6eb88fbfb7eeff0f3dd8a01776a Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 15 Aug 2023 23:22:16 +0200 Subject: [PATCH 469/489] mm: userfaultfd: remove stale comment about core dump locking Since commit 7f3bfab52cab ("mm/gup: take mmap_lock in get_dump_page()"), which landed in v5.10, core dumping doesn't enter fault handling without holding the mmap_lock anymore. Remove the stale parts of the comments, but leave the behavior as-is - letting core dumping block on userfault handling would be a bad idea and could lead to deadlocks if the dumping process was handling its own userfaults. 
Link: https://lkml.kernel.org/r/20230815212216.264445-1-jannh@google.com Signed-off-by: Jann Horn Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 1091cb46174743..56eaae9dac1ab2 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -428,15 +428,11 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with * the no_page_table() helper in follow_page_mask(), but the * shmem_vm_ops->fault method is invoked even during - * coredumping without mmap_lock and it ends up here. + * coredumping and it ends up here. */ if (current->flags & (PF_EXITING|PF_DUMPCORE)) goto out; - /* - * Coredumping runs without mmap_lock so we can only check that - * the mmap_lock is held, if PF_DUMPCORE was not set. - */ assert_fault_locked(vmf); ctx = vma->vm_userfaultfd_ctx.ctx; From 7131fd7e30b20c3de03ca5d674546675dfffce5f Mon Sep 17 00:00:00 2001 From: Lucas Karpinski Date: Thu, 17 Aug 2023 15:57:48 -0400 Subject: [PATCH 470/489] selftests: cgroup: fix test_kmem_memcg_deletion kernel mem check Currently, not all kernel memory usage is being accounted for. This commit switches to using the kernel entry within memory.stat which already includes kernel_stack, pagetables, and slab. The kernel entry also includes vmalloc and other additional kernel memory use cases which were missing. 
Link: https://lkml.kernel.org/r/bvrhe2tpsts2azaroq4ubp2slawmop6orndsswrewuscw3ugvk@kmemmrttsnc7 Signed-off-by: Lucas Karpinski Acked-by: Shakeel Butt Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Shuah Khan Cc: Tejun Heo Cc: Zefan Li Signed-off-by: Andrew Morton --- tools/testing/selftests/cgroup/test_kmem.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/cgroup/test_kmem.c b/tools/testing/selftests/cgroup/test_kmem.c index ed2e50bb1e762f..c82f974b85c94d 100644 --- a/tools/testing/selftests/cgroup/test_kmem.c +++ b/tools/testing/selftests/cgroup/test_kmem.c @@ -162,11 +162,11 @@ static int cg_run_in_subcgroups(const char *parent, * allocates some slab memory (mostly negative dentries) using 2 * NR_CPUS * threads. Then it checks the sanity of numbers on the parent level: * the total size of the cgroups should be roughly equal to - * anon + file + slab + kernel_stack. + * anon + file + kernel + sock. 
*/ static int test_kmem_memcg_deletion(const char *root) { - long current, slab, anon, file, kernel_stack, pagetables, percpu, sock, sum; + long current, anon, file, kernel, sock, sum; int ret = KSFT_FAIL; char *parent; @@ -184,29 +184,22 @@ static int test_kmem_memcg_deletion(const char *root) goto cleanup; current = cg_read_long(parent, "memory.current"); - slab = cg_read_key_long(parent, "memory.stat", "slab "); anon = cg_read_key_long(parent, "memory.stat", "anon "); file = cg_read_key_long(parent, "memory.stat", "file "); - kernel_stack = cg_read_key_long(parent, "memory.stat", "kernel_stack "); - pagetables = cg_read_key_long(parent, "memory.stat", "pagetables "); - percpu = cg_read_key_long(parent, "memory.stat", "percpu "); + kernel = cg_read_key_long(parent, "memory.stat", "kernel "); sock = cg_read_key_long(parent, "memory.stat", "sock "); - if (current < 0 || slab < 0 || anon < 0 || file < 0 || - kernel_stack < 0 || pagetables < 0 || percpu < 0 || sock < 0) + if (current < 0 || anon < 0 || file < 0 || kernel < 0 || sock < 0) goto cleanup; - sum = slab + anon + file + kernel_stack + pagetables + percpu + sock; + sum = anon + file + kernel + sock; if (abs(sum - current) < MAX_VMSTAT_ERROR) { ret = KSFT_PASS; } else { printf("memory.current = %ld\n", current); - printf("slab + anon + file + kernel_stack = %ld\n", sum); - printf("slab = %ld\n", slab); + printf("anon + file + kernel + sock = %ld\n", sum); printf("anon = %ld\n", anon); printf("file = %ld\n", file); - printf("kernel_stack = %ld\n", kernel_stack); - printf("pagetables = %ld\n", pagetables); - printf("percpu = %ld\n", percpu); + printf("kernel = %ld\n", kernel); printf("sock = %ld\n", sock); } From bad5a3a42a31859705baf39c4fc92173b2716760 Mon Sep 17 00:00:00 2001 From: Anh Tuan Phan Date: Thu, 17 Aug 2023 23:00:33 +0700 Subject: [PATCH 471/489] selftests/mm: fix WARNING comparing pointer to 0 Remove comparing pointer to 0 to avoid this warning from coccinelle: 
./tools/testing/selftests/mm/map_populate.c:80:16-17: WARNING comparing pointer to 0, suggest !E ./tools/testing/selftests/mm/map_populate.c:80:16-17: WARNING comparing pointer to 0 Link: https://lkml.kernel.org/r/20230817160033.90079-1-tuananhlfc@gmail.com Signed-off-by: Anh Tuan Phan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/map_populate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/map_populate.c b/tools/testing/selftests/mm/map_populate.c index 240f2d9dae7a99..7945d07548751b 100644 --- a/tools/testing/selftests/mm/map_populate.c +++ b/tools/testing/selftests/mm/map_populate.c @@ -77,7 +77,7 @@ int main(int argc, char **argv) unsigned long *smap; ftmp = tmpfile(); - BUG_ON(ftmp == 0, "tmpfile()"); + BUG_ON(!ftmp, "tmpfile()"); ret = ftruncate(fileno(ftmp), MMAP_SZ); BUG_ON(ret, "ftruncate()"); From cfeed8ffe55b37fa10286aaaa1369da00cb88440 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 21 Aug 2023 18:08:46 +0200 Subject: [PATCH 472/489] mm/swap: stop using page->private on tail pages for THP_SWAP Patch series "mm/swap: stop using page->private on tail pages for THP_SWAP + cleanups". This series stops using page->private on tail pages for THP_SWAP, replaces folio->private by folio->swap for swapcache folios, and starts using "new_folio" for tail pages that we are splitting to remove the usage of page->private for swapcache handling completely. This patch (of 4): Let's stop using page->private on tail pages, making it possible to just unconditionally reuse that field in the tail pages of large folios. The remaining usage of the private field for THP_SWAP is in the THP splitting code (mm/huge_memory.c), that we'll handle separately later. Update the THP_SWAP documentation and sanity checks in mm_types.h and __split_huge_page_tail(). 
[david@redhat.com: stop using page->private on tail pages for THP_SWAP] Link: https://lkml.kernel.org/r/6f0a82a3-6948-20d9-580b-be1dbf415701@redhat.com Link: https://lkml.kernel.org/r/20230821160849.531668-1-david@redhat.com Link: https://lkml.kernel.org/r/20230821160849.531668-2-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Catalin Marinas [arm64] Reviewed-by: Yosry Ahmed Cc: Dan Streetman Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Seth Jennings Cc: Vitaly Wool Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/mm/mteswap.c | 5 +++-- include/linux/mm_types.h | 12 +----------- include/linux/swap.h | 9 +++++++++ mm/huge_memory.c | 15 ++++++--------- mm/memory.c | 2 +- mm/rmap.c | 2 +- mm/swap_state.c | 5 +++-- mm/swapfile.c | 4 ++-- 8 files changed, 26 insertions(+), 28 deletions(-) diff --git a/arch/arm64/mm/mteswap.c b/arch/arm64/mm/mteswap.c index cd508ba80ab1ba..a31833e3ddc544 100644 --- a/arch/arm64/mm/mteswap.c +++ b/arch/arm64/mm/mteswap.c @@ -33,8 +33,9 @@ int mte_save_tags(struct page *page) mte_save_page_tags(page_address(page), tag_storage); - /* page_private contains the swap entry.val set in do_swap_page */ - ret = xa_store(&mte_pages, page_private(page), tag_storage, GFP_KERNEL); + /* lookup the swap entry.val from the page */ + ret = xa_store(&mte_pages, page_swap_entry(page).val, tag_storage, + GFP_KERNEL); if (WARN(xa_is_err(ret), "Failed to store MTE tags")) { mte_free_tag_storage(tag_storage); return xa_err(ret); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 2b9d8be28361e6..55cd4bc57b8df3 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -322,11 +322,8 @@ struct folio { atomic_t _pincount; #ifdef CONFIG_64BIT unsigned int _folio_nr_pages; - /* 4 byte gap here */ - /* private: the union with struct page is transitional */ - /* Fix THP_SWAP to not use tail->private */ - unsigned long _private_1; #endif + /* private: the union with struct page is transitional */ 
}; struct page __page_1; }; @@ -347,9 +344,6 @@ struct folio { /* public: */ struct list_head _deferred_list; /* private: the union with struct page is transitional */ - unsigned long _avail_2a; - /* Fix THP_SWAP to not use tail->private */ - unsigned long _private_2a; }; struct page __page_2; }; @@ -374,9 +368,6 @@ FOLIO_MATCH(memcg_data, memcg_data); offsetof(struct page, pg) + sizeof(struct page)) FOLIO_MATCH(flags, _flags_1); FOLIO_MATCH(compound_head, _head_1); -#ifdef CONFIG_64BIT -FOLIO_MATCH(private, _private_1); -#endif #undef FOLIO_MATCH #define FOLIO_MATCH(pg, fl) \ static_assert(offsetof(struct folio, fl) == \ @@ -385,7 +376,6 @@ FOLIO_MATCH(flags, _flags_2); FOLIO_MATCH(compound_head, _head_2); FOLIO_MATCH(flags, _flags_2a); FOLIO_MATCH(compound_head, _head_2a); -FOLIO_MATCH(private, _private_2a); #undef FOLIO_MATCH /** diff --git a/include/linux/swap.h b/include/linux/swap.h index bb5adc6041448a..e5cf58a1cf9e62 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -339,6 +339,15 @@ static inline swp_entry_t folio_swap_entry(struct folio *folio) return entry; } +static inline swp_entry_t page_swap_entry(struct page *page) +{ + struct folio *folio = page_folio(page); + swp_entry_t entry = folio_swap_entry(folio); + + entry.val += folio_page_idx(folio, page); + return entry; +} + static inline void folio_set_swap_entry(struct folio *folio, swp_entry_t entry) { folio->private = (void *)entry.val; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index cb4432792b88b4..a28e9fe1658548 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2446,18 +2446,15 @@ static void __split_huge_page_tail(struct page *head, int tail, page_tail->index = head->index + tail; /* - * page->private should not be set in tail pages with the exception - * of swap cache pages that store the swp_entry_t in tail pages. - * Fix up and warn once if private is unexpectedly set. - * - * What of 32-bit systems, on which folio->_pincount overlays - * head[1].private? 
No problem: THP_SWAP is not enabled on 32-bit, and - * pincount must be 0 for folio_ref_freeze() to have succeeded. + * page->private should not be set in tail pages. Fix up and warn once + * if private is unexpectedly set. */ - if (!folio_test_swapcache(page_folio(head))) { - VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, page_tail); + if (unlikely(page_tail->private)) { + VM_WARN_ON_ONCE_PAGE(true, page_tail); page_tail->private = 0; } + if (PageSwapCache(head)) + set_page_private(page_tail, (unsigned long)head->private + tail); /* Page flags must be visible before we make the page non-compound. */ smp_wmb(); diff --git a/mm/memory.c b/mm/memory.c index 9d7fb721a680a0..d104a38e8545a7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3879,7 +3879,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * changed. */ if (unlikely(!folio_test_swapcache(folio) || - page_private(page) != entry.val)) + page_swap_entry(page).val != entry.val)) goto out_page; /* diff --git a/mm/rmap.c b/mm/rmap.c index 1f04debdc87a03..ec7f8e6c9e483a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1647,7 +1647,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, */ dec_mm_counter(mm, mm_counter(&folio->page)); } else if (folio_test_anon(folio)) { - swp_entry_t entry = { .val = page_private(subpage) }; + swp_entry_t entry = page_swap_entry(subpage); pte_t swp_pte; /* * Store the swap location in the pte. 
diff --git a/mm/swap_state.c b/mm/swap_state.c index 01f15139b7d9e5..2f24178100520b 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -100,6 +100,7 @@ int add_to_swap_cache(struct folio *folio, swp_entry_t entry, folio_ref_add(folio, nr); folio_set_swapcache(folio); + folio_set_swap_entry(folio, entry); do { xas_lock_irq(&xas); @@ -113,7 +114,6 @@ int add_to_swap_cache(struct folio *folio, swp_entry_t entry, if (shadowp) *shadowp = old; } - set_page_private(folio_page(folio, i), entry.val + i); xas_store(&xas, folio); xas_next(&xas); } @@ -154,9 +154,10 @@ void __delete_from_swap_cache(struct folio *folio, for (i = 0; i < nr; i++) { void *entry = xas_store(&xas, shadow); VM_BUG_ON_PAGE(entry != folio, entry); - set_page_private(folio_page(folio, i), 0); xas_next(&xas); } + entry.val = 0; + folio_set_swap_entry(folio, entry); folio_clear_swapcache(folio); address_space->nrpages -= nr; __node_stat_mod_folio(folio, NR_FILE_PAGES, -nr); diff --git a/mm/swapfile.c b/mm/swapfile.c index d46933adf789f5..bd9d904671b947 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3369,7 +3369,7 @@ struct swap_info_struct *swp_swap_info(swp_entry_t entry) struct swap_info_struct *page_swap_info(struct page *page) { - swp_entry_t entry = { .val = page_private(page) }; + swp_entry_t entry = page_swap_entry(page); return swp_swap_info(entry); } @@ -3384,7 +3384,7 @@ EXPORT_SYMBOL_GPL(swapcache_mapping); pgoff_t __page_file_index(struct page *page) { - swp_entry_t swap = { .val = page_private(page) }; + swp_entry_t swap = page_swap_entry(page); return swp_offset(swap); } EXPORT_SYMBOL_GPL(__page_file_index); From 85a1333417a7561c1d10a77d6c873a37e6ea63a0 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 21 Aug 2023 18:08:47 +0200 Subject: [PATCH 473/489] mm/swap: use dedicated entry for swap in folio Let's stop working on the private field and use an explicit swap field. We have to move the swp_entry_t typedef. 
Link: https://lkml.kernel.org/r/20230821160849.531668-3-david@redhat.com Signed-off-by: Matthew Wilcox Signed-off-by: David Hildenbrand Reviewed-by: Chris Li Cc: Catalin Marinas Cc: Dan Streetman Cc: Hugh Dickins Cc: Peter Xu Cc: Seth Jennings Cc: Vitaly Wool Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 23 +++++++++++++---------- include/linux/swap.h | 5 ++--- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 55cd4bc57b8df3..36c5b43999e608 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -248,6 +248,14 @@ static inline struct page *encoded_page_ptr(struct encoded_page *page) return (struct page *)(~ENCODE_PAGE_BITS & (unsigned long)page); } +/* + * A swap entry has to fit into a "unsigned long", as the entry is hidden + * in the "index" field of the swapper address space. + */ +typedef struct { + unsigned long val; +} swp_entry_t; + /** * struct folio - Represents a contiguous set of bytes. * @flags: Identical to the page flags. @@ -258,7 +266,7 @@ static inline struct page *encoded_page_ptr(struct encoded_page *page) * @index: Offset within the file, in units of pages. For anonymous memory, * this is the index from the beginning of the mmap. * @private: Filesystem per-folio data (see folio_attach_private()). - * Used for swp_entry_t if folio_test_swapcache(). + * @swap: Used for swp_entry_t if folio_test_swapcache(). * @_mapcount: Do not access this member directly. Use folio_mapcount() to * find out how many times this folio is mapped by userspace. * @_refcount: Do not access this member directly. 
Use folio_ref_count() @@ -301,7 +309,10 @@ struct folio { }; struct address_space *mapping; pgoff_t index; - void *private; + union { + void *private; + swp_entry_t swap; + }; atomic_t _mapcount; atomic_t _refcount; #ifdef CONFIG_MEMCG @@ -1209,14 +1220,6 @@ enum tlb_flush_reason { NR_TLB_FLUSH_REASONS, }; - /* - * A swap entry has to fit into a "unsigned long", as the entry is hidden - * in the "index" field of the swapper address space. - */ -typedef struct { - unsigned long val; -} swp_entry_t; - /** * enum fault_flag - Fault flag definitions. * @FAULT_FLAG_WRITE: Fault was a write fault. diff --git a/include/linux/swap.h b/include/linux/swap.h index e5cf58a1cf9e62..352eca0a75bc6a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -335,8 +335,7 @@ struct swap_info_struct { static inline swp_entry_t folio_swap_entry(struct folio *folio) { - swp_entry_t entry = { .val = page_private(&folio->page) }; - return entry; + return folio->swap; } static inline swp_entry_t page_swap_entry(struct page *page) @@ -350,7 +349,7 @@ static inline swp_entry_t page_swap_entry(struct page *page) static inline void folio_set_swap_entry(struct folio *folio, swp_entry_t entry) { - folio->private = (void *)entry.val; + folio->swap = entry; } /* linux/mm/workingset.c */ From 3d2c908768877714a354ee6d7bf93e801400d5e2 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 21 Aug 2023 18:08:48 +0200 Subject: [PATCH 474/489] mm/swap: inline folio_set_swap_entry() and folio_swap_entry() Let's simply work on the folio directly and remove the helpers. 
Link: https://lkml.kernel.org/r/20230821160849.531668-4-david@redhat.com Signed-off-by: David Hildenbrand Suggested-by: Matthew Wilcox Reviewed-by: Chris Li Cc: Catalin Marinas Cc: Dan Streetman Cc: Hugh Dickins Cc: Peter Xu Cc: Seth Jennings Cc: Vitaly Wool Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/swap.h | 12 +----------- mm/memory.c | 2 +- mm/shmem.c | 6 +++--- mm/swap_state.c | 7 +++---- mm/swapfile.c | 2 +- mm/util.c | 2 +- mm/vmscan.c | 2 +- mm/zswap.c | 4 ++-- 8 files changed, 13 insertions(+), 24 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 352eca0a75bc6a..493487ed7c388b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -333,25 +333,15 @@ struct swap_info_struct { */ }; -static inline swp_entry_t folio_swap_entry(struct folio *folio) -{ - return folio->swap; -} - static inline swp_entry_t page_swap_entry(struct page *page) { struct folio *folio = page_folio(page); - swp_entry_t entry = folio_swap_entry(folio); + swp_entry_t entry = folio->swap; entry.val += folio_page_idx(folio, page); return entry; } -static inline void folio_set_swap_entry(struct folio *folio, swp_entry_t entry) -{ - folio->swap = entry; -} - /* linux/mm/workingset.c */ bool workingset_test_recent(void *shadow, bool file, bool *workingset); void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages); diff --git a/mm/memory.c b/mm/memory.c index d104a38e8545a7..421fcef3a3e7fa 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3828,7 +3828,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) folio_add_lru(folio); /* To provide entry to swap_readpage() */ - folio_set_swap_entry(folio, entry); + folio->swap = entry; swap_readpage(page, true, NULL); folio->private = NULL; } diff --git a/mm/shmem.c b/mm/shmem.c index 99fb60ec2c3d53..980289be5f6351 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1642,7 +1642,7 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, int error; old = *foliop; - entry = 
folio_swap_entry(old); + entry = old->swap; swap_index = swp_offset(entry); swap_mapping = swap_address_space(entry); @@ -1663,7 +1663,7 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, __folio_set_locked(new); __folio_set_swapbacked(new); folio_mark_uptodate(new); - folio_set_swap_entry(new, entry); + new->swap = entry; folio_set_swapcache(new); /* @@ -1785,7 +1785,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, /* We have to do this with folio locked to prevent races */ folio_lock(folio); if (!folio_test_swapcache(folio) || - folio_swap_entry(folio).val != swap.val || + folio->swap.val != swap.val || !shmem_confirm_swap(mapping, index, swap)) { error = -EEXIST; goto unlock; diff --git a/mm/swap_state.c b/mm/swap_state.c index 2f24178100520b..b3b14bd0dd6447 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -100,7 +100,7 @@ int add_to_swap_cache(struct folio *folio, swp_entry_t entry, folio_ref_add(folio, nr); folio_set_swapcache(folio); - folio_set_swap_entry(folio, entry); + folio->swap = entry; do { xas_lock_irq(&xas); @@ -156,8 +156,7 @@ void __delete_from_swap_cache(struct folio *folio, VM_BUG_ON_PAGE(entry != folio, entry); xas_next(&xas); } - entry.val = 0; - folio_set_swap_entry(folio, entry); + folio->swap.val = 0; folio_clear_swapcache(folio); address_space->nrpages -= nr; __node_stat_mod_folio(folio, NR_FILE_PAGES, -nr); @@ -233,7 +232,7 @@ bool add_to_swap(struct folio *folio) */ void delete_from_swap_cache(struct folio *folio) { - swp_entry_t entry = folio_swap_entry(folio); + swp_entry_t entry = folio->swap; struct address_space *address_space = swap_address_space(entry); xa_lock_irq(&address_space->i_pages); diff --git a/mm/swapfile.c b/mm/swapfile.c index bd9d904671b947..e52f486834ebf7 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1536,7 +1536,7 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, static bool folio_swapped(struct folio *folio) { - swp_entry_t entry = 
folio_swap_entry(folio); + swp_entry_t entry = folio->swap; struct swap_info_struct *si = _swap_info_get(entry); if (!si) diff --git a/mm/util.c b/mm/util.c index cde229b05eb351..f31e2ca62cfae8 100644 --- a/mm/util.c +++ b/mm/util.c @@ -764,7 +764,7 @@ struct address_space *folio_mapping(struct folio *folio) return NULL; if (unlikely(folio_test_swapcache(folio))) - return swap_address_space(folio_swap_entry(folio)); + return swap_address_space(folio->swap); mapping = folio->mapping; if ((unsigned long)mapping & PAGE_MAPPING_FLAGS) diff --git a/mm/vmscan.c b/mm/vmscan.c index c7c149cb8d6628..6f13394b112eae 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1423,7 +1423,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, } if (folio_test_swapcache(folio)) { - swp_entry_t swap = folio_swap_entry(folio); + swp_entry_t swap = folio->swap; if (reclaimed && !mapping_exiting(mapping)) shadow = workingset_eviction(folio, target_memcg); diff --git a/mm/zswap.c b/mm/zswap.c index 7300b98d4a03bd..412b1409a0d78a 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1190,7 +1190,7 @@ static void zswap_fill_page(void *ptr, unsigned long value) bool zswap_store(struct folio *folio) { - swp_entry_t swp = folio_swap_entry(folio); + swp_entry_t swp = folio->swap; int type = swp_type(swp); pgoff_t offset = swp_offset(swp); struct page *page = &folio->page; @@ -1370,7 +1370,7 @@ bool zswap_store(struct folio *folio) bool zswap_load(struct folio *folio) { - swp_entry_t swp = folio_swap_entry(folio); + swp_entry_t swp = folio->swap; int type = swp_type(swp); pgoff_t offset = swp_offset(swp); struct page *page = &folio->page; From 07e09c483cbef2a252f75d95670755a0607288f5 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 21 Aug 2023 18:08:49 +0200 Subject: [PATCH 475/489] mm/huge_memory: work on folio->swap instead of page->private when splitting folio Let's work on folio->swap instead. 
While at it, use folio_test_anon() and folio_test_swapcache() -- the original folio remains valid even after splitting (but is then an order-0 folio). We can probably convert a lot more to folios in that code, let's focus on folio->swap handling only for now. Link: https://lkml.kernel.org/r/20230821160849.531668-5-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Chris Li Cc: Catalin Marinas Cc: Dan Streetman Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Peter Xu Cc: Seth Jennings Cc: Vitaly Wool Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/huge_memory.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a28e9fe1658548..c4635f750255b1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2401,10 +2401,16 @@ static void lru_add_page_tail(struct page *head, struct page *tail, } } -static void __split_huge_page_tail(struct page *head, int tail, +static void __split_huge_page_tail(struct folio *folio, int tail, struct lruvec *lruvec, struct list_head *list) { + struct page *head = &folio->page; struct page *page_tail = head + tail; + /* + * Careful: new_folio is not a "real" folio before we cleared PageTail. + * Don't pass it around before clear_compound_head(). + */ + struct folio *new_folio = (struct folio *)page_tail; VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail); @@ -2453,8 +2459,8 @@ static void __split_huge_page_tail(struct page *head, int tail, VM_WARN_ON_ONCE_PAGE(true, page_tail); page_tail->private = 0; } - if (PageSwapCache(head)) - set_page_private(page_tail, (unsigned long)head->private + tail); + if (folio_test_swapcache(folio)) + new_folio->swap.val = folio->swap.val + tail; /* Page flags must be visible before we make the page non-compound. 
*/ smp_wmb(); @@ -2500,11 +2506,9 @@ static void __split_huge_page(struct page *page, struct list_head *list, /* complete memcg works before add pages to LRU */ split_page_memcg(head, nr); - if (PageAnon(head) && PageSwapCache(head)) { - swp_entry_t entry = { .val = page_private(head) }; - - offset = swp_offset(entry); - swap_cache = swap_address_space(entry); + if (folio_test_anon(folio) && folio_test_swapcache(folio)) { + offset = swp_offset(folio->swap); + swap_cache = swap_address_space(folio->swap); xa_lock(&swap_cache->i_pages); } @@ -2514,7 +2518,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, ClearPageHasHWPoisoned(head); for (i = nr - 1; i >= 1; i--) { - __split_huge_page_tail(head, i, lruvec, list); + __split_huge_page_tail(folio, i, lruvec, list); /* Some pages can be beyond EOF: drop them from page cache */ if (head[i].index >= end) { struct folio *tail = page_folio(head + i); @@ -2559,11 +2563,8 @@ static void __split_huge_page(struct page *page, struct list_head *list, remap_page(folio, nr); - if (PageSwapCache(head)) { - swp_entry_t entry = { .val = page_private(head) }; - - split_swap_cluster(entry); - } + if (folio_test_swapcache(folio)) + split_swap_cluster(folio->swap); for (i = 0; i < nr; i++) { struct page *subpage = head + i; From 14a405c3a933ce261147a60dfb8bf586b45ec9de Mon Sep 17 00:00:00 2001 From: Lu Jialin Date: Sat, 19 Aug 2023 08:13:02 +0000 Subject: [PATCH 476/489] memcg: remove duplication detection for mem_cgroup_uncharge_swap __mem_cgroup_uncharge_swap() is only called from mem_cgroup_uncharge_swap(); if the memory cgroup is disabled, __mem_cgroup_uncharge_swap() cannot be called. Therefore, there is no need to check again whether the memory cgroup is disabled.
Link: https://lkml.kernel.org/r/20230819081302.1217098-1-lujialin4@huawei.com Signed-off-by: Lu Jialin Acked-by: Shakeel Butt Cc: Michal Hocko Cc: Roman Gushchin Cc: Johannes Weiner Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/memcontrol.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index cf57fe9318d55e..0286bd0ab043d2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7535,9 +7535,6 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) struct mem_cgroup *memcg; unsigned short id; - if (mem_cgroup_disabled()) - return; - id = swap_cgroup_record(entry, 0, nr_pages); rcu_read_lock(); memcg = mem_cgroup_from_id(id); From bb7dbaafff3f582d18028a5b99a8faa789842678 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 19 Aug 2023 04:18:37 +0100 Subject: [PATCH 477/489] mm: remove checks for pte_index Since pte_index is always defined, we don't need to check whether it's defined or not. Delete the slow version that doesn't depend on it and remove the #define since nobody needs to test for it. 
Link: https://lkml.kernel.org/r/20230819031837.3160096-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Mike Rapoport (IBM) Cc: Christian Dietrich Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 1 - mm/memory.c | 17 +---------------- 2 files changed, 1 insertion(+), 17 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index fc811c9b421ab8..95ad544ad39527 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -63,7 +63,6 @@ static inline unsigned long pte_index(unsigned long address) { return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); } -#define pte_index pte_index #ifndef pmd_index static inline unsigned long pmd_index(unsigned long address) diff --git a/mm/memory.c b/mm/memory.c index 421fcef3a3e7fa..50f44c1bfa195b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1869,7 +1869,6 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, return retval; } -#ifdef pte_index static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot) { @@ -1884,7 +1883,7 @@ static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte, } /* insert_pages() amortizes the cost of spinlock operations - * when inserting pages in a loop. Arch *must* define pte_index. + * when inserting pages in a loop. */ static int insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num, pgprot_t prot) @@ -1943,7 +1942,6 @@ static int insert_pages(struct vm_area_struct *vma, unsigned long addr, *num = remaining_pages_total; return ret; } -#endif /* ifdef pte_index */ /** * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock. 
@@ -1963,7 +1961,6 @@ static int insert_pages(struct vm_area_struct *vma, unsigned long addr, int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num) { -#ifdef pte_index const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1; if (addr < vma->vm_start || end_addr >= vma->vm_end) @@ -1975,18 +1972,6 @@ int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, } /* Defer page refcount checking till we're about to map that page. */ return insert_pages(vma, addr, pages, num, vma->vm_page_prot); -#else - unsigned long idx = 0, pgcount = *num; - int err = -EINVAL; - - for (; idx < pgcount; ++idx) { - err = vm_insert_page(vma, addr + (PAGE_SIZE * idx), pages[idx]); - if (err) - break; - } - *num = pgcount - idx; - return err; -#endif /* ifdef pte_index */ } EXPORT_SYMBOL(vm_insert_pages); From 051ddcfeb1bdbae45e660c0db2468d29ca15c6c2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 18 Aug 2023 21:23:33 +0100 Subject: [PATCH 478/489] mm: move PMD_ORDER to pgtable.h Patch series "Change calling convention for ->huge_fault", v2. There are two unrelated changes to the calling convention for ->huge_fault. I've bundled them together to help people notice the change. The first is to improve scalability of DAX page faults by allowing them to be handled under the VMA lock. The second is to remove enum page_entry_size since it's really unnecessary. The changelogs and documentation updates hopefully work to that end. This patch (of 3): Allow this to be used in generic code. Also add PUD_ORDER. 
Link: https://lkml.kernel.org/r/20230818202335.2739663-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230818202335.2739663-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- fs/dax.c | 3 --- include/linux/pgtable.h | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 906ecbd541a3de..88bb13643117ec 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -49,9 +49,6 @@ static inline unsigned int pe_order(enum page_entry_size pe_size) #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) #define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT) -/* The order of a PMD entry */ -#define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT) - static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; static int __init init_dax_wait_table(void) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 95ad544ad39527..f49abcfe5eda1a 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -5,6 +5,9 @@ #include #include +#define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT) +#define PUD_ORDER (PUD_SHIFT - PAGE_SHIFT) + #ifndef __ASSEMBLY__ #ifdef CONFIG_MMU From 40d49a3c9e4a0e5cf7a6fcebc8d4d7d63d1f3f1b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 18 Aug 2023 21:23:34 +0100 Subject: [PATCH 479/489] mm: allow ->huge_fault() to be called without the mmap_lock held Remove the checks for the VMA lock being held, allowing the page fault path to call into the filesystem instead of retrying with the mmap_lock held. This will improve scalability for DAX page faults. Also update the documentation to match (and fix some other changes that have happened recently). 
Link: https://lkml.kernel.org/r/20230818202335.2739663-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- Documentation/filesystems/locking.rst | 36 +++++++++++++++++---------- Documentation/filesystems/porting.rst | 11 ++++++++ mm/memory.c | 22 ++-------------- 3 files changed, 36 insertions(+), 33 deletions(-) diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 211a0305399223..1a2cb60b24992c 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -628,26 +628,29 @@ vm_operations_struct prototypes:: - void (*open)(struct vm_area_struct*); - void (*close)(struct vm_area_struct*); - vm_fault_t (*fault)(struct vm_area_struct*, struct vm_fault *); + void (*open)(struct vm_area_struct *); + void (*close)(struct vm_area_struct *); + vm_fault_t (*fault)(struct vm_fault *); + vm_fault_t (*huge_fault)(struct vm_fault *, unsigned int order); + vm_fault_t (*map_pages)(struct vm_fault *, pgoff_t start, pgoff_t end); vm_fault_t (*page_mkwrite)(struct vm_area_struct *, struct vm_fault *); vm_fault_t (*pfn_mkwrite)(struct vm_area_struct *, struct vm_fault *); int (*access)(struct vm_area_struct *, unsigned long, void*, int, int); locking rules: -============= ========= =========================== +============= ========== =========================== ops mmap_lock PageLocked(page) -============= ========= =========================== -open: yes -close: yes -fault: yes can return with page locked -map_pages: read -page_mkwrite: yes can return with page locked -pfn_mkwrite: yes -access: yes -============= ========= =========================== +============= ========== =========================== +open: write +close: read/write +fault: read can return with page locked +huge_fault: maybe-read +map_pages: maybe-read +page_mkwrite: read can return with page locked +pfn_mkwrite: read +access: read +============= ========== =========================== ->fault() 
is called when a previously not present pte is about to be faulted in. The filesystem must find and return the page associated with the passed in @@ -657,6 +660,13 @@ then ensure the page is not already truncated (invalidate_lock will block subsequent truncate), and then return with VM_FAULT_LOCKED, and the page locked. The VM will unlock the page. +->huge_fault() is called when there is no PUD or PMD entry present. This +gives the filesystem the opportunity to install a PUD or PMD sized page. +Filesystems can also use the ->fault method to return a PMD sized page, +so implementing this function may not be necessary. In particular, +filesystems should not call filemap_fault() from ->huge_fault(). +The mmap_lock may not be held when this method is called. + ->map_pages() is called when VM asks to map easy accessible pages. Filesystem should find and map pages associated with offsets from "start_pgoff" till "end_pgoff". ->map_pages() is called with the RCU lock held and must diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index d2d684ae77984f..7ce352265de103 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -943,3 +943,14 @@ file pointer instead of struct dentry pointer. d_tmpfile() is similarly changed to simplify callers. The passed file is in a non-open state and on success must be opened before returning (e.g. by calling finish_open_simple()). + +--- + +**mandatory** + +Calling convention for ->huge_fault has changed. It now takes a page +order instead of an enum page_entry_size, and it may be called without the +mmap_lock held. All in-tree users have been audited and do not seem to +depend on the mmap_lock being held, but out of tree users should verify +for themselves. If they do need it, they can return VM_FAULT_RETRY to +be called with the mmap_lock held. 
diff --git a/mm/memory.c b/mm/memory.c index 50f44c1bfa195b..7a7e58729510df 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4854,13 +4854,8 @@ static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; if (vma_is_anonymous(vma)) return do_huge_pmd_anonymous_page(vmf); - if (vma->vm_ops->huge_fault) { - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vma); - return VM_FAULT_RETRY; - } + if (vma->vm_ops->huge_fault) return vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); - } return VM_FAULT_FALLBACK; } @@ -4880,10 +4875,6 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { if (vma->vm_ops->huge_fault) { - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vma); - return VM_FAULT_RETRY; - } ret = vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -4904,13 +4895,8 @@ static vm_fault_t create_huge_pud(struct vm_fault *vmf) /* No support for anonymous transparent PUD pages yet */ if (vma_is_anonymous(vma)) return VM_FAULT_FALLBACK; - if (vma->vm_ops->huge_fault) { - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vma); - return VM_FAULT_RETRY; - } + if (vma->vm_ops->huge_fault) return vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); - } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ return VM_FAULT_FALLBACK; } @@ -4927,10 +4913,6 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) goto split; if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { if (vma->vm_ops->huge_fault) { - if (vmf->flags & FAULT_FLAG_VMA_LOCK) { - vma_end_read(vma); - return VM_FAULT_RETRY; - } ret = vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); if (!(ret & VM_FAULT_FALLBACK)) return ret; From 1d024e7a8dabcc3c84d77532a88c774c32cf8245 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 18 Aug 2023 21:23:35 +0100 Subject: [PATCH 480/489] mm: remove enum page_entry_size Remove the unnecessary encoding of page order into an enum and pass 
the page order directly. That lets us get rid of pe_order(). The switch constructs have to be changed to if/else constructs to prevent GCC from warning on builds with 3-level page tables where PMD_ORDER and PUD_ORDER have the same value. If you are looking at this commit because your driver stopped compiling, look at the previous commit as well and audit your driver to be sure it doesn't depend on mmap_lock being held in its ->huge_fault method. [willy@infradead.org: use "order %u" to match the (non dev_t) style] Link: https://lkml.kernel.org/r/ZOUYekbtTv+n8hYf@casper.infradead.org Link: https://lkml.kernel.org/r/20230818202335.2739663-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- drivers/dax/device.c | 22 ++++++++-------------- fs/dax.c | 30 ++++++++---------------------- fs/erofs/data.c | 6 +++--- fs/ext2/file.c | 2 +- fs/ext4/file.c | 11 +++++------ fs/fuse/dax.c | 20 +++++++++----------- fs/xfs/xfs_file.c | 24 ++++++++++++------------ fs/xfs/xfs_trace.h | 20 ++++++-------------- include/linux/dax.h | 4 ++-- include/linux/mm.h | 10 +--------- mm/memory.c | 8 ++++---- 11 files changed, 59 insertions(+), 98 deletions(-) diff --git a/drivers/dax/device.c b/drivers/dax/device.c index 30665a3ff6ea36..93ebedc5ec8ca3 100644 --- a/drivers/dax/device.c +++ b/drivers/dax/device.c @@ -228,32 +228,26 @@ static vm_fault_t __dev_dax_pud_fault(struct dev_dax *dev_dax, } #endif /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ -static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf, - enum page_entry_size pe_size) +static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf, unsigned int order) { struct file *filp = vmf->vma->vm_file; vm_fault_t rc = VM_FAULT_SIGBUS; int id; struct dev_dax *dev_dax = filp->private_data; - dev_dbg(&dev_dax->dev, "%s: %s (%#lx - %#lx) size = %d\n", current->comm, + dev_dbg(&dev_dax->dev, "%s: %s (%#lx - %#lx) order:%d\n", current->comm, (vmf->flags & FAULT_FLAG_WRITE) ? 
"write" : "read", - vmf->vma->vm_start, vmf->vma->vm_end, pe_size); + vmf->vma->vm_start, vmf->vma->vm_end, order); id = dax_read_lock(); - switch (pe_size) { - case PE_SIZE_PTE: + if (order == 0) rc = __dev_dax_pte_fault(dev_dax, vmf); - break; - case PE_SIZE_PMD: + else if (order == PMD_ORDER) rc = __dev_dax_pmd_fault(dev_dax, vmf); - break; - case PE_SIZE_PUD: + else if (order == PUD_ORDER) rc = __dev_dax_pud_fault(dev_dax, vmf); - break; - default: + else rc = VM_FAULT_SIGBUS; - } dax_read_unlock(id); @@ -262,7 +256,7 @@ static vm_fault_t dev_dax_huge_fault(struct vm_fault *vmf, static vm_fault_t dev_dax_fault(struct vm_fault *vmf) { - return dev_dax_huge_fault(vmf, PE_SIZE_PTE); + return dev_dax_huge_fault(vmf, 0); } static int dev_dax_may_split(struct vm_area_struct *vma, unsigned long addr) diff --git a/fs/dax.c b/fs/dax.c index 88bb13643117ec..8fafecbe42b159 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -30,17 +30,6 @@ #define CREATE_TRACE_POINTS #include -static inline unsigned int pe_order(enum page_entry_size pe_size) -{ - if (pe_size == PE_SIZE_PTE) - return PAGE_SHIFT - PAGE_SHIFT; - if (pe_size == PE_SIZE_PMD) - return PMD_SHIFT - PAGE_SHIFT; - if (pe_size == PE_SIZE_PUD) - return PUD_SHIFT - PAGE_SHIFT; - return ~0; -} - /* We choose 4096 entries - same as per-zone page wait tables */ #define DAX_WAIT_TABLE_BITS 12 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) @@ -1905,7 +1894,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, /** * dax_iomap_fault - handle a page fault on a DAX file * @vmf: The description of the fault - * @pe_size: Size of the page to fault in + * @order: Order of the page to fault in * @pfnp: PFN to insert for synchronous faults if fsync is required * @iomap_errp: Storage for detailed error code in case of error * @ops: Iomap ops passed from the file system @@ -1915,17 +1904,15 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * has done all the necessary locking for page 
fault to proceed * successfully. */ -vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, +vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order, pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops) { - switch (pe_size) { - case PE_SIZE_PTE: + if (order == 0) return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops); - case PE_SIZE_PMD: + else if (order == PMD_ORDER) return dax_iomap_pmd_fault(vmf, pfnp, ops); - default: + else return VM_FAULT_FALLBACK; - } } EXPORT_SYMBOL_GPL(dax_iomap_fault); @@ -1976,19 +1963,18 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order) /** * dax_finish_sync_fault - finish synchronous page fault * @vmf: The description of the fault - * @pe_size: Size of entry to be inserted + * @order: Order of entry to be inserted * @pfn: PFN to insert * * This function ensures that the file range touched by the page fault is * stored persistently on the media and handles inserting of appropriate page * table entry. 
*/ -vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, - enum page_entry_size pe_size, pfn_t pfn) +vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order, + pfn_t pfn) { int err; loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT; - unsigned int order = pe_order(pe_size); size_t len = PAGE_SIZE << order; err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1); diff --git a/fs/erofs/data.c b/fs/erofs/data.c index db5e4b7636ec0f..0c2c99c58b5e3a 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -413,14 +413,14 @@ const struct address_space_operations erofs_raw_access_aops = { #ifdef CONFIG_FS_DAX static vm_fault_t erofs_dax_huge_fault(struct vm_fault *vmf, - enum page_entry_size pe_size) + unsigned int order) { - return dax_iomap_fault(vmf, pe_size, NULL, NULL, &erofs_iomap_ops); + return dax_iomap_fault(vmf, order, NULL, NULL, &erofs_iomap_ops); } static vm_fault_t erofs_dax_fault(struct vm_fault *vmf) { - return erofs_dax_huge_fault(vmf, PE_SIZE_PTE); + return erofs_dax_huge_fault(vmf, 0); } static const struct vm_operations_struct erofs_dax_vm_ops = { diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 0b4c91c62e1f5e..1039e5bf90afd3 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -103,7 +103,7 @@ static vm_fault_t ext2_dax_fault(struct vm_fault *vmf) } filemap_invalidate_lock_shared(inode->i_mapping); - ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, NULL, &ext2_iomap_ops); + ret = dax_iomap_fault(vmf, 0, NULL, NULL, &ext2_iomap_ops); filemap_invalidate_unlock_shared(inode->i_mapping); if (write) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index c457c8517f0fde..2dc3f8301225a9 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -723,8 +723,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) } #ifdef CONFIG_FS_DAX -static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, - enum page_entry_size pe_size) +static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, unsigned int order) { int error = 0; 
vm_fault_t result; @@ -740,7 +739,7 @@ static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, * read-only. * * We check for VM_SHARED rather than vmf->cow_page since the latter is - * unset for pe_size != PE_SIZE_PTE (i.e. only in do_cow_fault); for + * unset for order != 0 (i.e. only in do_cow_fault); for * other sizes, dax_iomap_fault will handle splitting / fallback so that * we eventually come back with a COW page. */ @@ -764,7 +763,7 @@ static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, } else { filemap_invalidate_lock_shared(mapping); } - result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops); + result = dax_iomap_fault(vmf, order, &pfn, &error, &ext4_iomap_ops); if (write) { ext4_journal_stop(handle); @@ -773,7 +772,7 @@ static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, goto retry; /* Handling synchronous page fault? */ if (result & VM_FAULT_NEEDDSYNC) - result = dax_finish_sync_fault(vmf, pe_size, pfn); + result = dax_finish_sync_fault(vmf, order, pfn); filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(sb); } else { @@ -785,7 +784,7 @@ static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, static vm_fault_t ext4_dax_fault(struct vm_fault *vmf) { - return ext4_dax_huge_fault(vmf, PE_SIZE_PTE); + return ext4_dax_huge_fault(vmf, 0); } static const struct vm_operations_struct ext4_dax_vm_ops = { diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 8e74f278a3f694..23904a6a9a96f7 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -784,8 +784,8 @@ static int fuse_dax_writepages(struct address_space *mapping, return dax_writeback_mapping_range(mapping, fc->dax->dev, wbc); } -static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf, - enum page_entry_size pe_size, bool write) +static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf, unsigned int order, + bool write) { vm_fault_t ret; struct inode *inode = file_inode(vmf->vma->vm_file); @@ -809,7 +809,7 @@ static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf, * to 
populate page cache or access memory we are trying to free. */ filemap_invalidate_lock_shared(inode->i_mapping); - ret = dax_iomap_fault(vmf, pe_size, &pfn, &error, &fuse_iomap_ops); + ret = dax_iomap_fault(vmf, order, &pfn, &error, &fuse_iomap_ops); if ((ret & VM_FAULT_ERROR) && error == -EAGAIN) { error = 0; retry = true; @@ -818,7 +818,7 @@ static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf, } if (ret & VM_FAULT_NEEDDSYNC) - ret = dax_finish_sync_fault(vmf, pe_size, pfn); + ret = dax_finish_sync_fault(vmf, order, pfn); filemap_invalidate_unlock_shared(inode->i_mapping); if (write) @@ -829,24 +829,22 @@ static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf, static vm_fault_t fuse_dax_fault(struct vm_fault *vmf) { - return __fuse_dax_fault(vmf, PE_SIZE_PTE, - vmf->flags & FAULT_FLAG_WRITE); + return __fuse_dax_fault(vmf, 0, vmf->flags & FAULT_FLAG_WRITE); } -static vm_fault_t fuse_dax_huge_fault(struct vm_fault *vmf, - enum page_entry_size pe_size) +static vm_fault_t fuse_dax_huge_fault(struct vm_fault *vmf, unsigned int order) { - return __fuse_dax_fault(vmf, pe_size, vmf->flags & FAULT_FLAG_WRITE); + return __fuse_dax_fault(vmf, order, vmf->flags & FAULT_FLAG_WRITE); } static vm_fault_t fuse_dax_page_mkwrite(struct vm_fault *vmf) { - return __fuse_dax_fault(vmf, PE_SIZE_PTE, true); + return __fuse_dax_fault(vmf, 0, true); } static vm_fault_t fuse_dax_pfn_mkwrite(struct vm_fault *vmf) { - return __fuse_dax_fault(vmf, PE_SIZE_PTE, true); + return __fuse_dax_fault(vmf, 0, true); } static const struct vm_operations_struct fuse_dax_vm_ops = { diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 4f502219ae4f13..203700278ddbb6 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1287,11 +1287,11 @@ xfs_file_llseek( static inline vm_fault_t xfs_dax_fault( struct vm_fault *vmf, - enum page_entry_size pe_size, + unsigned int order, bool write_fault, pfn_t *pfn) { - return dax_iomap_fault(vmf, pe_size, pfn, NULL, + return dax_iomap_fault(vmf, order, pfn, 
NULL, (write_fault && !vmf->cow_page) ? &xfs_dax_write_iomap_ops : &xfs_read_iomap_ops); @@ -1300,7 +1300,7 @@ xfs_dax_fault( static inline vm_fault_t xfs_dax_fault( struct vm_fault *vmf, - enum page_entry_size pe_size, + unsigned int order, bool write_fault, pfn_t *pfn) { @@ -1322,14 +1322,14 @@ xfs_dax_fault( static vm_fault_t __xfs_filemap_fault( struct vm_fault *vmf, - enum page_entry_size pe_size, + unsigned int order, bool write_fault) { struct inode *inode = file_inode(vmf->vma->vm_file); struct xfs_inode *ip = XFS_I(inode); vm_fault_t ret; - trace_xfs_filemap_fault(ip, pe_size, write_fault); + trace_xfs_filemap_fault(ip, order, write_fault); if (write_fault) { sb_start_pagefault(inode->i_sb); @@ -1340,9 +1340,9 @@ __xfs_filemap_fault( pfn_t pfn; xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = xfs_dax_fault(vmf, pe_size, write_fault, &pfn); + ret = xfs_dax_fault(vmf, order, write_fault, &pfn); if (ret & VM_FAULT_NEEDDSYNC) - ret = dax_finish_sync_fault(vmf, pe_size, pfn); + ret = dax_finish_sync_fault(vmf, order, pfn); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); } else { if (write_fault) { @@ -1373,7 +1373,7 @@ xfs_filemap_fault( struct vm_fault *vmf) { /* DAX can shortcut the normal fault path on write faults! */ - return __xfs_filemap_fault(vmf, PE_SIZE_PTE, + return __xfs_filemap_fault(vmf, 0, IS_DAX(file_inode(vmf->vma->vm_file)) && xfs_is_write_fault(vmf)); } @@ -1381,13 +1381,13 @@ xfs_filemap_fault( static vm_fault_t xfs_filemap_huge_fault( struct vm_fault *vmf, - enum page_entry_size pe_size) + unsigned int order) { if (!IS_DAX(file_inode(vmf->vma->vm_file))) return VM_FAULT_FALLBACK; /* DAX can shortcut the normal fault path on write faults! 
*/ - return __xfs_filemap_fault(vmf, pe_size, + return __xfs_filemap_fault(vmf, order, xfs_is_write_fault(vmf)); } @@ -1395,7 +1395,7 @@ static vm_fault_t xfs_filemap_page_mkwrite( struct vm_fault *vmf) { - return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true); + return __xfs_filemap_fault(vmf, 0, true); } /* @@ -1408,7 +1408,7 @@ xfs_filemap_pfn_mkwrite( struct vm_fault *vmf) { - return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true); + return __xfs_filemap_fault(vmf, 0, true); } static const struct vm_operations_struct xfs_file_vm_ops = { diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index f3cc204bb4bf62..fd789e00dfd604 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -802,36 +802,28 @@ DEFINE_INODE_EVENT(xfs_inode_inactivating); * ring buffer. Somehow this was only worth mentioning in the ftrace sample * code. */ -TRACE_DEFINE_ENUM(PE_SIZE_PTE); -TRACE_DEFINE_ENUM(PE_SIZE_PMD); -TRACE_DEFINE_ENUM(PE_SIZE_PUD); - TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED); TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW); TRACE_EVENT(xfs_filemap_fault, - TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size, - bool write_fault), - TP_ARGS(ip, pe_size, write_fault), + TP_PROTO(struct xfs_inode *ip, unsigned int order, bool write_fault), + TP_ARGS(ip, order, write_fault), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) - __field(enum page_entry_size, pe_size) + __field(unsigned int, order) __field(bool, write_fault) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; - __entry->pe_size = pe_size; + __entry->order = order; __entry->write_fault = write_fault; ), - TP_printk("dev %d:%d ino 0x%llx %s write_fault %d", + TP_printk("dev %d:%d ino 0x%llx order %u write_fault %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, - __print_symbolic(__entry->pe_size, - { PE_SIZE_PTE, "PTE" }, - { PE_SIZE_PMD, "PMD" }, - { PE_SIZE_PUD, "PUD" }), + __entry->order, __entry->write_fault) ) diff --git a/include/linux/dax.h 
b/include/linux/dax.h index 261944ec0887ca..22cd9902345d74 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -241,10 +241,10 @@ void dax_flush(struct dax_device *dax_dev, void *addr, size_t size); ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops); -vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, +vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order, pfn_t *pfnp, int *errp, const struct iomap_ops *ops); vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, - enum page_entry_size pe_size, pfn_t pfn); + unsigned int order, pfn_t pfn); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); diff --git a/include/linux/mm.h b/include/linux/mm.h index ddb95967ba64c1..53efddc4d178cc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -532,13 +532,6 @@ struct vm_fault { */ }; -/* page entry size for vm->huge_fault() */ -enum page_entry_size { - PE_SIZE_PTE = 0, - PE_SIZE_PMD, - PE_SIZE_PUD, -}; - /* * These are the virtual MM functions - opening of an area, closing and * unmapping it (needed to keep files on disk up-to-date etc), pointer @@ -562,8 +555,7 @@ struct vm_operations_struct { int (*mprotect)(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long newflags); vm_fault_t (*fault)(struct vm_fault *vmf); - vm_fault_t (*huge_fault)(struct vm_fault *vmf, - enum page_entry_size pe_size); + vm_fault_t (*huge_fault)(struct vm_fault *vmf, unsigned int order); vm_fault_t (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); unsigned long (*pagesize)(struct vm_area_struct * area); diff --git a/mm/memory.c b/mm/memory.c index 7a7e58729510df..00a5ce11309023 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4855,7 +4855,7 @@ static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf) if (vma_is_anonymous(vma)) return 
do_huge_pmd_anonymous_page(vmf); if (vma->vm_ops->huge_fault) - return vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); + return vma->vm_ops->huge_fault(vmf, PMD_ORDER); return VM_FAULT_FALLBACK; } @@ -4875,7 +4875,7 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { if (vma->vm_ops->huge_fault) { - ret = vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD); + ret = vma->vm_ops->huge_fault(vmf, PMD_ORDER); if (!(ret & VM_FAULT_FALLBACK)) return ret; } @@ -4896,7 +4896,7 @@ static vm_fault_t create_huge_pud(struct vm_fault *vmf) if (vma_is_anonymous(vma)) return VM_FAULT_FALLBACK; if (vma->vm_ops->huge_fault) - return vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); + return vma->vm_ops->huge_fault(vmf, PUD_ORDER); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ return VM_FAULT_FALLBACK; } @@ -4913,7 +4913,7 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) goto split; if (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) { if (vma->vm_ops->huge_fault) { - ret = vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD); + ret = vma->vm_ops->huge_fault(vmf, PUD_ORDER); if (!(ret & VM_FAULT_FALLBACK)) return ret; } From 19134bc23500a01bfdb77a804fc8e4bf8808d0cc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 18 Aug 2023 21:06:27 +0100 Subject: [PATCH 481/489] mm: fix kernel-doc warning from tlb_flush_rmaps() Patch series "Improve mm documentation". If you build with W=1, kernel-doc complains about tlb_flush_rmaps(). Then I ran scripts/find-unused-docs.sh against mm/ and found a large number of files which weren't included in the ReST documentation. I fixed up a couple of them, and added all those without errors to the rst files. There's a lot more work to do to organise all of this, but at least now if we have documentation that refers to these functions, we'll get a nice link to them. This patch (of 4): The vma parameter wasn't described. 
Link: https://lkml.kernel.org/r/20230818200630.2719595-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230818200630.2719595-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Randy Dunlap Acked-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- mm/mmu_gather.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index ea9683e1293697..4f559f4ddd2171 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -63,6 +63,7 @@ static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_ /** * tlb_flush_rmaps - do pending rmap removals after we have flushed the TLB * @tlb: the current mmu_gather + * @vma: The memory area from which the pages are being removed. * * Note that because of how tlb_next_batch() above works, we will * never start multiple new batches with pending delayed rmaps, so From 853f62a30422f1a9a9c1f4b44df9b0de0d46a9e9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 18 Aug 2023 21:06:28 +0100 Subject: [PATCH 482/489] mm: fix get_mctgt_type() kernel-doc Convert the return values to an ReST list and tidy up the wording while I'm touching it. 
[akpm@linux-foundation.org: changes suggested by Randy] [willy@infradead.org: another change suggested by Randy] Link: https://lkml.kernel.org/r/ZOUZtZizeQG7PcsM@casper.infradead.org Link: https://lkml.kernel.org/r/20230818200630.2719595-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Randy Dunlap Acked-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- mm/memcontrol.c | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0286bd0ab043d2..b29b850cf3994e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5850,25 +5850,20 @@ static int mem_cgroup_move_account(struct page *page, * @ptent: the pte to be checked * @target: the pointer the target page or swap ent will be stored(can be NULL) * - * Returns - * 0(MC_TARGET_NONE): if the pte is not a target for move charge. - * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for - * move charge. if @target is not NULL, the page is stored in target->page - * with extra refcnt got(Callers should handle it). - * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a - * target for charge migration. if @target is not NULL, the entry is stored - * in target->ent. - * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is device memory and - * thus not on the lru. - * For now we such page is charge like a regular page would be as for all - * intent and purposes it is just special memory taking the place of a - * regular page. - * - * See Documentations/vm/hmm.txt and include/linux/hmm.h - * - * Called with pte lock held. + * Context: Called with pte lock held. + * Return: + * * MC_TARGET_NONE - If the pte is not a target for move charge. + * * MC_TARGET_PAGE - If the page corresponding to this pte is a target for + * move charge. If @target is not NULL, the page is stored in target->page + * with extra refcnt taken (Caller should release it). 
+ * * MC_TARGET_SWAP - If the swap entry corresponding to this pte is a + * target for charge migration. If @target is not NULL, the entry is + * stored in target->ent. + * * MC_TARGET_DEVICE - Like MC_TARGET_PAGE but page is device memory and + * thus not on the lru. For now such page is charged like a regular page + * would be as it is just special memory taking the place of a regular page. + * See Documentations/vm/hmm.txt and include/linux/hmm.h */ - static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, unsigned long addr, pte_t ptent, union mc_target *target) { From 01a7eb3e20994701700631ec30462087c4ecf142 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 18 Aug 2023 21:06:29 +0100 Subject: [PATCH 483/489] mm: fix clean_record_shared_mapping_range kernel-doc Turn the a), b) into an unordered ReST list and remove the unnecessary 'Note:' prefix. Link: https://lkml.kernel.org/r/20230818200630.2719595-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Randy Dunlap Acked-by: Mike Rapoport (IBM) Signed-off-by: Andrew Morton --- mm/mapping_dirty_helpers.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c index a26dd8bcfcdba3..2f8829b3541a98 100644 --- a/mm/mapping_dirty_helpers.c +++ b/mm/mapping_dirty_helpers.c @@ -288,13 +288,14 @@ EXPORT_SYMBOL_GPL(wp_shared_mapping_range); * @end: Pointer to the number of the last set bit in @bitmap. * none set. The value is modified as new bits are set by the function. * - * Note: When this function returns there is no guarantee that a CPU has + * When this function returns there is no guarantee that a CPU has * not already dirtied new ptes. However it will not clean any ptes not * reported in the bitmap. The guarantees are as follows: - * a) All ptes dirty when the function starts executing will end up recorded - * in the bitmap. 
- * b) All ptes dirtied after that will either remain dirty, be recorded in the - * bitmap or both. + * + * * All ptes dirty when the function starts executing will end up recorded + * in the bitmap. + * * All ptes dirtied after that will either remain dirty, be recorded in the + * bitmap or both. * * If a caller needs to make sure all dirty ptes are picked up and none * additional are added, it first needs to write-protect the address-space From 61ff748b5b7b0c32daddbfb92c3bc15d938754dc Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 18 Aug 2023 21:06:30 +0100 Subject: [PATCH 484/489] mm: add orphaned kernel-doc to the rst files. There are many files in mm/ that contain kernel-doc which is not currently published on kernel.org. Some of it is easily categorisable, but most of it is going into the miscellaneous documentation section to be organised later. Some files aren't ready to be included; they contain documentation with build errors. Or they're nommu.c which duplicates documentation from "real" MMU systems. Those files are noted with a # mark (although really anything which isn't a recognised directive would do to prevent inclusion) Link: https://lkml.kernel.org/r/20230818200630.2719595-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Mike Rapoport (IBM) Cc: Randy Dunlap Signed-off-by: Andrew Morton --- Documentation/core-api/mm-api.rst | 25 +++++++++++++++++++++++++ Documentation/mm/highmem.rst | 1 + Documentation/mm/zsmalloc.rst | 5 +++++ 3 files changed, 31 insertions(+) diff --git a/Documentation/core-api/mm-api.rst b/Documentation/core-api/mm-api.rst index f5dde5bceaeaf0..2d091c873d1edc 100644 --- a/Documentation/core-api/mm-api.rst +++ b/Documentation/core-api/mm-api.rst @@ -115,3 +115,28 @@ More Memory Management Functions .. kernel-doc:: include/linux/mmzone.h .. kernel-doc:: mm/util.c :functions: folio_mapping + +.. kernel-doc:: mm/rmap.c +.. kernel-doc:: mm/migrate.c +.. kernel-doc:: mm/mmap.c +.. 
kernel-doc:: mm/kmemleak.c +.. #kernel-doc:: mm/hmm.c (build warnings) +.. kernel-doc:: mm/memremap.c +.. kernel-doc:: mm/hugetlb.c +.. kernel-doc:: mm/swap.c +.. kernel-doc:: mm/zpool.c +.. kernel-doc:: mm/memcontrol.c +.. #kernel-doc:: mm/memory-tiers.c (build warnings) +.. kernel-doc:: mm/shmem.c +.. kernel-doc:: mm/migrate_device.c +.. #kernel-doc:: mm/nommu.c (duplicates kernel-doc from other files) +.. kernel-doc:: mm/mapping_dirty_helpers.c +.. #kernel-doc:: mm/memory-failure.c (build warnings) +.. kernel-doc:: mm/percpu.c +.. kernel-doc:: mm/maccess.c +.. kernel-doc:: mm/vmscan.c +.. kernel-doc:: mm/memory_hotplug.c +.. kernel-doc:: mm/mmu_notifier.c +.. kernel-doc:: mm/balloon_compaction.c +.. kernel-doc:: mm/huge_memory.c +.. kernel-doc:: mm/io-mapping.c diff --git a/Documentation/mm/highmem.rst b/Documentation/mm/highmem.rst index c964e084870282..aefb03eb386ec1 100644 --- a/Documentation/mm/highmem.rst +++ b/Documentation/mm/highmem.rst @@ -206,4 +206,5 @@ Functions ========= .. kernel-doc:: include/linux/highmem.h +.. kernel-doc:: mm/highmem.c .. kernel-doc:: include/linux/highmem-internal.h diff --git a/Documentation/mm/zsmalloc.rst b/Documentation/mm/zsmalloc.rst index a3c26d587752fe..76902835e68e94 100644 --- a/Documentation/mm/zsmalloc.rst +++ b/Documentation/mm/zsmalloc.rst @@ -263,3 +263,8 @@ is heavy internal fragmentation and zspool compaction is unable to relocate objects and release zspages. In these cases, it is recommended to decrease the limit on the size of the zspage chains (as specified by the CONFIG_ZSMALLOC_CHAIN_SIZE option). + +Functions +========= + +.. kernel-doc:: mm/zsmalloc.c From 8cfd014efd93e9450fcd4892bbfe8b10f41e53c3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 22 Aug 2023 18:24:59 +0100 Subject: [PATCH 485/489] hugetlb: add documentation for vma_kernel_pagesize() This is an exported symbol, so it should have kernel-doc. 
Update it to mention folios, and point out that they might be larger than the supported page size for this VMA. Link: https://lkml.kernel.org/r/20230822172459.4190699-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/hugetlb.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cbc25826c9b04c..ba6d39b71cb143 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -968,9 +968,14 @@ pgoff_t linear_hugepage_index(struct vm_area_struct *vma, } EXPORT_SYMBOL_GPL(linear_hugepage_index); -/* - * Return the size of the pages allocated when backing a VMA. In the majority - * cases this will be same size as used by the page table entries. +/** + * vma_kernel_pagesize - Page size granularity for this VMA. + * @vma: The user mapping. + * + * Folios in this VMA will be aligned to, and at least the size of the + * number of bytes returned by this function. + * + * Return: The default size of the folios allocated when backing a VMA. */ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) { From 7db15418d390cf878d7c77ae08a4ad39f1534bc5 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Tue, 22 Aug 2023 16:27:49 +0200 Subject: [PATCH 486/489] nios2: fix flush_dcache_page() for usage from irq context Since at least kernel 6.1, flush_dcache_page() is called with IRQs disabled, e.g. from aio_complete(). But the current implementation for flush_dcache_page() on NIOS2 unintentionally re-enables IRQs, which may lead to deadlocks. Fix it by using xa_lock_irqsave() and xa_unlock_irqrestore() for the flush_dcache_mmap_*lock() macros instead. 
Link: https://lkml.kernel.org/r/ZOTF5WWURQNH9+iw@p100 Signed-off-by: Helge Deller Cc: Dinh Nguyen Signed-off-by: Andrew Morton --- arch/nios2/include/asm/cacheflush.h | 4 ++++ arch/nios2/mm/cacheflush.c | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/nios2/include/asm/cacheflush.h b/arch/nios2/include/asm/cacheflush.h index 7c48c5213fb7a3..348cea0977927a 100644 --- a/arch/nios2/include/asm/cacheflush.h +++ b/arch/nios2/include/asm/cacheflush.h @@ -52,5 +52,9 @@ extern void invalidate_dcache_range(unsigned long start, unsigned long end); #define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages) #define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) +#define flush_dcache_mmap_lock_irqsave(mapping, flags) \ + xa_lock_irqsave(&mapping->i_pages, flags) +#define flush_dcache_mmap_unlock_irqrestore(mapping, flags) \ + xa_unlock_irqrestore(&mapping->i_pages, flags) #endif /* _ASM_NIOS2_CACHEFLUSH_H */ diff --git a/arch/nios2/mm/cacheflush.c b/arch/nios2/mm/cacheflush.c index 28b805f465a8b7..0ee9c5f02e08eb 100644 --- a/arch/nios2/mm/cacheflush.c +++ b/arch/nios2/mm/cacheflush.c @@ -75,12 +75,13 @@ static void flush_aliases(struct address_space *mapping, struct folio *folio) { struct mm_struct *mm = current->active_mm; struct vm_area_struct *vma; + unsigned long flags; pgoff_t pgoff; unsigned long nr = folio_nr_pages(folio); pgoff = folio->index; - flush_dcache_mmap_lock(mapping); + flush_dcache_mmap_lock_irqsave(mapping, flags); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff + nr - 1) { unsigned long start; @@ -92,7 +93,7 @@ static void flush_aliases(struct address_space *mapping, struct folio *folio) start = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); flush_cache_range(vma, start, start + nr * PAGE_SIZE); } - flush_dcache_mmap_unlock(mapping); + flush_dcache_mmap_unlock_irqrestore(mapping, flags); } void flush_cache_all(void) From 8f9ff2deb8b91ad1cba666c7adbe4ca79e2d3225 Mon Sep 17 
00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 22 Aug 2023 21:23:35 +0100 Subject: [PATCH 487/489] secretmem: convert page_is_secretmem() to folio_is_secretmem() The only caller already has a folio, so use it to save calling compound_head() in PageLRU() and remove a use of page->mapping. Link: https://lkml.kernel.org/r/20230822202335.179081-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Mike Rapoport (IBM) Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton --- include/linux/secretmem.h | 15 +++++++-------- mm/gup.c | 2 +- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/include/linux/secretmem.h b/include/linux/secretmem.h index 988528b5da438f..35f3a4a8ceb1e3 100644 --- a/include/linux/secretmem.h +++ b/include/linux/secretmem.h @@ -6,24 +6,23 @@ extern const struct address_space_operations secretmem_aops; -static inline bool page_is_secretmem(struct page *page) +static inline bool folio_is_secretmem(struct folio *folio) { struct address_space *mapping; /* - * Using page_mapping() is quite slow because of the actual call - * instruction and repeated compound_head(page) inside the - * page_mapping() function. + * Using folio_mapping() is quite slow because of the actual call + * instruction. * We know that secretmem pages are not compound and LRU so we can * save a couple of cycles here. 
*/ - if (PageCompound(page) || !PageLRU(page)) + if (folio_test_large(folio) || !folio_test_lru(folio)) return false; mapping = (struct address_space *) - ((unsigned long)page->mapping & ~PAGE_MAPPING_FLAGS); + ((unsigned long)folio->mapping & ~PAGE_MAPPING_FLAGS); - if (!mapping || mapping != page->mapping) + if (!mapping || mapping != folio->mapping) return false; return mapping->a_ops == &secretmem_aops; @@ -39,7 +38,7 @@ static inline bool vma_is_secretmem(struct vm_area_struct *vma) return false; } -static inline bool page_is_secretmem(struct page *page) +static inline bool folio_is_secretmem(struct folio *folio) { return false; } diff --git a/mm/gup.c b/mm/gup.c index ee4fc15ce88eb2..948f3b454b001a 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2600,7 +2600,7 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, if (!folio) goto pte_unmap; - if (unlikely(page_is_secretmem(page))) { + if (unlikely(folio_is_secretmem(folio))) { gup_put_folio(folio, 1, flags); goto pte_unmap; } From 432af5c966667f12c7af38fb3b2cd52eef0c47b4 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 18 Aug 2023 20:43:56 -0400 Subject: [PATCH 488/489] maple_tree: clean up mas_wr_append() Avoid setting the variables until necessary, and actually use the variables where applicable. Introducing a variable for the slots array avoids spanning multiple lines. Add the missing argument to the documentation. Use the node type when setting the metadata instead of blindly assuming the type. Finally, add a trace point to the function for successful store. Link: https://lkml.kernel.org/r/20230819004356.1454718-3-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 05d5db255c398b..ee1ff0c59fd753 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4106,6 +4106,7 @@ static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas) /* * mas_wr_append: Attempt to append * @wr_mas: the maple write state + * @new_end: The end of the node after the modification * * This is currently unsafe in rcu mode since the end of the node may be cached * by readers while the node contents may be updated which could result in @@ -4114,42 +4115,46 @@ static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas) * Return: True if appended, false otherwise */ static inline bool mas_wr_append(struct ma_wr_state *wr_mas, - unsigned char new_end) + unsigned char new_end) { - unsigned char end = wr_mas->node_end; - struct ma_state *mas = wr_mas->mas; - unsigned char node_pivots = mt_pivots[wr_mas->type]; + struct ma_state *mas; + void __rcu **slots; + unsigned char end; + mas = wr_mas->mas; if (mt_in_rcu(mas->tree)) return false; if (mas->offset != wr_mas->node_end) return false; - if (new_end < node_pivots) { + end = wr_mas->node_end; + if (mas->offset != end) + return false; + + if (new_end < mt_pivots[wr_mas->type]) { wr_mas->pivots[new_end] = wr_mas->pivots[end]; - ma_set_meta(wr_mas->node, maple_leaf_64, 0, new_end); + ma_set_meta(wr_mas->node, wr_mas->type, 0, new_end); } - if (new_end == wr_mas->node_end + 1) { + slots = wr_mas->slots; + if (new_end == end + 1) { if (mas->last == wr_mas->r_max) { /* Append to end of range */ - rcu_assign_pointer(wr_mas->slots[new_end], - wr_mas->entry); + rcu_assign_pointer(slots[new_end], wr_mas->entry); wr_mas->pivots[end] = mas->index - 1; mas->offset = new_end; } else { /* Append to start of range */ - rcu_assign_pointer(wr_mas->slots[new_end], - wr_mas->content); + 
rcu_assign_pointer(slots[new_end], wr_mas->content); wr_mas->pivots[end] = mas->last; - rcu_assign_pointer(wr_mas->slots[end], wr_mas->entry); + rcu_assign_pointer(slots[end], wr_mas->entry); } } else { /* Append to the range without touching any boundaries. */ - rcu_assign_pointer(wr_mas->slots[new_end], wr_mas->content); + rcu_assign_pointer(slots[new_end], wr_mas->content); wr_mas->pivots[end + 1] = mas->last; - rcu_assign_pointer(wr_mas->slots[end + 1], wr_mas->entry); + rcu_assign_pointer(slots[end + 1], wr_mas->entry); wr_mas->pivots[end] = mas->index - 1; mas->offset = end + 1; } @@ -4157,6 +4162,7 @@ static inline bool mas_wr_append(struct ma_wr_state *wr_mas, if (!wr_mas->content || !wr_mas->entry) mas_update_gap(mas); + trace_ma_write(__func__, mas, new_end, wr_mas->entry); return true; } From 52ae298e3e5c9be5bb95e1c6d9199e5210f2a156 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 22 Aug 2023 00:51:45 +0200 Subject: [PATCH 489/489] maple_tree: shrink struct maple_tree Pack the members of struct maple_tree to avoid holes on 64-bit. The size shrinks from 24 to 16 bytes which will save eight bytes in every structure which embeds it. [willy@infradead.org: changelog alterations] Link: https://lkml.kernel.org/r/20230821225145.2169848-1-mjguzik@gmail.com Signed-off-by: Mateusz Guzik Reviewed-by: Liam R. Howlett Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index c962af1886813f..e41c70ac7744e4 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -220,8 +220,8 @@ struct maple_tree { spinlock_t ma_lock; lockdep_map_p ma_external_lock; }; - void __rcu *ma_root; unsigned int ma_flags; + void __rcu *ma_root; }; /**