From 8ff60eb052eeba95cfb3efe16b08c9199f8121cf Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 12 Jan 2021 15:49:04 -0800 Subject: [PATCH 01/10] mm, slub: consider rest of partial list if acquire_slab() fails acquire_slab() fails if there is contention on the freelist of the page (probably because some other CPU is concurrently freeing an object from the page). In that case, it might make sense to look for a different page (since there might be more remote frees to the page from other CPUs, and we don't want contention on struct page). However, the current code accidentally stops looking at the partial list completely in that case. Especially on kernels without CONFIG_NUMA set, this means that get_partial() fails and new_slab_objects() falls back to new_slab(), allocating new pages. This could lead to an unnecessary increase in memory fragmentation. Link: https://lkml.kernel.org/r/20201228130853.1871516-1-jannh@google.com Fixes: 7ced37197196 ("slub: Acquire_slab() avoid loop") Signed-off-by: Jann Horn Acked-by: David Rientjes Acked-by: Joonsoo Kim Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index dc5b42e700b8..d9e4e10683cc 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1973,7 +1973,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, t = acquire_slab(s, n, page, object == NULL, &objects); if (!t) - break; + continue; /* cmpxchg raced */ available += objects; if (!object) { From ce8f86ee94fabcc98537ddccd7e82cfd360a4dc5 Mon Sep 17 00:00:00 2001 From: Hailong liu Date: Tue, 12 Jan 2021 15:49:08 -0800 Subject: [PATCH 02/10] mm/page_alloc: add a missing mm_page_alloc_zone_locked() tracepoint The trace point *trace_mm_page_alloc_zone_locked()* in __rmqueue() does not currently cover all branches. Add the missing tracepoint and check the page before do that. [akpm@linux-foundation.org: use IS_ENABLED() to suppress warning] Link: https://lkml.kernel.org/r/20201228132901.41523-1-carver4lio@163.com Signed-off-by: Hailong liu Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bdbec4c98173..027f6481ba59 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2862,20 +2862,20 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, { struct page *page; -#ifdef CONFIG_CMA - /* - * Balance movable allocations between regular and CMA areas by - * allocating from CMA when over half of the zone's free memory - * is in the CMA area. - */ - if (alloc_flags & ALLOC_CMA && - zone_page_state(zone, NR_FREE_CMA_PAGES) > - zone_page_state(zone, NR_FREE_PAGES) / 2) { - page = __rmqueue_cma_fallback(zone, order); - if (page) - return page; + if (IS_ENABLED(CONFIG_CMA)) { + /* + * Balance movable allocations between regular and CMA areas by + * allocating from CMA when over half of the zone's free memory + * is in the CMA area. + */ + if (alloc_flags & ALLOC_CMA && + zone_page_state(zone, NR_FREE_CMA_PAGES) > + zone_page_state(zone, NR_FREE_PAGES) / 2) { + page = __rmqueue_cma_fallback(zone, order); + if (page) + goto out; + } } -#endif retry: page = __rmqueue_smallest(zone, order, migratetype); if (unlikely(!page)) { @@ -2886,8 +2886,9 @@ retry: alloc_flags)) goto retry; } - - trace_mm_page_alloc_zone_locked(page, order, migratetype); +out: + if (page) + trace_mm_page_alloc_zone_locked(page, order, migratetype); return page; } From 7ea510b92c7c9b4eb5ff72e6b4bbad4b0407a914 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 12 Jan 2021 15:49:11 -0800 Subject: [PATCH 03/10] mm/memcontrol: fix warning in mem_cgroup_page_lruvec() Boot a CONFIG_MEMCG=y kernel with "cgroup_disabled=memory" and you are met by a series of warnings from the VM_WARN_ON_ONCE_PAGE(!memcg, page) recently added to the inline mem_cgroup_page_lruvec(). An earlier attempt to place that warning, in mem_cgroup_lruvec(), had been careful to do so after weeding out the mem_cgroup_disabled() case; but was itself invalid because of the mem_cgroup_lruvec(NULL, pgdat) in clear_pgdat_congested() and age_active_anon(). Warning in mem_cgroup_page_lruvec() was once useful in detecting a KSM charge bug, so may be worth keeping: but skip if mem_cgroup_disabled(). Link: https://lkml.kernel.org/r/alpine.LSU.2.11.2101032056260.1093@eggly.anvils Fixes: 9a1ac2288cf1 ("mm/memcontrol:rewrite mem_cgroup_page_lruvec()") Signed-off-by: Hugh Dickins Reviewed-by: Alex Shi Acked-by: Roman Gushchin Acked-by: Chris Down Reviewed-by: Baoquan He Acked-by: Vlastimil Babka Cc: Hui Su Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Johannes Weiner Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d827bd7f3bfe..eeb0b52203e9 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -665,7 +665,7 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page, { struct mem_cgroup *memcg = page_memcg(page); - VM_WARN_ON_ONCE_PAGE(!memcg, page); + VM_WARN_ON_ONCE_PAGE(!memcg && !mem_cgroup_disabled(), page); return mem_cgroup_lruvec(memcg, pgdat); } From 29970dc24faf0078beb4efab5455b4f504d2198d Mon Sep 17 00:00:00 2001 From: Hailong Liu Date: Tue, 12 Jan 2021 15:49:14 -0800 Subject: [PATCH 04/10] arm/kasan: fix the array size of kasan_early_shadow_pte[] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The size of kasan_early_shadow_pte[] now is PTRS_PER_PTE which defined to 512 for arm. This means that it only covers the prev Linux pte entries, but not the HWTABLE pte entries for arm. The reason it currently works is that the symbol kasan_early_shadow_page immediately following kasan_early_shadow_pte in memory is page aligned, which makes kasan_early_shadow_pte look like a 4KB size array. But we can't ensure the order is always right with different compiler/linker, or if more bss symbols are introduced. We had a test with QEMU + vexpress:put a 512KB-size symbol with attribute __section(".bss..page_aligned") after kasan_early_shadow_pte, and poisoned it after kasan_early_init(). Then enabled CONFIG_KASAN, it failed to boot up. Link: https://lkml.kernel.org/r/20210109044622.8312-1-hailongliiu@yeah.net Signed-off-by: Hailong Liu Signed-off-by: Ziliang Guo Reviewed-by: Linus Walleij Cc: Andrey Ryabinin Cc: Russell King Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Ard Biesheuvel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 6 +++++- mm/kasan/init.c | 3 ++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 5e0655fb2a6f..fe1ae73ff8b5 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -35,8 +35,12 @@ struct kunit_kasan_expectation { #define KASAN_SHADOW_INIT 0 #endif +#ifndef PTE_HWTABLE_PTRS +#define PTE_HWTABLE_PTRS 0 +#endif + extern unsigned char kasan_early_shadow_page[PAGE_SIZE]; -extern pte_t kasan_early_shadow_pte[PTRS_PER_PTE]; +extern pte_t kasan_early_shadow_pte[PTRS_PER_PTE + PTE_HWTABLE_PTRS]; extern pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD]; extern pud_t kasan_early_shadow_pud[PTRS_PER_PUD]; extern p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D]; diff --git a/mm/kasan/init.c b/mm/kasan/init.c index bc0ad208b3a7..7ca0b92d5886 100644 --- a/mm/kasan/init.c +++ b/mm/kasan/init.c @@ -64,7 +64,8 @@ static inline bool kasan_pmd_table(pud_t pud) return false; } #endif -pte_t kasan_early_shadow_pte[PTRS_PER_PTE] __page_aligned_bss; +pte_t kasan_early_shadow_pte[PTRS_PER_PTE + PTE_HWTABLE_PTRS] + __page_aligned_bss; static inline bool kasan_pte_table(pmd_t pmd) { From c22ee5284cf58017fa8c6d21d8f8c68159b6faab Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 12 Jan 2021 15:49:18 -0800 Subject: [PATCH 05/10] mm/vmalloc.c: fix potential memory leak In VM_MAP_PUT_PAGES case, we should put pages and free array in vfree. But we missed to set area->nr_pages in vmap(). So we would fail to put pages in __vunmap() because area->nr_pages = 0. Link: https://lkml.kernel.org/r/20210107123541.39206-1-linmiaohe@huawei.com Fixes: b944afc9d64d ("mm: add a VM_MAP_PUT_PAGES flag for vmap") Signed-off-by: Shijie Luo Signed-off-by: Miaohe Lin Reviewed-by: Uladzislau Rezki (Sony) Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4d88fe5a277a..e6f352bf0498 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2420,8 +2420,10 @@ void *vmap(struct page **pages, unsigned int count, return NULL; } - if (flags & VM_MAP_PUT_PAGES) + if (flags & VM_MAP_PUT_PAGES) { area->pages = pages; + area->nr_pages = count; + } return area->addr; } EXPORT_SYMBOL(vmap); From f555befd185dc097ede887eb7b308c2e1c1369d4 Mon Sep 17 00:00:00 2001 From: Jan Stancek Date: Tue, 12 Jan 2021 15:49:21 -0800 Subject: [PATCH 06/10] mm: migrate: initialize err in do_migrate_pages After commit 236c32eb1096 ("mm: migrate: clean up migrate_prep{_local}")', do_migrate_pages can return uninitialized variable 'err' (which is propagated to user-space as error) when 'from' and 'to' nodesets are identical. This can be reproduced with LTP migrate_pages01, which calls migrate_pages() with same set for both old/new_nodes. Add 'err' initialization back. Link: https://lkml.kernel.org/r/456a021c7ef3636d7668cec9dcb4a446a4244812.1609855564.git.jstancek@redhat.com Fixes: 236c32eb1096 ("mm: migrate: clean up migrate_prep{_local}") Signed-off-by: Jan Stancek Acked-by: Michal Hocko Acked-by: Yang Shi Cc: Zi Yan Cc: Jan Kara Cc: Matthew Wilcox Cc: Mel Gorman Cc: Song Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8cf96bd21341..2c3a86502053 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1111,7 +1111,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { int busy = 0; - int err; + int err = 0; nodemask_t tmp; migrate_prep(); From 0eb98f1588c2cc7a79816d84ab18a55d254f481c Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 12 Jan 2021 15:49:24 -0800 Subject: [PATCH 07/10] mm/hugetlb: fix potential missing huge page size info The huge page size is encoded for VM_FAULT_HWPOISON errors only. So if we return VM_FAULT_HWPOISON, huge page size would just be ignored. Link: https://lkml.kernel.org/r/20210107123449.38481-1-linmiaohe@huawei.com Fixes: aa50d3a7aa81 ("Encode huge page size for VM_FAULT_HWPOISON errors") Signed-off-by: Miaohe Lin Reviewed-by: Mike Kravetz Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a2602969873d..18f6ee317900 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4371,7 +4371,7 @@ retry: * So we need to block hugepage fault by PG_hwpoison bit check. */ if (unlikely(PageHWPoison(page))) { - ret = VM_FAULT_HWPOISON | + ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto backout_unlocked; } From 7e5f1126b54a29c078c07a5fe245e269f3c05500 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 12 Jan 2021 15:49:27 -0800 Subject: [PATCH 08/10] MAINTAINERS: add Vlastimil as slab allocators maintainer I would like to help with slab allocators maintenance, from the perspective of being responsible for SLAB and more recently also SLUB in an enterprise distro kernel and supporting its users. Recently I've been focusing on improving SLUB's debugging features, and patch review in the area, including the kmemcg accounting rewrite last year. Link: https://lkml.kernel.org/r/20210108110353.19971-1-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Christoph Lameter Acked-by: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index cc1e6a5ee6e6..28cbe0ec6610 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16319,6 +16319,7 @@ M: Pekka Enberg M: David Rientjes M: Joonsoo Kim M: Andrew Morton +M: Vlastimil Babka L: linux-mm@kvack.org S: Maintained F: include/linux/sl?b*.h From 6696d2a6f38c0beedf03c381edfc392ecf7631b4 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 12 Jan 2021 15:49:30 -0800 Subject: [PATCH 09/10] mm,hwpoison: fix printing of page flags Format %pG expects a lower case 'p' in order to print the flags. Fix it. Link: https://lkml.kernel.org/r/20210108085202.4506-1-osalvador@suse.de Fixes: 8295d535e2aa ("mm,hwpoison: refactor get_any_page") Signed-off-by: Oscar Salvador Reported-by: Dan Carpenter Reviewed-by: Anshuman Khandual Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 5a38e9eade94..04d9f154a130 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1940,7 +1940,7 @@ retry: goto retry; } } else if (ret == -EIO) { - pr_info("%s: %#lx: unknown page type: %lx (%pGP)\n", + pr_info("%s: %#lx: unknown page type: %lx (%pGp)\n", __func__, pfn, page->flags, &page->flags); } From eb351d75ce1e75b4f793d609efac08426ca50acd Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 12 Jan 2021 15:49:33 -0800 Subject: [PATCH 10/10] mm/process_vm_access.c: include compat.h Fix the build error: mm/process_vm_access.c:277:5: error: implicit declaration of function 'in_compat_syscall'; did you mean 'in_ia32_syscall'? [-Werror=implicit-function-declaration] Fixes: 38dc5079da7081e "Fix compat regression in process_vm_rw()" Reported-by: syzbot+5b0d0de84d6c65b8dd2b@syzkaller.appspotmail.com Cc: Kyle Huey Cc: Jens Axboe Cc: Al Viro Cc: Christoph Hellwig Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/process_vm_access.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 4bcc11958089..f5fee9cf90f8 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include