From ca0246bb97c23da9d267c2107c07fb77e38205c9 Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Fri, 16 Nov 2018 15:07:56 -0800 Subject: [PATCH 01/16] z3fold: fix possible reclaim races Reclaim and free can race on an object which is basically fine but in order for reclaim to be able to map "freed" object we need to encode object length in the handle. handle_to_chunks() is then introduced to extract object length from a handle and use it during mapping. Moreover, to avoid racing on a z3fold "headless" page release, we should not try to free that page in z3fold_free() if the reclaim bit is set. Also, in the unlikely case of trying to reclaim a page being freed, we should not proceed with that page. While at it, fix the page accounting in reclaim function. This patch supersedes "[PATCH] z3fold: fix reclaim lock-ups". Link: http://lkml.kernel.org/r/20181105162225.74e8837d03583a9b707cf559@gmail.com Signed-off-by: Vitaly Wool Signed-off-by: Jongseok Kim Reported-by-by: Jongseok Kim Reviewed-by: Snild Dolkow Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/z3fold.c | 103 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 63 insertions(+), 40 deletions(-) diff --git a/mm/z3fold.c b/mm/z3fold.c index 4b366d181f35..aee9b0b8d907 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -99,6 +99,7 @@ struct z3fold_header { #define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT) #define BUDDY_MASK (0x3) +#define BUDDY_SHIFT 2 /** * struct z3fold_pool - stores metadata for each z3fold pool @@ -145,7 +146,7 @@ enum z3fold_page_flags { MIDDLE_CHUNK_MAPPED, NEEDS_COMPACTING, PAGE_STALE, - UNDER_RECLAIM + PAGE_CLAIMED, /* by either reclaim or free */ }; /***************** @@ -174,7 +175,7 @@ static struct z3fold_header *init_z3fold_page(struct page *page, clear_bit(MIDDLE_CHUNK_MAPPED, &page->private); clear_bit(NEEDS_COMPACTING, &page->private); clear_bit(PAGE_STALE, &page->private); - clear_bit(UNDER_RECLAIM, &page->private); + clear_bit(PAGE_CLAIMED, &page->private); spin_lock_init(&zhdr->page_lock); kref_init(&zhdr->refcount); @@ -223,8 +224,11 @@ static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud) unsigned long handle; handle = (unsigned long)zhdr; - if (bud != HEADLESS) - handle += (bud + zhdr->first_num) & BUDDY_MASK; + if (bud != HEADLESS) { + handle |= (bud + zhdr->first_num) & BUDDY_MASK; + if (bud == LAST) + handle |= (zhdr->last_chunks << BUDDY_SHIFT); + } return handle; } @@ -234,6 +238,12 @@ static struct z3fold_header *handle_to_z3fold_header(unsigned long handle) return (struct z3fold_header *)(handle & PAGE_MASK); } +/* only for LAST bud, returns zero otherwise */ +static unsigned short handle_to_chunks(unsigned long handle) +{ + return (handle & ~PAGE_MASK) >> BUDDY_SHIFT; +} + /* * (handle & BUDDY_MASK) < zhdr->first_num is possible in encode_handle * but that doesn't matter. because the masking will result in the @@ -720,37 +730,39 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) page = virt_to_page(zhdr); if (test_bit(PAGE_HEADLESS, &page->private)) { - /* HEADLESS page stored */ - bud = HEADLESS; - } else { - z3fold_page_lock(zhdr); - bud = handle_to_buddy(handle); - - switch (bud) { - case FIRST: - zhdr->first_chunks = 0; - break; - case MIDDLE: - zhdr->middle_chunks = 0; - zhdr->start_middle = 0; - break; - case LAST: - zhdr->last_chunks = 0; - break; - default: - pr_err("%s: unknown bud %d\n", __func__, bud); - WARN_ON(1); - z3fold_page_unlock(zhdr); - return; + /* if a headless page is under reclaim, just leave. + * NB: we use test_and_set_bit for a reason: if the bit + * has not been set before, we release this page + * immediately so we don't care about its value any more. + */ + if (!test_and_set_bit(PAGE_CLAIMED, &page->private)) { + spin_lock(&pool->lock); + list_del(&page->lru); + spin_unlock(&pool->lock); + free_z3fold_page(page); + atomic64_dec(&pool->pages_nr); } + return; } - if (bud == HEADLESS) { - spin_lock(&pool->lock); - list_del(&page->lru); - spin_unlock(&pool->lock); - free_z3fold_page(page); - atomic64_dec(&pool->pages_nr); + /* Non-headless case */ + z3fold_page_lock(zhdr); + bud = handle_to_buddy(handle); + + switch (bud) { + case FIRST: + zhdr->first_chunks = 0; + break; + case MIDDLE: + zhdr->middle_chunks = 0; + break; + case LAST: + zhdr->last_chunks = 0; + break; + default: + pr_err("%s: unknown bud %d\n", __func__, bud); + WARN_ON(1); + z3fold_page_unlock(zhdr); return; } @@ -758,7 +770,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) atomic64_dec(&pool->pages_nr); return; } - if (test_bit(UNDER_RECLAIM, &page->private)) { + if (test_bit(PAGE_CLAIMED, &page->private)) { z3fold_page_unlock(zhdr); return; } @@ -836,20 +848,30 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) } list_for_each_prev(pos, &pool->lru) { page = list_entry(pos, struct page, lru); - if (test_bit(PAGE_HEADLESS, &page->private)) - /* candidate found */ - break; + + /* this bit could have been set by free, in which case + * we pass over to the next page in the pool. + */ + if (test_and_set_bit(PAGE_CLAIMED, &page->private)) + continue; zhdr = page_address(page); - if (!z3fold_page_trylock(zhdr)) + if (test_bit(PAGE_HEADLESS, &page->private)) + break; + + if (!z3fold_page_trylock(zhdr)) { + zhdr = NULL; continue; /* can't evict at this point */ + } kref_get(&zhdr->refcount); list_del_init(&zhdr->buddy); zhdr->cpu = -1; - set_bit(UNDER_RECLAIM, &page->private); break; } + if (!zhdr) + break; + list_del_init(&page->lru); spin_unlock(&pool->lock); @@ -898,6 +920,7 @@ next: if (test_bit(PAGE_HEADLESS, &page->private)) { if (ret == 0) { free_z3fold_page(page); + atomic64_dec(&pool->pages_nr); return 0; } spin_lock(&pool->lock); @@ -905,7 +928,7 @@ next: spin_unlock(&pool->lock); } else { z3fold_page_lock(zhdr); - clear_bit(UNDER_RECLAIM, &page->private); + clear_bit(PAGE_CLAIMED, &page->private); if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) { atomic64_dec(&pool->pages_nr); @@ -964,7 +987,7 @@ static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle) set_bit(MIDDLE_CHUNK_MAPPED, &page->private); break; case LAST: - addr += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT); + addr += PAGE_SIZE - (handle_to_chunks(handle) << CHUNK_SHIFT); break; default: pr_err("unknown buddy id %d\n", buddy); From 8fcb2312d1e3300e81aa871aad00d4c038cfc184 Mon Sep 17 00:00:00 2001 From: Olof Johansson Date: Fri, 16 Nov 2018 15:08:00 -0800 Subject: [PATCH 02/16] kernel/sched/psi.c: simplify cgroup_move_task() The existing code triggered an invalid warning about 'rq' possibly being used uninitialized. Instead of doing the silly warning suppression by initializa it to NULL, refactor the code to bail out early instead. Warning was: kernel/sched/psi.c: In function `cgroup_move_task': kernel/sched/psi.c:639:13: warning: `rq' may be used uninitialized in this function [-Wmaybe-uninitialized] Link: http://lkml.kernel.org/r/20181103183339.8669-1-olof@lixom.net Fixes: 2ce7135adc9ad ("psi: cgroup support") Signed-off-by: Olof Johansson Reviewed-by: Andrew Morton Acked-by: Johannes Weiner Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/psi.c | 47 +++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 7cdecfc010af..3d7355d7c3e3 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -633,38 +633,39 @@ void psi_cgroup_free(struct cgroup *cgroup) */ void cgroup_move_task(struct task_struct *task, struct css_set *to) { - bool move_psi = !psi_disabled; unsigned int task_flags = 0; struct rq_flags rf; struct rq *rq; - if (move_psi) { - rq = task_rq_lock(task, &rf); - - if (task_on_rq_queued(task)) - task_flags = TSK_RUNNING; - else if (task->in_iowait) - task_flags = TSK_IOWAIT; - - if (task->flags & PF_MEMSTALL) - task_flags |= TSK_MEMSTALL; - - if (task_flags) - psi_task_change(task, task_flags, 0); + if (psi_disabled) { + /* + * Lame to do this here, but the scheduler cannot be locked + * from the outside, so we move cgroups from inside sched/. + */ + rcu_assign_pointer(task->cgroups, to); + return; } - /* - * Lame to do this here, but the scheduler cannot be locked - * from the outside, so we move cgroups from inside sched/. - */ + rq = task_rq_lock(task, &rf); + + if (task_on_rq_queued(task)) + task_flags = TSK_RUNNING; + else if (task->in_iowait) + task_flags = TSK_IOWAIT; + + if (task->flags & PF_MEMSTALL) + task_flags |= TSK_MEMSTALL; + + if (task_flags) + psi_task_change(task, task_flags, 0); + + /* See comment above */ rcu_assign_pointer(task->cgroups, to); - if (move_psi) { - if (task_flags) - psi_task_change(task, 0, task_flags); + if (task_flags) + psi_task_change(task, 0, task_flags); - task_rq_unlock(rq, task, &rf); - } + task_rq_unlock(rq, task, &rf); } #endif /* CONFIG_CGROUPS */ From 5e41540c8a0f0e98c337dda8b391e5dda0cde7cf Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 16 Nov 2018 15:08:04 -0800 Subject: [PATCH 03/16] hugetlbfs: fix kernel BUG at fs/hugetlbfs/inode.c:444! This bug has been experienced several times by the Oracle DB team. The BUG is in remove_inode_hugepages() as follows: /* * If page is mapped, it was faulted in after being * unmapped in caller. Unmap (again) now after taking * the fault mutex. The mutex will prevent faults * until we finish removing the page. * * This race can only happen in the hole punch case. * Getting here in a truncate operation is a bug. */ if (unlikely(page_mapped(page))) { BUG_ON(truncate_op); In this case, the elevated map count is not the result of a race. Rather it was incorrectly incremented as the result of a bug in the huge pmd sharing code. Consider the following: - Process A maps a hugetlbfs file of sufficient size and alignment (PUD_SIZE) that a pmd page could be shared. - Process B maps the same hugetlbfs file with the same size and alignment such that a pmd page is shared. - Process B then calls mprotect() to change protections for the mapping with the shared pmd. As a result, the pmd is 'unshared'. - Process B then calls mprotect() again to chage protections for the mapping back to their original value. pmd remains unshared. - Process B then forks and process C is created. During the fork process, we do dup_mm -> dup_mmap -> copy_page_range to copy page tables. Copying page tables for hugetlb mappings is done in the routine copy_hugetlb_page_range. In copy_hugetlb_page_range(), the destination pte is obtained by: dst_pte = huge_pte_alloc(dst, addr, sz); If pmd sharing is possible, the returned pointer will be to a pte in an existing page table. In the situation above, process C could share with either process A or process B. Since process A is first in the list, the returned pte is a pointer to a pte in process A's page table. However, the check for pmd sharing in copy_hugetlb_page_range is: /* If the pagetables are shared don't copy or take references */ if (dst_pte == src_pte) continue; Since process C is sharing with process A instead of process B, the above test fails. The code in copy_hugetlb_page_range which follows assumes dst_pte points to a huge_pte_none pte. It copies the pte entry from src_pte to dst_pte and increments this map count of the associated page. This is how we end up with an elevated map count. To solve, check the dst_pte entry for huge_pte_none. If !none, this implies PMD sharing so do not copy. Link: http://lkml.kernel.org/r/20181105212315.14125-1-mike.kravetz@oracle.com Fixes: c5c99429fa57 ("fix hugepages leak due to pagetable page sharing") Signed-off-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Cc: Michal Hocko Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: "Kirill A . Shutemov" Cc: Davidlohr Bueso Cc: Prakash Sangappa Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c007fb5fb8d5..7f2a28ab46d5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3233,7 +3233,7 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte) int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma) { - pte_t *src_pte, *dst_pte, entry; + pte_t *src_pte, *dst_pte, entry, dst_entry; struct page *ptepage; unsigned long addr; int cow; @@ -3261,15 +3261,30 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, break; } - /* If the pagetables are shared don't copy or take references */ - if (dst_pte == src_pte) + /* + * If the pagetables are shared don't copy or take references. + * dst_pte == src_pte is the common case of src/dest sharing. + * + * However, src could have 'unshared' and dst shares with + * another vma. If dst_pte !none, this implies sharing. + * Check here before taking page table lock, and once again + * after taking the lock below. + */ + dst_entry = huge_ptep_get(dst_pte); + if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) continue; dst_ptl = huge_pte_lock(h, dst, dst_pte); src_ptl = huge_pte_lockptr(h, src, src_pte); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); entry = huge_ptep_get(src_pte); - if (huge_pte_none(entry)) { /* skip none entry */ + dst_entry = huge_ptep_get(dst_pte); + if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) { + /* + * Skip if src entry none. Also, skip in the + * unlikely case dst entry !none as this implies + * sharing with another vma. + */ ; } else if (unlikely(is_hugetlb_entry_migration(entry) || is_hugetlb_entry_hwpoisoned(entry))) { From f341e16fb67ddc199582d187b0d39120a9dfd2bf Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Fri, 16 Nov 2018 15:08:08 -0800 Subject: [PATCH 04/16] MAINTAINERS: update OMAP MMC entry Jarkko's e-mail address hasn't worked for a long time. We still want to keep this driver working as it is critical for some of the OMAP boards. I use and test this driver frequently, so change myself as a maintainer with "Odd Fixes" status. Link: http://lkml.kernel.org/r/20181106222750.12939-1-aaro.koskinen@iki.fi Signed-off-by: Aaro Koskinen Acked-by: Tony Lindgren Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- CREDITS | 4 ++++ MAINTAINERS | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/CREDITS b/CREDITS index 5befd2d714d0..84cbec4c6211 100644 --- a/CREDITS +++ b/CREDITS @@ -2138,6 +2138,10 @@ E: paul@laufernet.com D: Soundblaster driver fixes, ISAPnP quirk S: California, USA +N: Jarkko Lavinen +E: jarkko.lavinen@nokia.com +D: OMAP MMC support + N: Jonathan Layes D: ARPD support diff --git a/MAINTAINERS b/MAINTAINERS index 6c3fbbb361f8..b755a89fa325 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10808,9 +10808,9 @@ F: drivers/media/platform/omap3isp/ F: drivers/staging/media/omap4iss/ OMAP MMC SUPPORT -M: Jarkko Lavinen +M: Aaro Koskinen L: linux-omap@vger.kernel.org -S: Maintained +S: Odd Fixes F: drivers/mmc/host/omap.c OMAP POWER MANAGEMENT SUPPORT From 873d7bcfd066663e3e50113dc4a0de19289b6354 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Fri, 16 Nov 2018 15:08:11 -0800 Subject: [PATCH 05/16] mm/swapfile.c: use kvzalloc for swap_info_struct allocation Commit a2468cc9bfdf ("swap: choose swap device according to numa node") changed 'avail_lists' field of 'struct swap_info_struct' to an array. In popular linux distros it increased size of swap_info_struct up to 40 Kbytes and now swap_info_struct allocation requires order-4 page. Switch to kvzmalloc allows to avoid unexpected allocation failures. Link: http://lkml.kernel.org/r/fc23172d-3c75-21e2-d551-8b1808cbe593@virtuozzo.com Fixes: a2468cc9bfdf ("swap: choose swap device according to numa node") Signed-off-by: Vasily Averin Acked-by: Aaron Lu Acked-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Huang Ying Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 644f746e167a..8688ae65ef58 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2813,7 +2813,7 @@ static struct swap_info_struct *alloc_swap_info(void) unsigned int type; int i; - p = kzalloc(sizeof(*p), GFP_KERNEL); + p = kvzalloc(sizeof(*p), GFP_KERNEL); if (!p) return ERR_PTR(-ENOMEM); @@ -2824,7 +2824,7 @@ static struct swap_info_struct *alloc_swap_info(void) } if (type >= MAX_SWAPFILES) { spin_unlock(&swap_lock); - kfree(p); + kvfree(p); return ERR_PTR(-EPERM); } if (type >= nr_swapfiles) { @@ -2838,7 +2838,7 @@ static struct swap_info_struct *alloc_swap_info(void) smp_wmb(); nr_swapfiles++; } else { - kfree(p); + kvfree(p); p = swap_info[type]; /* * Do not memset this entry: a racing procfs swap_next() From 9d7899999c62c1a81129b76d2a6ecbc4655e1597 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 16 Nov 2018 15:08:15 -0800 Subject: [PATCH 06/16] mm, memory_hotplug: check zone_movable in has_unmovable_pages Page state checks are racy. Under a heavy memory workload (e.g. stress -m 200 -t 2h) it is quite easy to hit a race window when the page is allocated but its state is not fully populated yet. A debugging patch to dump the struct page state shows has_unmovable_pages: pfn:0x10dfec00, found:0x1, count:0x0 page:ffffea0437fb0000 count:1 mapcount:1 mapping:ffff880e05239841 index:0x7f26e5000 compound_mapcount: 1 flags: 0x5fffffc0090034(uptodate|lru|active|head|swapbacked) Note that the state has been checked for both PageLRU and PageSwapBacked already. Closing this race completely would require some sort of retry logic. This can be tricky and error prone (think of potential endless or long taking loops). Workaround this problem for movable zones at least. Such a zone should only contain movable pages. Commit 15c30bc09085 ("mm, memory_hotplug: make has_unmovable_pages more robust") has told us that this is not strictly true though. Bootmem pages should be marked reserved though so we can move the original check after the PageReserved check. Pages from other zones are still prone to races but we even do not pretend that memory hotremove works for those so pre-mature failure doesn't hurt that much. Link: http://lkml.kernel.org/r/20181106095524.14629-1-mhocko@kernel.org Fixes: 15c30bc09085 ("mm, memory_hotplug: make has_unmovable_pages more robust") Signed-off-by: Michal Hocko Reported-by: Baoquan He Tested-by: Baoquan He Acked-by: Baoquan He Reviewed-by: Oscar Salvador Acked-by: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a919ba5cb3c8..a0e5ec0addd2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7788,6 +7788,14 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, if (PageReserved(page)) goto unmovable; + /* + * If the zone is movable and we have ruled out all reserved + * pages then it should be reasonably safe to assume the rest + * is movable. + */ + if (zone_idx(zone) == ZONE_MOVABLE) + continue; + /* * Hugepages are not in LRU lists, but they're movable. * We need not scan over tail pages bacause we don't From a76cf1a474d7dbcd9336b5f5afb0162baa142cf0 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 16 Nov 2018 15:08:18 -0800 Subject: [PATCH 07/16] mm: don't reclaim inodes with many attached pages Spock reported that commit 172b06c32b94 ("mm: slowly shrink slabs with a relatively small number of objects") leads to a regression on his setup: periodically the majority of the pagecache is evicted without an obvious reason, while before the change the amount of free memory was balancing around the watermark. The reason behind is that the mentioned above change created some minimal background pressure on the inode cache. The problem is that if an inode is considered to be reclaimed, all belonging pagecache page are stripped, no matter how many of them are there. So, if a huge multi-gigabyte file is cached in the memory, and the goal is to reclaim only few slab objects (unused inodes), we still can eventually evict all gigabytes of the pagecache at once. The workload described by Spock has few large non-mapped files in the pagecache, so it's especially noticeable. To solve the problem let's postpone the reclaim of inodes, which have more than 1 attached page. Let's wait until the pagecache pages will be evicted naturally by scanning the corresponding LRU lists, and only then reclaim the inode structure. Link: http://lkml.kernel.org/r/20181023164302.20436-1-guro@fb.com Signed-off-by: Roman Gushchin Reported-by: Spock Tested-by: Spock Reviewed-by: Andrew Morton Cc: Michal Hocko Cc: Rik van Riel Cc: Randy Dunlap Cc: [4.19.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 9e198f00b64c..35d2108d567c 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -730,8 +730,11 @@ static enum lru_status inode_lru_isolate(struct list_head *item, return LRU_REMOVED; } - /* recently referenced inodes get one more pass */ - if (inode->i_state & I_REFERENCED) { + /* + * Recently referenced inodes and inodes with many attached pages + * get one more pass. + */ + if (inode->i_state & I_REFERENCED || inode->i_data.nrpages > 1) { inode->i_state &= ~I_REFERENCED; spin_unlock(&inode->i_lock); return LRU_ROTATE; From f5f67cc0e0d3d17ee046ba83f831f267767b8554 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 16 Nov 2018 15:08:22 -0800 Subject: [PATCH 08/16] scripts/faddr2line: fix location of start_kernel in comment Fix a source file reference location to the correct path name. Link: http://lkml.kernel.org/r/1d50bd3d-178e-dcd8-779f-9711887440eb@infradead.org Signed-off-by: Randy Dunlap Acked-by: Josh Poimboeuf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/faddr2line | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/faddr2line b/scripts/faddr2line index a0149db00be7..6c6439f69a72 100755 --- a/scripts/faddr2line +++ b/scripts/faddr2line @@ -71,7 +71,7 @@ die() { # Try to figure out the source directory prefix so we can remove it from the # addr2line output. HACK ALERT: This assumes that start_kernel() is in -# kernel/init.c! This only works for vmlinux. Otherwise it falls back to +# init/main.c! This only works for vmlinux. Otherwise it falls back to # printing the absolute path. find_dir_prefix() { local objfile=$1 From 5040f8df56fb90c7919f1c9b0b6e54c843437456 Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Fri, 16 Nov 2018 15:08:25 -0800 Subject: [PATCH 09/16] ocfs2: free up write context when direct IO failed The write context should also be freed even when direct IO failed. Otherwise a memory leak is introduced and entries remain in oi->ip_unwritten_list causing the following BUG later in unlink path: ERROR: bug expression: !list_empty(&oi->ip_unwritten_list) ERROR: Clear inode of 215043, inode has unwritten extents ... Call Trace: ? __set_current_blocked+0x42/0x68 ocfs2_evict_inode+0x91/0x6a0 [ocfs2] ? bit_waitqueue+0x40/0x33 evict+0xdb/0x1af iput+0x1a2/0x1f7 do_unlinkat+0x194/0x28f SyS_unlinkat+0x1b/0x2f do_syscall_64+0x79/0x1ae entry_SYSCALL_64_after_hwframe+0x151/0x0 This patch also logs, with frequency limit, direct IO failures. Link: http://lkml.kernel.org/r/20181102170632.25921-1-wen.gang.wang@oracle.com Signed-off-by: Wengang Wang Reviewed-by: Junxiao Bi Reviewed-by: Changwei Ge Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.c | 12 ++++++++++-- fs/ocfs2/cluster/masklog.h | 9 +++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index da578ad4c08f..eb1ce30412dc 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2411,8 +2411,16 @@ static int ocfs2_dio_end_io(struct kiocb *iocb, /* this io's submitter should not have unlocked this before we could */ BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); - if (bytes > 0 && private) - ret = ocfs2_dio_end_io_write(inode, private, offset, bytes); + if (bytes <= 0) + mlog_ratelimited(ML_ERROR, "Direct IO failed, bytes = %lld", + (long long)bytes); + if (private) { + if (bytes > 0) + ret = ocfs2_dio_end_io_write(inode, private, offset, + bytes); + else + ocfs2_dio_free_write_ctx(inode, private); + } ocfs2_iocb_clear_rw_locked(iocb); diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index 308ea0eb35fd..a396096a5099 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -178,6 +178,15 @@ do { \ ##__VA_ARGS__); \ } while (0) +#define mlog_ratelimited(mask, fmt, ...) \ +do { \ + static DEFINE_RATELIMIT_STATE(_rs, \ + DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + if (__ratelimit(&_rs)) \ + mlog(mask, fmt, ##__VA_ARGS__); \ +} while (0) + #define mlog_errno(st) ({ \ int _st = (st); \ if (_st != -ERESTARTSYS && _st != -EINTR && \ From 78179556e7605ba07fd3ddba6ab8891fa457b93e Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Fri, 16 Nov 2018 15:08:29 -0800 Subject: [PATCH 10/16] mm/gup.c: fix follow_page_mask() kerneldoc comment Commit df06b37ffe5a ("mm/gup: cache dev_pagemap while pinning pages") modified the signature of follow_page_mask() but left the parameter description behind. Update the description to make the code and comments agree again. While at it, update formatting of the return value description to match Documentation/doc-guide/kernel-doc.rst guidelines. Link: http://lkml.kernel.org/r/1541603316-27832-1-git-send-email-rppt@linux.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index f76e77a2d34b..aa43620a3270 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -385,11 +385,17 @@ static struct page *follow_p4d_mask(struct vm_area_struct *vma, * @vma: vm_area_struct mapping @address * @address: virtual address to look up * @flags: flags modifying lookup behaviour - * @page_mask: on output, *page_mask is set according to the size of the page + * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a + * pointer to output page_mask * * @flags can have FOLL_ flags set, defined in * - * Returns the mapped (struct page *), %NULL if no mapping exists, or + * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches + * the device's dev_pagemap metadata to avoid repeating expensive lookups. + * + * On output, the @ctx->page_mask is set according to the size of the page. + * + * Return: the mapped (struct page *), %NULL if no mapping exists, or * an error pointer if there is a mapping to something not represented * by a page descriptor (see also vm_normal_page()). */ From 13c9aaf7fa01cc7600c61981609feadeef3354ec Mon Sep 17 00:00:00 2001 From: Janne Huttunen Date: Fri, 16 Nov 2018 15:08:32 -0800 Subject: [PATCH 11/16] mm/vmstat.c: fix NUMA statistics updates Scan through the whole array to see if an update is needed. While we're at it, use sizeof() to be safe against any possible type changes in the future. The bug here is that we wouldn't sync per-cpu counters into global ones if there was an update of numa_stats for higher cpus. Highly theoretical one though because it is much more probable that zone_stats are updated so we would refresh anyway. So I wouldn't bother to mark this for stable, yet something nice to fix. [mhocko@suse.com: changelog enhancement] Link: http://lkml.kernel.org/r/1541601517-17282-1-git-send-email-janne.huttunen@nokia.com Fixes: 1d90ca897cb0 ("mm: update NUMA counter threshold size") Signed-off-by: Janne Huttunen Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 6038ce593ce3..9c624595e904 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1827,12 +1827,13 @@ static bool need_update(int cpu) /* * The fast way of checking if there are any vmstat diffs. - * This works because the diffs are byte sized items. */ - if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS)) + if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS * + sizeof(p->vm_stat_diff[0]))) return true; #ifdef CONFIG_NUMA - if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS)) + if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS * + sizeof(p->vm_numa_stat_diff[0]))) return true; #endif } From 1c23b4108d716cc848b38532063a8aca4f86add8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 16 Nov 2018 15:08:35 -0800 Subject: [PATCH 12/16] lib/ubsan.c: don't mark __ubsan_handle_builtin_unreachable as noreturn gcc-8 complains about the prototype for this function: lib/ubsan.c:432:1: error: ignoring attribute 'noreturn' in declaration of a built-in function '__ubsan_handle_builtin_unreachable' because it conflicts with attribute 'const' [-Werror=attributes] This is actually a GCC's bug. In GCC internals __ubsan_handle_builtin_unreachable() declared with both 'noreturn' and 'const' attributes instead of only 'noreturn': https://gcc.gnu.org/bugzilla/show_bug.cgi?id=84210 Workaround this by removing the noreturn attribute. [aryabinin: add information about GCC bug in changelog] Link: http://lkml.kernel.org/r/20181107144516.4587-1-aryabinin@virtuozzo.com Signed-off-by: Arnd Bergmann Signed-off-by: Andrey Ryabinin Acked-by: Olof Johansson Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/ubsan.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/ubsan.c b/lib/ubsan.c index 59fee96c29a0..e4162f59a81c 100644 --- a/lib/ubsan.c +++ b/lib/ubsan.c @@ -427,8 +427,7 @@ void __ubsan_handle_shift_out_of_bounds(struct shift_out_of_bounds_data *data, EXPORT_SYMBOL(__ubsan_handle_shift_out_of_bounds); -void __noreturn -__ubsan_handle_builtin_unreachable(struct unreachable_data *data) +void __ubsan_handle_builtin_unreachable(struct unreachable_data *data) { unsigned long flags; From 1a413646931cb14442065cfc17561e50f5b5bb44 Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Fri, 16 Nov 2018 15:08:39 -0800 Subject: [PATCH 13/16] tmpfs: make lseek(SEEK_DATA/SEK_HOLE) return ENXIO with a negative offset Other filesystems such as ext4, f2fs and ubifs all return ENXIO when lseek (SEEK_DATA or SEEK_HOLE) requests a negative offset. man 2 lseek says : EINVAL whence is not valid. Or: the resulting file offset would be : negative, or beyond the end of a seekable device. : : ENXIO whence is SEEK_DATA or SEEK_HOLE, and the file offset is beyond : the end of the file. Make tmpfs return ENXIO under these circumstances as well. After this, tmpfs also passes xfstests's generic/448. [akpm@linux-foundation.org: rewrite changelog] Link: http://lkml.kernel.org/r/1540434176-14349-1-git-send-email-yuyufen@huawei.com Signed-off-by: Yufen Yu Reviewed-by: Andrew Morton Cc: Al Viro Cc: Hugh Dickins Cc: William Kucharski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index ea26d7a0342d..d44991ea5ed4 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2563,9 +2563,7 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) inode_lock(inode); /* We're holding i_mutex so we can access i_size directly */ - if (offset < 0) - offset = -EINVAL; - else if (offset >= inode->i_size) + if (offset < 0 || offset >= inode->i_size) offset = -ENXIO; else { start = offset >> PAGE_SHIFT; From 6f4d29df66acd49303a99025046b85cabe7aa17a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 16 Nov 2018 15:08:43 -0800 Subject: [PATCH 14/16] scripts/spdxcheck.py: make python3 compliant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without this change the following happens when using Python3 (3.6.6): $ echo "GPL-2.0" | python3 scripts/spdxcheck.py - FAIL: 'str' object has no attribute 'decode' Traceback (most recent call last): File "scripts/spdxcheck.py", line 253, in parser.parse_lines(sys.stdin, args.maxlines, '-') File "scripts/spdxcheck.py", line 171, in parse_lines line = line.decode(locale.getpreferredencoding(False), errors='ignore') AttributeError: 'str' object has no attribute 'decode' So as the line is already a string, there is no need to decode it and the line can be dropped. /usr/bin/python on Arch is Python 3. So this would indeed be worth going into 4.19. Link: http://lkml.kernel.org/r/20181023070802.22558-1-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König Cc: Thomas Gleixner Cc: Joe Perches Cc: Greg Kroah-Hartman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/spdxcheck.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/spdxcheck.py b/scripts/spdxcheck.py index 839e190bbd7a..5056fb3b897d 100755 --- a/scripts/spdxcheck.py +++ b/scripts/spdxcheck.py @@ -168,7 +168,6 @@ class id_parser(object): self.curline = 0 try: for line in fd: - line = line.decode(locale.getpreferredencoding(False), errors='ignore') self.curline += 1 if self.curline > maxlines: break From c63ae43ba53bc432b414fd73dd5f4b01fcb1ab43 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 16 Nov 2018 15:08:53 -0800 Subject: [PATCH 15/16] mm, page_alloc: check for max order in hot path Konstantin has noticed that kvmalloc might trigger the following warning: WARNING: CPU: 0 PID: 6676 at mm/vmstat.c:986 __fragmentation_index+0x54/0x60 [...] Call Trace: fragmentation_index+0x76/0x90 compaction_suitable+0x4f/0xf0 shrink_node+0x295/0x310 node_reclaim+0x205/0x250 get_page_from_freelist+0x649/0xad0 __alloc_pages_nodemask+0x12a/0x2a0 kmalloc_large_node+0x47/0x90 __kmalloc_node+0x22b/0x2e0 kvmalloc_node+0x3e/0x70 xt_alloc_table_info+0x3a/0x80 [x_tables] do_ip6t_set_ctl+0xcd/0x1c0 [ip6_tables] nf_setsockopt+0x44/0x60 SyS_setsockopt+0x6f/0xc0 do_syscall_64+0x67/0x120 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 the problem is that we only check for an out of bound order in the slow path and the node reclaim might happen from the fast path already. This is fixable by making sure that kvmalloc doesn't ever use kmalloc for requests that are larger than KMALLOC_MAX_SIZE but this also shows that the code is rather fragile. A recent UBSAN report just underlines that by the following report UBSAN: Undefined behaviour in mm/page_alloc.c:3117:19 shift exponent 51 is too large for 32-bit type 'int' CPU: 0 PID: 6520 Comm: syz-executor1 Not tainted 4.19.0-rc2 #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0xd2/0x148 lib/dump_stack.c:113 ubsan_epilogue+0x12/0x94 lib/ubsan.c:159 __ubsan_handle_shift_out_of_bounds+0x2b6/0x30b lib/ubsan.c:425 __zone_watermark_ok+0x2c7/0x400 mm/page_alloc.c:3117 zone_watermark_fast mm/page_alloc.c:3216 [inline] get_page_from_freelist+0xc49/0x44c0 mm/page_alloc.c:3300 __alloc_pages_nodemask+0x21e/0x640 mm/page_alloc.c:4370 alloc_pages_current+0xcc/0x210 mm/mempolicy.c:2093 alloc_pages include/linux/gfp.h:509 [inline] __get_free_pages+0x12/0x60 mm/page_alloc.c:4414 dma_mem_alloc+0x36/0x50 arch/x86/include/asm/floppy.h:156 raw_cmd_copyin drivers/block/floppy.c:3159 [inline] raw_cmd_ioctl drivers/block/floppy.c:3206 [inline] fd_locked_ioctl+0xa00/0x2c10 drivers/block/floppy.c:3544 fd_ioctl+0x40/0x60 drivers/block/floppy.c:3571 __blkdev_driver_ioctl block/ioctl.c:303 [inline] blkdev_ioctl+0xb3c/0x1a30 block/ioctl.c:601 block_ioctl+0x105/0x150 fs/block_dev.c:1883 vfs_ioctl fs/ioctl.c:46 [inline] do_vfs_ioctl+0x1c0/0x1150 fs/ioctl.c:687 ksys_ioctl+0x9e/0xb0 fs/ioctl.c:702 __do_sys_ioctl fs/ioctl.c:709 [inline] __se_sys_ioctl fs/ioctl.c:707 [inline] __x64_sys_ioctl+0x7e/0xc0 fs/ioctl.c:707 do_syscall_64+0xc4/0x510 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe Note that this is not a kvmalloc path. It is just that the fast path really depends on having sanitzed order as well. Therefore move the order check to the fast path. Link: http://lkml.kernel.org/r/20181113094305.GM15120@dhcp22.suse.cz Signed-off-by: Michal Hocko Reported-by: Konstantin Khlebnikov Reported-by: Kyungtae Kim Acked-by: Vlastimil Babka Cc: Balbir Singh Cc: Mel Gorman Cc: Pavel Tatashin Cc: Oscar Salvador Cc: Mike Rapoport Cc: Aaron Lu Cc: Joonsoo Kim Cc: Byoungyoung Lee Cc: "Dae R. Jeong" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a0e5ec0addd2..6847177dc4a1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4060,17 +4060,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned int cpuset_mems_cookie; int reserve_flags; - /* - * In the slowpath, we sanity check order to avoid ever trying to - * reclaim >= MAX_ORDER areas which will never succeed. Callers may - * be using allocators in order of preference for an area that is - * too large. - */ - if (order >= MAX_ORDER) { - WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); - return NULL; - } - /* * We also sanity check to catch abuse of atomic reserves being used by * callers that are not in atomic context. @@ -4364,6 +4353,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ struct alloc_context ac = { }; + /* + * There are several places where we assume that the order value is sane + * so bail out early if the request is out of bound. + */ + if (unlikely(order >= MAX_ORDER)) { + WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); + return NULL; + } + gfp_mask &= gfp_allowed_mask; alloc_mask = gfp_mask; if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags)) From 45e79815b89149dc6698e71b587c86ffaf4062aa Mon Sep 17 00:00:00 2001 From: Chen Chang Date: Fri, 16 Nov 2018 15:08:57 -0800 Subject: [PATCH 16/16] mm/memblock.c: fix a typo in __next_mem_pfn_range() comments Link: http://lkml.kernel.org/r/20181107100247.13359-1-rainccrun@gmail.com Signed-off-by: Chen Chang Acked-by: Michal Hocko Acked-by: Mike Rapoport Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memblock.c b/mm/memblock.c index 7df468c8ebc8..9a2d5ae81ae1 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1179,7 +1179,7 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP /* - * Common iterator interface used to define for_each_mem_range(). + * Common iterator interface used to define for_each_mem_pfn_range(). */ void __init_memblock __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn,