From dc78327c0ea7da5186d8cbc1647bd6088c5c9fa5 Mon Sep 17 00:00:00 2001 From: Michal Nazarewicz Date: Wed, 2 Jul 2014 15:22:35 -0700 Subject: [PATCH 01/14] mm: page_alloc: fix CMA area initialisation when pageblock > MAX_ORDER MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With a kernel configured with ARM64_64K_PAGES && !TRANSPARENT_HUGEPAGE, the following is triggered at early boot: SMP: Total of 8 processors activated. devtmpfs: initialized Unable to handle kernel NULL pointer dereference at virtual address 00000008 pgd = fffffe0000050000 [00000008] *pgd=00000043fba00003, *pmd=00000043fba00003, *pte=00e0000078010407 Internal error: Oops: 96000006 [#1] SMP Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.15.0-rc864k+ #44 task: fffffe03bc040000 ti: fffffe03bc080000 task.ti: fffffe03bc080000 PC is at __list_add+0x10/0xd4 LR is at free_one_page+0x270/0x638 ... Call trace: __list_add+0x10/0xd4 free_one_page+0x26c/0x638 __free_pages_ok.part.52+0x84/0xbc __free_pages+0x74/0xbc init_cma_reserved_pageblock+0xe8/0x104 cma_init_reserved_areas+0x190/0x1e4 do_one_initcall+0xc4/0x154 kernel_init_freeable+0x204/0x2a8 kernel_init+0xc/0xd4 This happens because init_cma_reserved_pageblock() calls __free_one_page() with pageblock_order as page order but it is bigger than MAX_ORDER. This in turn causes accesses past zone->free_list[]. Fix the problem by changing init_cma_reserved_pageblock() such that it splits pageblock into individual MAX_ORDER pages if pageblock is bigger than a MAX_ORDER page. In cases where !CONFIG_HUGETLB_PAGE_SIZE_VARIABLE, which is all architectures expect for ia64, powerpc and tile at the moment, the “pageblock_order > MAX_ORDER” condition will be optimised out since both sides of the operator are constants. In cases where pageblock size is variable, the performance degradation should not be significant anyway since init_cma_reserved_pageblock() is called only at boot time at most MAX_CMA_AREAS times which by default is eight. Signed-off-by: Michal Nazarewicz Reported-by: Mark Salter Tested-by: Mark Salter Tested-by: Christopher Covington Cc: Mel Gorman Cc: David Rientjes Cc: Marek Szyprowski Cc: Catalin Marinas Cc: [3.5+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 20d17f8266fe..0ea758b898fd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -816,9 +816,21 @@ void __init init_cma_reserved_pageblock(struct page *page) set_page_count(p, 0); } while (++p, --i); - set_page_refcounted(page); set_pageblock_migratetype(page, MIGRATE_CMA); - __free_pages(page, pageblock_order); + + if (pageblock_order >= MAX_ORDER) { + i = pageblock_nr_pages; + p = page; + do { + set_page_refcounted(p); + __free_pages(p, MAX_ORDER - 1); + p += MAX_ORDER_NR_PAGES; + } while (i -= MAX_ORDER_NR_PAGES); + } else { + set_page_refcounted(page); + __free_pages(page, pageblock_order); + } + adjust_managed_page_count(page, pageblock_nr_pages); } #endif From 8a5b20aebaa3d0ade5b8381e64d35fb777b7b355 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 2 Jul 2014 15:22:35 -0700 Subject: [PATCH 02/14] slub: fix off by one in number of slab tests min_partial means minimum number of slab cached in node partial list. So, if nr_partial is less than it, we keep newly empty slab on node partial list rather than freeing it. But if nr_partial is equal or greater than it, it means that we have enough partial slabs so should free newly empty slab. Current implementation missed the equal case so if we set min_partial is 0, then, at least one slab could be cached. This is critical problem to kmemcg destroying logic because it doesn't works properly if some slabs is cached. This patch fixes this problem. Fixes 91cb69620284 ("slub: make dead memcg caches discard free slabs immediately"). Signed-off-by: Joonsoo Kim Acked-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index b2b047327d76..73004808537e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1881,7 +1881,7 @@ redo: new.frozen = 0; - if (!new.inuse && n->nr_partial > s->min_partial) + if (!new.inuse && n->nr_partial >= s->min_partial) m = M_FREE; else if (new.freelist) { m = M_PARTIAL; @@ -1992,7 +1992,7 @@ static void unfreeze_partials(struct kmem_cache *s, new.freelist, new.counters, "unfreezing slab")); - if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) { + if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) { page->next = discard_page; discard_page = page; } else { @@ -2620,7 +2620,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, return; } - if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) + if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) goto slab_empty; /* From 571ff4731bc9e5811080f7d16d24c3af7cbca142 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Wed, 2 Jul 2014 15:22:35 -0700 Subject: [PATCH 03/14] autofs4: fix false positive compile error On strict build environments we can see: fs/autofs4/inode.c: In function 'autofs4_fill_super': fs/autofs4/inode.c:312: error: 'pgrp' may be used uninitialized in this function make[2]: *** [fs/autofs4/inode.o] Error 1 make[1]: *** [fs/autofs4] Error 2 make: *** [fs] Error 2 make: *** Waiting for unfinished jobs.... This is due to the use of pgrp_set being used to indicate pgrp has has been set rather than initializing pgrp itself. Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index d7bd395ab586..1c55388ae633 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -210,7 +210,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) int pipefd; struct autofs_sb_info *sbi; struct autofs_info *ino; - int pgrp; + int pgrp = 0; bool pgrp_set = false; int ret = -EINVAL; From 1bd702e665d5c2bacc735bdaf480cf7c220319c0 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 2 Jul 2014 15:22:35 -0700 Subject: [PATCH 04/14] tools: cpu-hotplug fix unexpected operator error on-off-test uses "$UID != 0" to test for root, but $UID is a construct specific to bash. Using /bin/sh that isn't bash results in the following error (due to the "$UID" part expanding to nothing): ./on-off-test.sh: 9: [: !=: unexpected operator Change Makefile to use bash instead. Signed-off-by: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/cpu-hotplug/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/cpu-hotplug/Makefile b/tools/testing/selftests/cpu-hotplug/Makefile index ae5faf9aade2..790c23a9db44 100644 --- a/tools/testing/selftests/cpu-hotplug/Makefile +++ b/tools/testing/selftests/cpu-hotplug/Makefile @@ -1,6 +1,6 @@ all: run_tests: - @/bin/sh ./on-off-test.sh || echo "cpu-hotplug selftests: [FAIL]" + @/bin/bash ./on-off-test.sh || echo "cpu-hotplug selftests: [FAIL]" clean: From e98f776224301141f79b2b1e8d01633429a5dbfe Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 2 Jul 2014 15:22:36 -0700 Subject: [PATCH 05/14] tools: memory-hotplug fix unexpected operator error on-off-test uses "$UID != 0" to test for root, but $UID is a construct specific to bash. Using /bin/sh that isn't bash results in the following error (due to the "$UID" part expanding to nothing): ./on-off-test.sh: 9: [: !=: unexpected operator Change Makefile to use bash instead. Signed-off-by: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/memory-hotplug/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/memory-hotplug/Makefile b/tools/testing/selftests/memory-hotplug/Makefile index 350bfeda3aa8..058c76f5d102 100644 --- a/tools/testing/selftests/memory-hotplug/Makefile +++ b/tools/testing/selftests/memory-hotplug/Makefile @@ -1,6 +1,6 @@ all: run_tests: - @/bin/sh ./on-off-test.sh || echo "memory-hotplug selftests: [FAIL]" + @/bin/bash ./on-off-test.sh || echo "memory-hotplug selftests: [FAIL]" clean: From 2e32baea46ce542c561a519414c840295b229c8f Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 2 Jul 2014 15:22:36 -0700 Subject: [PATCH 06/14] zram: revalidate disk after capacity change Alexander reported mkswap on /dev/zram0 is failed if other process is opening the block device file. Step is as follows, 0. Reset the unused zram device. 1. Use a program that opens /dev/zram0 with O_RDWR and sleeps until killed. 2. While that program sleeps, echo the correct value to /sys/block/zram0/disksize. 3. Verify (e.g. in /proc/partitions) that the disk size is applied correctly. It is. 4. While that program still sleeps, attempt to mkswap /dev/zram0. This fails: mkswap: error: swap area needs to be at least 40 KiB When I investigated, the size get by ioctl(fd, BLKGETSIZE64, xxx) on mkswap to get a size of blockdev was zero although zram0 has right size by 2. The reason is zram didn't revalidate disk after changing capacity so that size of blockdev's inode is not uptodate until all of file is close. This patch should fix the BUG. Signed-off-by: Minchan Kim Reported-by: Alexander E. Patrakov Tested-by: Alexander E. Patrakov Reviewed-by: Sergey Senozhatsky Cc: Nitin Gupta Acked-by: Jerome Marchand Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 48eccb350180..089e72cd37be 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -622,8 +622,10 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) memset(&zram->stats, 0, sizeof(zram->stats)); zram->disksize = 0; - if (reset_capacity) + if (reset_capacity) { set_capacity(zram->disk, 0); + revalidate_disk(zram->disk); + } up_write(&zram->init_lock); } @@ -664,6 +666,7 @@ static ssize_t disksize_store(struct device *dev, zram->comp = comp; zram->disksize = disksize; set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); + revalidate_disk(zram->disk); up_write(&zram->init_lock); return len; From 496a8e68654a5f42db90c650be305dcb50bfebdb Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Wed, 2 Jul 2014 15:22:36 -0700 Subject: [PATCH 07/14] msync: fix incorrect fstart calculation Fix a regression caused by 7fc34a62ca44 ("mm/msync.c: sync only the requested range in msync()"). xfstests generic/075 fail occured on ext4 data=journal mode because the intended range was not syncing due to wrong fstart calculation. Signed-off-by: Namjae Jeon Signed-off-by: Ashish Sangwan Reported-by: Eric Whitney Tested-by: Eric Whitney Acked-by: Matthew Wilcox Reviewed-by: Lukas Czerner Tested-by: Lukas Czerner Reviewed-by: Theodore Ts'o Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/msync.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/msync.c b/mm/msync.c index a5c673669ca6..992a1673d488 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -78,7 +78,8 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) goto out_unlock; } file = vma->vm_file; - fstart = start + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + fstart = (start - vma->vm_start) + + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); fend = fstart + (min(end, vma->vm_end) - start) - 1; start = vma->vm_end; if ((flags & MS_SYNC) && file && From b27ebf77919fdc4e7f76b1972307c30c4a3c8859 Mon Sep 17 00:00:00 2001 From: Chen Yucong Date: Wed, 2 Jul 2014 15:22:36 -0700 Subject: [PATCH 08/14] mm:vmscan: update the trace-vmscan-postprocess.pl for event vmscan/mm_vmscan_lru_isolate When using trace-vmscan-postprocess.pl for checking the file/anon rate of scanning, we can find that it can not be performed. At the same time, the following message will be reported: WARNING: Format not as expected for event vmscan/mm_vmscan_lru_isolate 'file' != 'contig_taken' Fewer fields than expected in format at ./trace-vmscan-postprocess.pl line 171, line 76. In trace-vmscan-postprocess.pl, (contig_taken, contig_dirty, and contig_failed) are be associated respectively to (nr_lumpy_taken, nr_lumpy_dirty, and nr_lumpy_failed) for lumpy reclaim. Via commit c53919adc045 ("mm: vmscan: remove lumpy reclaim"), lumpy reclaim had already been removed by Mel, but the update for trace-vmscan-postprocess.pl was missed. Signed-off-by: Chen Yucong Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../trace/postprocess/trace-vmscan-postprocess.pl | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl index 00e425faa2fd..78c9a7b2b58f 100644 --- a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl +++ b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl @@ -47,7 +47,6 @@ use constant HIGH_KSWAPD_REWAKEUP => 21; use constant HIGH_NR_SCANNED => 22; use constant HIGH_NR_TAKEN => 23; use constant HIGH_NR_RECLAIMED => 24; -use constant HIGH_NR_CONTIG_DIRTY => 25; my %perprocesspid; my %perprocess; @@ -105,7 +104,7 @@ my $regex_direct_end_default = 'nr_reclaimed=([0-9]*)'; my $regex_kswapd_wake_default = 'nid=([0-9]*) order=([0-9]*)'; my $regex_kswapd_sleep_default = 'nid=([0-9]*)'; my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*)'; -my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_taken=([0-9]*) contig_taken=([0-9]*) contig_dirty=([0-9]*) contig_failed=([0-9]*)'; +my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_taken=([0-9]*) file=([0-9]*)'; my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) zid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)'; my $regex_lru_shrink_active_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_rotated=([0-9]*) priority=([0-9]*)'; my $regex_writepage_default = 'page=([0-9a-f]*) pfn=([0-9]*) flags=([A-Z_|]*)'; @@ -200,7 +199,7 @@ $regex_lru_isolate = generate_traceevent_regex( $regex_lru_isolate_default, "isolate_mode", "order", "nr_requested", "nr_scanned", "nr_taken", - "contig_taken", "contig_dirty", "contig_failed"); + "file"); $regex_lru_shrink_inactive = generate_traceevent_regex( "vmscan/mm_vmscan_lru_shrink_inactive", $regex_lru_shrink_inactive_default, @@ -375,7 +374,6 @@ EVENT_PROCESS: } my $isolate_mode = $1; my $nr_scanned = $4; - my $nr_contig_dirty = $7; # To closer match vmstat scanning statistics, only count isolate_both # and isolate_inactive as scanning. isolate_active is rotation @@ -385,7 +383,6 @@ EVENT_PROCESS: if ($isolate_mode != 2) { $perprocesspid{$process_pid}->{HIGH_NR_SCANNED} += $nr_scanned; } - $perprocesspid{$process_pid}->{HIGH_NR_CONTIG_DIRTY} += $nr_contig_dirty; } elsif ($tracepoint eq "mm_vmscan_lru_shrink_inactive") { $details = $6; if ($details !~ /$regex_lru_shrink_inactive/o) { @@ -539,13 +536,6 @@ sub dump_stats { } } } - if ($stats{$process_pid}->{HIGH_NR_CONTIG_DIRTY}) { - print " "; - my $count = $stats{$process_pid}->{HIGH_NR_CONTIG_DIRTY}; - if ($count != 0) { - print "contig-dirty=$count "; - } - } print "\n"; } From 0bc1f8b0682caa39f45ce1e0228ebf43acb46111 Mon Sep 17 00:00:00 2001 From: Chen Yucong Date: Wed, 2 Jul 2014 15:22:37 -0700 Subject: [PATCH 09/14] hwpoison: fix the handling path of the victimized page frame that belong to non-LRU Until now, the kernel has the same policy to handle victimized page frames that belong to kernel-space(reserved/slab-subsystem) or non-LRU(unknown page state). In other word, the result of handling either of these victimized page frames is (IGNORED | FAILED), and the return value of memory_failure() is -EBUSY. This patch is to avoid that memory_failure() returns very soon due to the "true" value of (!PageLRU(p)), and it also ensures that action_result() can report more precise information("reserved kernel", "kernel slab", and "unknown page state") instead of "non LRU", especially for memory errors which are detected by memory-scrubbing. Andi said: : While running the mcelog test suite on 3.14 I hit the following VM_BUG_ON: : : soft_offline: 0x56d4: unknown non LRU page type 3ffff800008000 : page:ffffea000015b400 count:3 mapcount:2097169 mapping: (null) index:0xffff8800056d7000 : page flags: 0x3ffff800004081(locked|slab|head) : ------------[ cut here ]------------ : kernel BUG at mm/rmap.c:1495! : : I think what happened is that a LRU page turned into a slab page in : parallel with offlining. memory_failure initially tests for this case, : but doesn't retest later after the page has been locked. : : ... : : I ran this patch in a loop over night with some stress plus : the mcelog test suite running in a loop. I cannot guarantee it hit it, : but it should have given it a good beating. : : The kernel survived with no messages, although the mcelog test suite : got killed at some point because it couldn't fork anymore. Probably : some unrelated problem. : : So the patch is ok for me for .16. Signed-off-by: Chen Yucong Acked-by: Naoya Horiguchi Reported-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index cd8989c1027e..c6399e328931 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -895,7 +895,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, struct page *hpage = *hpagep; struct page *ppage; - if (PageReserved(p) || PageSlab(p)) + if (PageReserved(p) || PageSlab(p) || !PageLRU(p)) return SWAP_SUCCESS; /* @@ -1159,9 +1159,6 @@ int memory_failure(unsigned long pfn, int trapno, int flags) action_result(pfn, "free buddy, 2nd try", DELAYED); return 0; } - action_result(pfn, "non LRU", IGNORED); - put_page(p); - return -EBUSY; } } @@ -1194,6 +1191,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags) return 0; } + if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p)) + goto identify_page_state; + /* * For error on the tail page, we should set PG_hwpoison * on the head page to show that the hugepage is hwpoisoned @@ -1243,6 +1243,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) goto out; } +identify_page_state: res = -EBUSY; /* * The first check uses the current page flags which may not have any From f74373a5cc7a0155d232c4e999648c7a95435bb2 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 2 Jul 2014 15:22:37 -0700 Subject: [PATCH 10/14] /proc/stat: convert to single_open_size() These two patches are supposed to "fix" failed order-4 memory allocations which have been observed when reading /proc/stat. The problem has been observed on s390 as well as on x86. To address the problem change the seq_file memory allocations to fallback to use vmalloc, so that allocations also work if memory is fragmented. This approach seems to be simpler and less intrusive than changing /proc/stat to use an interator. Also it "fixes" other users as well, which use seq_file's single_open() interface. This patch (of 2): Use seq_file's single_open_size() to preallocate a buffer that is large enough to hold the whole output, instead of open coding it. Also calculate the requested size using the number of online cpus instead of possible cpus, since the size of the output only depends on the number of online cpus. Signed-off-by: Heiko Carstens Acked-by: David Rientjes Cc: Ian Kent Cc: Hendrik Brueckner Cc: Thorsten Diehl Cc: Andrea Righi Cc: Christoph Hellwig Cc: Al Viro Cc: Stefan Bader Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/stat.c | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 9d231e9e5f0e..bf2d03f8fd3e 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -184,29 +184,11 @@ static int show_stat(struct seq_file *p, void *v) static int stat_open(struct inode *inode, struct file *file) { - size_t size = 1024 + 128 * num_possible_cpus(); - char *buf; - struct seq_file *m; - int res; + size_t size = 1024 + 128 * num_online_cpus(); /* minimum size to display an interrupt count : 2 bytes */ size += 2 * nr_irqs; - - /* don't ask for more than the kmalloc() max size */ - if (size > KMALLOC_MAX_SIZE) - size = KMALLOC_MAX_SIZE; - buf = kmalloc(size, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - res = single_open(file, show_stat, NULL); - if (!res) { - m = file->private_data; - m->buf = buf; - m->size = ksize(buf); - } else - kfree(buf); - return res; + return single_open_size(file, show_stat, NULL, size); } static const struct file_operations proc_stat_operations = { From 058504edd02667eef8fac9be27ab3ea74332e9b4 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 2 Jul 2014 15:22:37 -0700 Subject: [PATCH 11/14] fs/seq_file: fallback to vmalloc allocation There are a couple of seq_files which use the single_open() interface. This interface requires that the whole output must fit into a single buffer. E.g. for /proc/stat allocation failures have been observed because an order-4 memory allocation failed due to memory fragmentation. In such situations reading /proc/stat is not possible anymore. Therefore change the seq_file code to fallback to vmalloc allocations which will usually result in a couple of order-0 allocations and hence also work if memory is fragmented. For reference a call trace where reading from /proc/stat failed: sadc: page allocation failure: order:4, mode:0x1040d0 CPU: 1 PID: 192063 Comm: sadc Not tainted 3.10.0-123.el7.s390x #1 [...] Call Trace: show_stack+0x6c/0xe8 warn_alloc_failed+0xd6/0x138 __alloc_pages_nodemask+0x9da/0xb68 __get_free_pages+0x2e/0x58 kmalloc_order_trace+0x44/0xc0 stat_open+0x5a/0xd8 proc_reg_open+0x8a/0x140 do_dentry_open+0x1bc/0x2c8 finish_open+0x46/0x60 do_last+0x382/0x10d0 path_openat+0xc8/0x4f8 do_filp_open+0x46/0xa8 do_sys_open+0x114/0x1f0 sysc_tracego+0x14/0x1a Signed-off-by: Heiko Carstens Tested-by: David Rientjes Cc: Ian Kent Cc: Hendrik Brueckner Cc: Thorsten Diehl Cc: Andrea Righi Cc: Christoph Hellwig Cc: Al Viro Cc: Stefan Bader Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/seq_file.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/fs/seq_file.c b/fs/seq_file.c index 1d641bb108d2..3857b720cb1b 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -8,8 +8,10 @@ #include #include #include +#include #include #include +#include #include #include @@ -30,6 +32,16 @@ static void seq_set_overflow(struct seq_file *m) m->count = m->size; } +static void *seq_buf_alloc(unsigned long size) +{ + void *buf; + + buf = kmalloc(size, GFP_KERNEL | __GFP_NOWARN); + if (!buf && size > PAGE_SIZE) + buf = vmalloc(size); + return buf; +} + /** * seq_open - initialize sequential file * @file: file we initialize @@ -96,7 +108,7 @@ static int traverse(struct seq_file *m, loff_t offset) return 0; } if (!m->buf) { - m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); + m->buf = seq_buf_alloc(m->size = PAGE_SIZE); if (!m->buf) return -ENOMEM; } @@ -135,9 +147,9 @@ static int traverse(struct seq_file *m, loff_t offset) Eoverflow: m->op->stop(m, p); - kfree(m->buf); + kvfree(m->buf); m->count = 0; - m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); + m->buf = seq_buf_alloc(m->size <<= 1); return !m->buf ? -ENOMEM : -EAGAIN; } @@ -192,7 +204,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) /* grab buffer if we didn't have one */ if (!m->buf) { - m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); + m->buf = seq_buf_alloc(m->size = PAGE_SIZE); if (!m->buf) goto Enomem; } @@ -232,9 +244,9 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) if (m->count < m->size) goto Fill; m->op->stop(m, p); - kfree(m->buf); + kvfree(m->buf); m->count = 0; - m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); + m->buf = seq_buf_alloc(m->size <<= 1); if (!m->buf) goto Enomem; m->version = 0; @@ -350,7 +362,7 @@ EXPORT_SYMBOL(seq_lseek); int seq_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; - kfree(m->buf); + kvfree(m->buf); kfree(m); return 0; } @@ -605,13 +617,13 @@ EXPORT_SYMBOL(single_open); int single_open_size(struct file *file, int (*show)(struct seq_file *, void *), void *data, size_t size) { - char *buf = kmalloc(size, GFP_KERNEL); + char *buf = seq_buf_alloc(size); int ret; if (!buf) return -ENOMEM; ret = single_open(file, show, data); if (ret) { - kfree(buf); + kvfree(buf); return ret; } ((struct seq_file *)file->private_data)->buf = buf; From e84f1ab33c76157fd061352b56c4fac9e0f2e8e1 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 2 Jul 2014 15:22:37 -0700 Subject: [PATCH 12/14] tools/testing/selftests/ipc/msgque.c: improve error handling when not running as root The test fails in the middle when it is not run as root while accessing /proc/sys/kernel/msg_next_id. Changed it to check for root at the beginning of the test and exit if not root. Signed-off-by: Shuah Khan Cc: Greg Kroah-Hartman Cc: Davidlohr Bueso Cc: Colin Ian King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/ipc/msgque.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/testing/selftests/ipc/msgque.c b/tools/testing/selftests/ipc/msgque.c index aa290c0de6f5..552f0810bffb 100644 --- a/tools/testing/selftests/ipc/msgque.c +++ b/tools/testing/selftests/ipc/msgque.c @@ -193,6 +193,11 @@ int main(int argc, char **argv) int msg, pid, err; struct msgque_data msgque; + if (getuid() != 0) { + printf("Please run the test as root - Exiting.\n"); + exit(1); + } + msgque.key = ftok(argv[0], 822155650); if (msgque.key == -1) { printf("Can't make key\n"); From d18bbc215f81710e1eab7120becafa910554d68d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 2 Jul 2014 15:22:38 -0700 Subject: [PATCH 13/14] kernel/printk/printk.c: revert "printk: enable interrupts before calling console_trylock_for_printk()" Revert commit 939f04bec1a4 ("printk: enable interrupts before calling console_trylock_for_printk()"). Andreas reported: : None of the post 3.15 kernel boot for me. They all hang at the GRUB : screen telling me it loaded and started the kernel, but the kernel : itself stops before it prints anything (or even replaces the GRUB : background graphics). 939f04bec1a4 is modest latency reduction. Revert it until we understand the reason for these failures. Reported-by: Andreas Bombe Cc: Jan Kara Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 44 +++++++++++++++++------------------------- 1 file changed, 18 insertions(+), 26 deletions(-) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index ea2d5f6962ed..13e839dbca07 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1416,9 +1416,10 @@ static int have_callable_console(void) /* * Can we actually use the console at this time on this cpu? * - * Console drivers may assume that per-cpu resources have been allocated. So - * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't - * call them until this CPU is officially up. + * Console drivers may assume that per-cpu resources have + * been allocated. So unless they're explicitly marked as + * being able to cope (CON_ANYTIME) don't call them until + * this CPU is officially up. */ static inline int can_use_console(unsigned int cpu) { @@ -1431,10 +1432,8 @@ static inline int can_use_console(unsigned int cpu) * console_lock held, and 'console_locked' set) if it * is successful, false otherwise. */ -static int console_trylock_for_printk(void) +static int console_trylock_for_printk(unsigned int cpu) { - unsigned int cpu = smp_processor_id(); - if (!console_trylock()) return 0; /* @@ -1609,8 +1608,7 @@ asmlinkage int vprintk_emit(int facility, int level, */ if (!oops_in_progress && !lockdep_recursing(current)) { recursion_bug = 1; - local_irq_restore(flags); - return 0; + goto out_restore_irqs; } zap_locks(); } @@ -1718,27 +1716,21 @@ asmlinkage int vprintk_emit(int facility, int level, logbuf_cpu = UINT_MAX; raw_spin_unlock(&logbuf_lock); - lockdep_on(); - local_irq_restore(flags); /* If called from the scheduler, we can not call up(). */ - if (in_sched) - return printed_len; - - /* - * Disable preemption to avoid being preempted while holding - * console_sem which would prevent anyone from printing to console - */ - preempt_disable(); - /* - * Try to acquire and then immediately release the console semaphore. - * The release will print out buffers and wake up /dev/kmsg and syslog() - * users. - */ - if (console_trylock_for_printk()) - console_unlock(); - preempt_enable(); + if (!in_sched) { + /* + * Try to acquire and then immediately release the console + * semaphore. The release will print out buffers and wake up + * /dev/kmsg and syslog() users. + */ + if (console_trylock_for_printk(this_cpu)) + console_unlock(); + } + lockdep_on(); +out_restore_irqs: + local_irq_restore(flags); return printed_len; } EXPORT_SYMBOL(vprintk_emit); From 66d2f4d28cd030220e7ea2a628993fcabcb956d1 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 2 Jul 2014 15:22:38 -0700 Subject: [PATCH 14/14] shmem: fix init_page_accessed use to stop !PageLRU bug Under shmem swapping load, I sometimes hit the VM_BUG_ON_PAGE(!PageLRU) in isolate_lru_pages() at mm/vmscan.c:1281! Commit 2457aec63745 ("mm: non-atomically mark page accessed during page cache allocation where possible") looks like interrupted work-in-progress. mm/filemap.c's call to init_page_accessed() is fine, but not mm/shmem.c's - shmem_write_begin() is clearly wrong to use it after shmem_getpage(), when the page is always visible in radix_tree, and often already on LRU. Revert change to shmem_write_begin(), and use init_page_accessed() or mark_page_accessed() appropriately for SGP_WRITE in shmem_getpage_gfp(). SGP_WRITE also covers shmem_symlink(), which did not mark_page_accessed() before; but since many other filesystems use [__]page_symlink(), which did and does mark the page accessed, consider this as rectifying an oversight. Signed-off-by: Hugh Dickins Acked-by: Mel Gorman Cc: Johannes Weiner Cc: Vlastimil Babka Cc: Michal Hocko Cc: Dave Hansen Cc: Prabhakar Lad Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 8f419cff9e34..1140f49b6ded 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1029,6 +1029,9 @@ repeat: goto failed; } + if (page && sgp == SGP_WRITE) + mark_page_accessed(page); + /* fallocated page? */ if (page && !PageUptodate(page)) { if (sgp != SGP_READ) @@ -1110,6 +1113,9 @@ repeat: shmem_recalc_inode(inode); spin_unlock(&info->lock); + if (sgp == SGP_WRITE) + mark_page_accessed(page); + delete_from_swap_cache(page); set_page_dirty(page); swap_free(swap); @@ -1136,6 +1142,9 @@ repeat: __SetPageSwapBacked(page); __set_page_locked(page); + if (sgp == SGP_WRITE) + init_page_accessed(page); + error = mem_cgroup_charge_file(page, current->mm, gfp & GFP_RECLAIM_MASK); if (error) @@ -1412,13 +1421,9 @@ shmem_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { - int ret; struct inode *inode = mapping->host; pgoff_t index = pos >> PAGE_CACHE_SHIFT; - ret = shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); - if (ret == 0 && *pagep) - init_page_accessed(*pagep); - return ret; + return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); } static int