From 3b8967d713d7426e9dd107d065208b84adface91 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Wed, 11 Sep 2013 14:19:37 -0700
Subject: [PATCH 001/303] include/linux/smp.h:on_each_cpu(): switch back to a C function

Revert commit c846ef7deba2 ("include/linux/smp.h:on_each_cpu(): switch
back to a macro").  It turns out that the problematic linux/irqflags.h
include was fixed within ia64 and mn10300.

Cc: Geert Uytterhoeven
Cc: David Daney
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/smp.h | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/include/linux/smp.h b/include/linux/smp.h
index c181399f2c20..c8488763277f 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include

 extern void cpu_idle(void);

@@ -139,14 +140,17 @@ static inline int up_smp_call_function(smp_call_func_t func, void *info)
 }
 #define smp_call_function(func, info, wait) \
			(up_smp_call_function(func, info))
-#define on_each_cpu(func, info, wait) \
-	({ \
-		unsigned long __flags; \
-		local_irq_save(__flags); \
-		func(info); \
-		local_irq_restore(__flags); \
-		0; \
-	})
+
+static inline int on_each_cpu(smp_call_func_t func, void *info, int wait)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	func(info);
+	local_irq_restore(flags);
+	return 0;
+}
+
 /*
  * Note we still need to test the mask even for UP
  * because we actually can get an empty mask from

From e79f525e99b04390ca4d2366309545a836c03bf1 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Wed, 11 Sep 2013 14:19:38 -0700
Subject: [PATCH 002/303] pidns: fix vfork() after unshare(CLONE_NEWPID)

Commit 8382fcac1b81 ("pidns: Outlaw thread creation after
unshare(CLONE_NEWPID)") nacks CLONE_VM if the forking process unshared
pid_ns, which obviously breaks vfork:

	int main(void)
	{
		assert(unshare(CLONE_NEWUSER | CLONE_NEWPID) == 0);
		assert(vfork() >= 0);
		_exit(0);
		return 0;
	}

fails without this patch.

Change this check to use CLONE_SIGHAND instead.  This also forbids
CLONE_THREAD automatically, and this is what the comment implies.

We could probably even drop CLONE_SIGHAND and use CLONE_THREAD, but it
would be safer to not do this.  The current check denies CLONE_SIGHAND
implicitly and there is no reason to change this.

Eric said "CLONE_SIGHAND is fine.  CLONE_THREAD would be even better.
Having shared signal handling between two different pid namespaces is
the case that we are fundamentally guarding against."

Signed-off-by: Oleg Nesterov
Reported-by: Colin Walters
Acked-by: Andy Lutomirski
Reviewed-by: "Eric W. Biederman"
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/fork.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index c9eaf2013002..3561391ca450 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1173,10 +1173,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
		return ERR_PTR(-EINVAL);

	/*
-	 * If the new process will be in a different pid namespace
-	 * don't allow the creation of threads.
+	 * If the new process will be in a different pid namespace don't
+	 * allow it to share a thread group or signal handlers with the
+	 * forking task.
	 */
-	if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) &&
+	if ((clone_flags & (CLONE_SIGHAND | CLONE_NEWPID)) &&
	    (task_active_pid_ns(current) !=
	     current->nsproxy->pid_ns_for_children))
		return ERR_PTR(-EINVAL);

From 5167246a8ad617df55717c2d901da5e2aedffcfa Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Wed, 11 Sep 2013 14:19:40 -0700
Subject: [PATCH 003/303] pidns: kill the unnecessary CLONE_NEWPID in copy_process()

Commit 8382fcac1b81 ("pidns: Outlaw thread creation after
unshare(CLONE_NEWPID)") nacks CLONE_NEWPID if the forking process
unshared pid_ns.  This is correct but unnecessary: copy_pid_ns() does
the same check.

Remove the CLONE_NEWPID check to clean up the code and prepare for the
next change.

Test-case:

	static int child(void *arg)
	{
		return 0;
	}

	static char stack[16 * 1024];

	int main(void)
	{
		pid_t pid;

		assert(unshare(CLONE_NEWUSER | CLONE_NEWPID) == 0);
		pid = clone(child, stack + sizeof(stack) / 2,
			    CLONE_NEWPID | SIGCHLD, NULL);
		assert(pid < 0 && errno == EINVAL);
		return 0;
	}

clone(CLONE_NEWPID) correctly fails with or without this change.

Signed-off-by: Oleg Nesterov
Acked-by: Andy Lutomirski
Cc: "Eric W. Biederman"
Cc: Colin Walters
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/fork.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 3561391ca450..68d508f2bfba 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1177,9 +1177,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
	 * allow it to share a thread group or signal handlers with the
	 * forking task.
	 */
-	if ((clone_flags & (CLONE_SIGHAND | CLONE_NEWPID)) &&
-	    (task_active_pid_ns(current) !=
-	     current->nsproxy->pid_ns_for_children))
+	if ((clone_flags & CLONE_SIGHAND) && (task_active_pid_ns(current) !=
+			current->nsproxy->pid_ns_for_children))
		return ERR_PTR(-EINVAL);

	retval = security_task_create(clone_flags);

From 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Wed, 11 Sep 2013 14:19:41 -0700
Subject: [PATCH 004/303] fork: unify and tighten up CLONE_NEWUSER/CLONE_NEWPID checks

do_fork() denies CLONE_THREAD | CLONE_PARENT if NEWUSER | NEWPID.

Then later copy_process() denies CLONE_SIGHAND if the new process will
be in a different pid namespace (task_active_pid_ns() doesn't match
current->nsproxy->pid_ns).

This looks confusing and inconsistent.  CLONE_NEWPID is very similar to
the case when ->pid_ns was already unshared; we want the same
restrictions, so copy_process() should also nack CLONE_PARENT.  And it
would be better to deny CLONE_NEWUSER && CLONE_SIGHAND as well just for
consistency.

Kill the "CLONE_NEWUSER | CLONE_NEWPID" check in do_fork() and change
copy_process() to do the same check along with the ->pid_ns check we
already have.

Signed-off-by: Oleg Nesterov
Acked-by: Andy Lutomirski
Cc: "Eric W. Biederman"
Cc: Colin Walters
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/fork.c | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 68d508f2bfba..84703db06cf3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1173,13 +1173,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
		return ERR_PTR(-EINVAL);

	/*
-	 * If the new process will be in a different pid namespace don't
-	 * allow it to share a thread group or signal handlers with the
-	 * forking task.
+ * If the new process will be in a different pid or user namespace + * do not allow it to share a thread group or signal handlers or + * parent with the forking task. */ - if ((clone_flags & CLONE_SIGHAND) && (task_active_pid_ns(current) != - current->nsproxy->pid_ns_for_children)) - return ERR_PTR(-EINVAL); + if (clone_flags & (CLONE_SIGHAND | CLONE_PARENT)) { + if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || + (task_active_pid_ns(current) != + current->nsproxy->pid_ns_for_children)) + return ERR_PTR(-EINVAL); + } retval = security_task_create(clone_flags); if (retval) @@ -1575,15 +1578,6 @@ long do_fork(unsigned long clone_flags, int trace = 0; long nr; - /* - * Do some preliminary argument and permissions checking before we - * actually start allocating stuff - */ - if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) { - if (clone_flags & (CLONE_THREAD|CLONE_PARENT)) - return -EINVAL; - } - /* * Determine whether and which event to report to ptracer. When * called from kernel_thread or CLONE_UNTRACED is explicitly From ffd29195ed720188789a6c7ada5e212b9c59e1af Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Wed, 11 Sep 2013 14:19:42 -0700 Subject: [PATCH 005/303] drivers/video/acornfb.c: remove dead code acornfb checks for HAS_VIDC while support for that macro was removed in v2.6.23 (when the arm26 port was removed). So we can remove a bit of dead code. Signed-off-by: Paul Bolle Cc: Florian Tobias Schandinat Cc: Laurent Pinchart Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/acornfb.c | 266 +--------------------------------------- drivers/video/acornfb.h | 29 ----- 2 files changed, 1 insertion(+), 294 deletions(-) diff --git a/drivers/video/acornfb.c b/drivers/video/acornfb.c index 6488a7351a60..7e8346ec9cdc 100644 --- a/drivers/video/acornfb.c +++ b/drivers/video/acornfb.c @@ -37,14 +37,6 @@ #include "acornfb.h" -/* - * VIDC machines can't do 16 or 32BPP modes. - */ -#ifdef HAS_VIDC -#undef FBCON_HAS_CFB16 -#undef FBCON_HAS_CFB32 -#endif - /* * Default resolution. 
* NOTE that it has to be supported in the table towards @@ -106,238 +98,6 @@ static struct vidc_timing current_vidc; extern unsigned int vram_size; /* set by setup.c */ -#ifdef HAS_VIDC - -#define MAX_SIZE 480*1024 - -/* CTL VIDC Actual - * 24.000 0 8.000 - * 25.175 0 8.392 - * 36.000 0 12.000 - * 24.000 1 12.000 - * 25.175 1 12.588 - * 24.000 2 16.000 - * 25.175 2 16.783 - * 36.000 1 18.000 - * 24.000 3 24.000 - * 36.000 2 24.000 - * 25.175 3 25.175 - * 36.000 3 36.000 - */ -struct pixclock { - u_long min_clock; - u_long max_clock; - u_int vidc_ctl; - u_int vid_ctl; -}; - -static struct pixclock arc_clocks[] = { - /* we allow +/-1% on these */ - { 123750, 126250, VIDC_CTRL_DIV3, VID_CTL_24MHz }, /* 8.000MHz */ - { 82500, 84167, VIDC_CTRL_DIV2, VID_CTL_24MHz }, /* 12.000MHz */ - { 61875, 63125, VIDC_CTRL_DIV1_5, VID_CTL_24MHz }, /* 16.000MHz */ - { 41250, 42083, VIDC_CTRL_DIV1, VID_CTL_24MHz }, /* 24.000MHz */ -}; - -static struct pixclock * -acornfb_valid_pixrate(struct fb_var_screeninfo *var) -{ - u_long pixclock = var->pixclock; - u_int i; - - if (!var->pixclock) - return NULL; - - for (i = 0; i < ARRAY_SIZE(arc_clocks); i++) - if (pixclock > arc_clocks[i].min_clock && - pixclock < arc_clocks[i].max_clock) - return arc_clocks + i; - - return NULL; -} - -/* VIDC Rules: - * hcr : must be even (interlace, hcr/2 must be even) - * hswr : must be even - * hdsr : must be odd - * hder : must be odd - * - * vcr : must be odd - * vswr : >= 1 - * vdsr : >= 1 - * vder : >= vdsr - * if interlaced, then hcr/2 must be even - */ -static void -acornfb_set_timing(struct fb_var_screeninfo *var) -{ - struct pixclock *pclk; - struct vidc_timing vidc; - u_int horiz_correction; - u_int sync_len, display_start, display_end, cycle; - u_int is_interlaced; - u_int vid_ctl, vidc_ctl; - u_int bandwidth; - - memset(&vidc, 0, sizeof(vidc)); - - pclk = acornfb_valid_pixrate(var); - vidc_ctl = pclk->vidc_ctl; - vid_ctl = pclk->vid_ctl; - - bandwidth = var->pixclock * 8 / var->bits_per_pixel; - /* 25.175, 4bpp = 79.444ns per byte, 317.776ns per word: fifo = 2,6 */ - if (bandwidth > 143500) - vidc_ctl |= VIDC_CTRL_FIFO_3_7; - else if (bandwidth > 71750) - vidc_ctl |= VIDC_CTRL_FIFO_2_6; - else if (bandwidth > 35875) - vidc_ctl |= VIDC_CTRL_FIFO_1_5; - else - vidc_ctl |= VIDC_CTRL_FIFO_0_4; - - switch (var->bits_per_pixel) { - case 1: - horiz_correction = 19; - vidc_ctl |= VIDC_CTRL_1BPP; - break; - - case 2: - horiz_correction = 11; - vidc_ctl |= VIDC_CTRL_2BPP; - break; - - case 4: - horiz_correction = 7; - vidc_ctl |= VIDC_CTRL_4BPP; - break; - - default: - case 8: - horiz_correction = 5; - vidc_ctl |= VIDC_CTRL_8BPP; - break; - } - - if (var->sync & FB_SYNC_COMP_HIGH_ACT) /* should be FB_SYNC_COMP */ - vidc_ctl |= VIDC_CTRL_CSYNC; - else { - if (!(var->sync & FB_SYNC_HOR_HIGH_ACT)) - vid_ctl |= VID_CTL_HS_NHSYNC; - - if (!(var->sync & FB_SYNC_VERT_HIGH_ACT)) - vid_ctl |= VID_CTL_VS_NVSYNC; - } - - sync_len = var->hsync_len; - display_start = sync_len + var->left_margin; - display_end = display_start + var->xres; - cycle = display_end + var->right_margin; - - /* if interlaced, then hcr/2 must be even */ - is_interlaced = (var->vmode & FB_VMODE_MASK) == FB_VMODE_INTERLACED; - - if (is_interlaced) { - vidc_ctl |= VIDC_CTRL_INTERLACE; - if (cycle & 2) { - cycle += 2; - var->right_margin += 2; - } - } - - vidc.h_cycle = (cycle - 2) / 2; - vidc.h_sync_width = (sync_len - 2) / 2; - vidc.h_border_start = (display_start - 1) / 2; - vidc.h_display_start = (display_start - horiz_correction) / 2; - vidc.h_display_end = 
(display_end - horiz_correction) / 2; - vidc.h_border_end = (display_end - 1) / 2; - vidc.h_interlace = (vidc.h_cycle + 1) / 2; - - sync_len = var->vsync_len; - display_start = sync_len + var->upper_margin; - display_end = display_start + var->yres; - cycle = display_end + var->lower_margin; - - if (is_interlaced) - cycle = (cycle - 3) / 2; - else - cycle = cycle - 1; - - vidc.v_cycle = cycle; - vidc.v_sync_width = sync_len - 1; - vidc.v_border_start = display_start - 1; - vidc.v_display_start = vidc.v_border_start; - vidc.v_display_end = display_end - 1; - vidc.v_border_end = vidc.v_display_end; - - if (machine_is_a5k()) - __raw_writeb(vid_ctl, IOEB_VID_CTL); - - if (memcmp(¤t_vidc, &vidc, sizeof(vidc))) { - current_vidc = vidc; - - vidc_writel(0xe0000000 | vidc_ctl); - vidc_writel(0x80000000 | (vidc.h_cycle << 14)); - vidc_writel(0x84000000 | (vidc.h_sync_width << 14)); - vidc_writel(0x88000000 | (vidc.h_border_start << 14)); - vidc_writel(0x8c000000 | (vidc.h_display_start << 14)); - vidc_writel(0x90000000 | (vidc.h_display_end << 14)); - vidc_writel(0x94000000 | (vidc.h_border_end << 14)); - vidc_writel(0x98000000); - vidc_writel(0x9c000000 | (vidc.h_interlace << 14)); - vidc_writel(0xa0000000 | (vidc.v_cycle << 14)); - vidc_writel(0xa4000000 | (vidc.v_sync_width << 14)); - vidc_writel(0xa8000000 | (vidc.v_border_start << 14)); - vidc_writel(0xac000000 | (vidc.v_display_start << 14)); - vidc_writel(0xb0000000 | (vidc.v_display_end << 14)); - vidc_writel(0xb4000000 | (vidc.v_border_end << 14)); - vidc_writel(0xb8000000); - vidc_writel(0xbc000000); - } -#ifdef DEBUG_MODE_SELECTION - printk(KERN_DEBUG "VIDC registers for %dx%dx%d:\n", var->xres, - var->yres, var->bits_per_pixel); - printk(KERN_DEBUG " H-cycle : %d\n", vidc.h_cycle); - printk(KERN_DEBUG " H-sync-width : %d\n", vidc.h_sync_width); - printk(KERN_DEBUG " H-border-start : %d\n", vidc.h_border_start); - printk(KERN_DEBUG " H-display-start : %d\n", vidc.h_display_start); - printk(KERN_DEBUG " H-display-end : %d\n", vidc.h_display_end); - printk(KERN_DEBUG " H-border-end : %d\n", vidc.h_border_end); - printk(KERN_DEBUG " H-interlace : %d\n", vidc.h_interlace); - printk(KERN_DEBUG " V-cycle : %d\n", vidc.v_cycle); - printk(KERN_DEBUG " V-sync-width : %d\n", vidc.v_sync_width); - printk(KERN_DEBUG " V-border-start : %d\n", vidc.v_border_start); - printk(KERN_DEBUG " V-display-start : %d\n", vidc.v_display_start); - printk(KERN_DEBUG " V-display-end : %d\n", vidc.v_display_end); - printk(KERN_DEBUG " V-border-end : %d\n", vidc.v_border_end); - printk(KERN_DEBUG " VIDC Ctrl (E) : 0x%08X\n", vidc_ctl); - printk(KERN_DEBUG " IOEB Ctrl : 0x%08X\n", vid_ctl); -#endif -} - -static int -acornfb_setcolreg(u_int regno, u_int red, u_int green, u_int blue, - u_int trans, struct fb_info *info) -{ - union palette pal; - - if (regno >= current_par.palette_size) - return 1; - - pal.p = 0; - pal.vidc.reg = regno; - pal.vidc.red = red >> 12; - pal.vidc.green = green >> 12; - pal.vidc.blue = blue >> 12; - - current_par.palette[regno] = pal; - - vidc_writel(pal.p); - - return 0; -} -#endif - #ifdef HAS_VIDC20 #include @@ -634,16 +394,7 @@ acornfb_adjust_timing(struct fb_info *info, struct fb_var_screeninfo *var, u_int /* hsync_len must be even */ var->hsync_len = (var->hsync_len + 1) & ~1; -#ifdef HAS_VIDC - /* left_margin must be odd */ - if ((var->left_margin & 1) == 0) { - var->left_margin -= 1; - var->right_margin += 1; - } - - /* right_margin must be odd */ - var->right_margin |= 1; -#elif defined(HAS_VIDC20) +#if defined(HAS_VIDC20) /* 
left_margin must be even */ if (var->left_margin & 1) { var->left_margin += 1; @@ -787,11 +538,7 @@ static int acornfb_set_par(struct fb_info *info) break; case 8: current_par.palette_size = VIDC_PALETTE_SIZE; -#ifdef HAS_VIDC - info->fix.visual = FB_VISUAL_STATIC_PSEUDOCOLOR; -#else info->fix.visual = FB_VISUAL_PSEUDOCOLOR; -#endif break; #ifdef HAS_VIDC20 case 16: @@ -971,9 +718,6 @@ static void acornfb_init_fbinfo(void) #if defined(HAS_VIDC20) fb_info.var.red.length = 8; fb_info.var.transp.length = 4; -#elif defined(HAS_VIDC) - fb_info.var.red.length = 4; - fb_info.var.transp.length = 1; #endif fb_info.var.green = fb_info.var.red; fb_info.var.blue = fb_info.var.red; @@ -1310,14 +1054,6 @@ static int acornfb_probe(struct platform_device *dev) fb_info.fix.smem_start = handle; } #endif -#if defined(HAS_VIDC) - /* - * Archimedes/A5000 machines use a fixed address for their - * framebuffers. Free unused pages - */ - free_unused_pages(PAGE_OFFSET + size, PAGE_OFFSET + MAX_SIZE); -#endif - fb_info.fix.smem_len = size; current_par.palette_size = VIDC_PALETTE_SIZE; diff --git a/drivers/video/acornfb.h b/drivers/video/acornfb.h index fb2a7fffe506..175c8ff3367c 100644 --- a/drivers/video/acornfb.h +++ b/drivers/video/acornfb.h @@ -13,10 +13,6 @@ #include #define VIDC_PALETTE_SIZE 256 #define VIDC_NAME "VIDC20" -#elif defined(HAS_VIDC) -#include -#define VIDC_PALETTE_SIZE 16 -#define VIDC_NAME "VIDC" #endif #define EXTEND8(x) ((x)|(x)<<8) @@ -101,31 +97,6 @@ struct modex_params { const struct modey_params *modey; }; -#ifdef HAS_VIDC - -#define VID_CTL_VS_NVSYNC (1 << 3) -#define VID_CTL_HS_NHSYNC (1 << 2) -#define VID_CTL_24MHz (0) -#define VID_CTL_25MHz (1) -#define VID_CTL_36MHz (2) - -#define VIDC_CTRL_CSYNC (1 << 7) -#define VIDC_CTRL_INTERLACE (1 << 6) -#define VIDC_CTRL_FIFO_0_4 (0 << 4) -#define VIDC_CTRL_FIFO_1_5 (1 << 4) -#define VIDC_CTRL_FIFO_2_6 (2 << 4) -#define VIDC_CTRL_FIFO_3_7 (3 << 4) -#define VIDC_CTRL_1BPP (0 << 2) -#define VIDC_CTRL_2BPP (1 << 2) -#define VIDC_CTRL_4BPP (2 << 2) -#define VIDC_CTRL_8BPP (3 << 2) -#define VIDC_CTRL_DIV3 (0 << 0) -#define VIDC_CTRL_DIV2 (1 << 0) -#define VIDC_CTRL_DIV1_5 (2 << 0) -#define VIDC_CTRL_DIV1 (3 << 0) - -#endif - #ifdef HAS_VIDC20 /* * VIDC20 registers From 5e42781caf6f5f1c77e842d6dcbbf5c51b8b2c29 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Wed, 11 Sep 2013 14:19:43 -0700 Subject: [PATCH 006/303] drivers/iommu: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure. Thus, it is not needed to manually clear the device driver data to NULL. 
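For illustration, a minimal sketch of the now-redundant pattern, using
made-up "foo" names rather than the drivers touched here; the driver
core performs the equivalent of dev_set_drvdata(dev, NULL) itself once
the driver is unbound:

	struct foo { int dummy; };

	static int foo_probe(struct platform_device *pdev)
	{
		struct foo *priv = devm_kzalloc(&pdev->dev, sizeof(*priv),
						GFP_KERNEL);
		if (!priv)
			return -ENOMEM;
		platform_set_drvdata(pdev, priv);
		return 0;
	}

	static int foo_remove(struct platform_device *pdev)
	{
		struct foo *priv = platform_get_drvdata(pdev);

		/* ... tear down priv as needed ... */
		/* platform_set_drvdata(pdev, NULL);  <- redundant: the
		 * driver core clears drvdata after remove() and on
		 * probe failure */
		return 0;
	}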
Signed-off-by: Jingoo Han
Cc: David Brown
Cc: Stephen Boyd
Cc: Joerg Roedel
Cc: Suman Anna
Acked-by: Libo Chen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/iommu/msm_iommu_dev.c | 2 --
 drivers/iommu/omap-iommu.c    | 2 --
 2 files changed, 4 deletions(-)

diff --git a/drivers/iommu/msm_iommu_dev.c b/drivers/iommu/msm_iommu_dev.c
index 0a1c9626aa9e..08ba4972da9d 100644
--- a/drivers/iommu/msm_iommu_dev.c
+++ b/drivers/iommu/msm_iommu_dev.c
@@ -282,7 +282,6 @@ static int msm_iommu_remove(struct platform_device *pdev)
		clk_put(drv->pclk);
		memset(drv, 0, sizeof(*drv));
		kfree(drv);
-		platform_set_drvdata(pdev, NULL);
	}
	return 0;
 }
@@ -366,7 +365,6 @@ static int msm_iommu_ctx_remove(struct platform_device *pdev)
	if (drv) {
		memset(drv, 0, sizeof(struct msm_iommu_ctx_drvdata));
		kfree(drv);
-		platform_set_drvdata(pdev, NULL);
	}
	return 0;
 }
diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c
index 0ba3766240d5..bcd78a720630 100644
--- a/drivers/iommu/omap-iommu.c
+++ b/drivers/iommu/omap-iommu.c
@@ -1008,8 +1008,6 @@ static int omap_iommu_remove(struct platform_device *pdev)
	struct resource *res;
	struct omap_iommu *obj = platform_get_drvdata(pdev);

-	platform_set_drvdata(pdev, NULL);
-
	iopgtable_clear_entry_all(obj);

	irq = platform_get_irq(pdev, 0);

From 2b1e55c389105b722cccadfa47f5615f57d8887f Mon Sep 17 00:00:00 2001
From: Younger Liu
Date: Wed, 11 Sep 2013 14:19:44 -0700
Subject: [PATCH 007/303] ocfs2: lighten up allocate transaction

The issue scenario is as follows: when fallocating a very large amount
of disk space for a small file, __ocfs2_extend_allocation attempts to
get a very large transaction.  For some journal sizes, there may not be
enough room for this transaction, and the fallocate will fail.

The patch below extends & restarts the transaction as necessary while
allocating space, and should work with even the smallest journal.  This
patch refers to ext4 resize.

Test:
# mkfs.ocfs2 -b 4K -C 32K -T datafiles /dev/sdc
...(journal size is 32M)
# mount.ocfs2 /dev/sdc /mnt/ocfs2/
# touch /mnt/ocfs2/1.log
# fallocate -o 0 -l 400G /mnt/ocfs2/1.log
fallocate: /mnt/ocfs2/1.log: fallocate failed: Cannot allocate memory
# tail -f /var/log/messages
[ 7372.278591] JBD: fallocate wants too many credits (2051 > 2048)
[ 7372.278597] (fallocate,6438,0):__ocfs2_extend_allocation:709 ERROR: status = -12
[ 7372.278603] (fallocate,6438,0):ocfs2_allocate_unwritten_extents:1504 ERROR: status = -12
[ 7372.278607] (fallocate,6438,0):__ocfs2_change_file_space:1955 ERROR: status = -12
^C

With this patch, the test works well.

Signed-off-by: Younger Liu
Cc: Jie Liu
Cc: Joel Becker
Cc: Mark Fasheh
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/file.c        |  6 +-----
 fs/ocfs2/journal.c     | 35 +++++++++++++++++++++++++++++++++++
 fs/ocfs2/journal.h     | 11 +++++++++++
 fs/ocfs2/ocfs2_trace.h |  2 ++
 4 files changed, 49 insertions(+), 5 deletions(-)

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 3261d71319ee..409c549ae02a 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -671,11 +671,7 @@ restarted_transaction:
	} else {
		BUG_ON(why != RESTART_TRANS);

-		/* TODO: This can be more intelligent. */
-		credits = ocfs2_calc_extend_credits(osb->sb,
-						    &fe->id2.i_list,
-						    clusters_to_add);
-		status = ocfs2_extend_trans(handle, credits);
+		status = ocfs2_allocate_extend_trans(handle, 1);
		if (status < 0) {
			/* handle still has to be committed at
			 * this point.
*/ diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 242170d83971..a126cb37ca4d 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -455,6 +455,41 @@ bail: return status; } +/* + * If we have fewer than thresh credits, extend by OCFS2_MAX_TRANS_DATA. + * If that fails, restart the transaction & regain write access for the + * buffer head which is used for metadata modifications. + * Taken from Ext4: extend_or_restart_transaction() + */ +int ocfs2_allocate_extend_trans(handle_t *handle, int thresh) +{ + int status, old_nblks; + + BUG_ON(!handle); + + old_nblks = handle->h_buffer_credits; + trace_ocfs2_allocate_extend_trans(old_nblks, thresh); + + if (old_nblks < thresh) + return 0; + + status = jbd2_journal_extend(handle, OCFS2_MAX_TRANS_DATA); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (status > 0) { + status = jbd2_journal_restart(handle, OCFS2_MAX_TRANS_DATA); + if (status < 0) + mlog_errno(status); + } + +bail: + return status; +} + + struct ocfs2_triggers { struct jbd2_buffer_trigger_type ot_triggers; int ot_offset; diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 0a992737dcaf..0b479bab3671 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -258,6 +258,17 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int ocfs2_commit_trans(struct ocfs2_super *osb, handle_t *handle); int ocfs2_extend_trans(handle_t *handle, int nblocks); +int ocfs2_allocate_extend_trans(handle_t *handle, + int thresh); + +/* + * Define an arbitrary limit for the amount of data we will anticipate + * writing to any given transaction. For unbounded transactions such as + * fallocate(2) we can write more than this, but we always + * start off at the maximum transaction size and grow the transaction + * optimistically as we go. + */ +#define OCFS2_MAX_TRANS_DATA 64U /* * Create access is for when we get a newly created buffer and we're diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index 3b481f490633..1b60c62aa9d6 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -2579,6 +2579,8 @@ DEFINE_OCFS2_INT_INT_EVENT(ocfs2_extend_trans); DEFINE_OCFS2_INT_EVENT(ocfs2_extend_trans_restart); +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_allocate_extend_trans); + DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_journal_access); DEFINE_OCFS2_ULL_EVENT(ocfs2_journal_dirty); From f17c20dd2ec81e8ff328b81bc847da9429d0975b Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Wed, 11 Sep 2013 14:19:45 -0700 Subject: [PATCH 008/303] ocfs2: use i_size_read() to access i_size Though ocfs2 uses inode->i_mutex to protect i_size, there are both i_size_read/write() and direct accesses. Clean up all direct access to eliminate confusion. 
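One note on why the accessors matter: on 32-bit SMP kernels loff_t is
64 bits wide, so a bare load of inode->i_size can tear.  i_size_read()
retries the read under the inode's seqcount and i_size_write() bumps
that seqcount, which is also why writers must still be serialized
externally (by i_mutex here).  A minimal sketch of the pattern this
converts to, with invented variable names:

	loff_t end = pos + copied;

	if (end > i_size_read(inode)) {		/* torn-read safe */
		i_size_write(inode, end);	/* caller holds i_mutex */
		mark_inode_dirty(inode);
	}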
Signed-off-by: Junxiao Bi Cc: Jie Liu Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.c | 2 +- fs/ocfs2/extent_map.c | 10 +++++----- fs/ocfs2/ioctl.c | 2 +- fs/ocfs2/journal.c | 8 ++++---- fs/ocfs2/move_extents.c | 2 +- fs/ocfs2/quota_global.c | 6 +++--- fs/ocfs2/quota_local.c | 12 ++++++------ 7 files changed, 21 insertions(+), 21 deletions(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 94417a85ce6e..f37d3c0e2053 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2044,7 +2044,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping, out_write_size: pos += copied; - if (pos > inode->i_size) { + if (pos > i_size_read(inode)) { i_size_write(inode, pos); mark_inode_dirty(inode); } diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 2487116d0d33..4bf2b763467f 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -852,20 +852,20 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence) down_read(&OCFS2_I(inode)->ip_alloc_sem); - if (*offset >= inode->i_size) { + if (*offset >= i_size_read(inode)) { ret = -ENXIO; goto out_unlock; } if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { if (whence == SEEK_HOLE) - *offset = inode->i_size; + *offset = i_size_read(inode); goto out_unlock; } clen = 0; cpos = *offset >> cs_bits; - cend = ocfs2_clusters_for_bytes(inode->i_sb, inode->i_size); + cend = ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)); while (cpos < cend && !is_last) { ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, &hole_size, @@ -904,8 +904,8 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence) extlen = clen; extlen <<= cs_bits; - if ((extoff + extlen) > inode->i_size) - extlen = inode->i_size - extoff; + if ((extoff + extlen) > i_size_read(inode)) + extlen = i_size_read(inode) - extoff; extoff += extlen; if (extoff > *offset) *offset = extoff; diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 0c60ef2d8056..fa32ce9b455d 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -303,7 +303,7 @@ int ocfs2_info_handle_journal_size(struct inode *inode, if (o2info_from_user(oij, req)) goto bail; - oij.ij_journal_size = osb->journal->j_inode->i_size; + oij.ij_journal_size = i_size_read(osb->journal->j_inode); o2info_set_request_filled(&oij.ij_req); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index a126cb37ca4d..44fc3e530c3d 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -836,14 +836,14 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) inode_lock = 1; di = (struct ocfs2_dinode *)bh->b_data; - if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) { + if (i_size_read(inode) < OCFS2_MIN_JOURNAL_SIZE) { mlog(ML_ERROR, "Journal file size (%lld) is too small!\n", - inode->i_size); + i_size_read(inode)); status = -EINVAL; goto done; } - trace_ocfs2_journal_init(inode->i_size, + trace_ocfs2_journal_init(i_size_read(inode), (unsigned long long)inode->i_blocks, OCFS2_I(inode)->ip_clusters); @@ -1131,7 +1131,7 @@ static int ocfs2_force_read_journal(struct inode *inode) memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL); - num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, inode->i_size); + num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); v_blkno = 0; while (v_blkno < num_blocks) { status = ocfs2_extent_map_get_blocks(inode, v_blkno, diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 452068b45749..415928536c5e 100644 --- 
a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -845,7 +845,7 @@ static int __ocfs2_move_extents_range(struct buffer_head *di_bh, struct ocfs2_move_extents *range = context->range; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - if ((inode->i_size == 0) || (range->me_len == 0)) + if ((i_size_read(inode) == 0) || (range->me_len == 0)) return 0; if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 332a281f217e..aaa50611ec66 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -234,7 +234,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type, len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset; } - if (gqinode->i_size < off + len) { + if (i_size_read(gqinode) < off + len) { loff_t rounded_end = ocfs2_align_bytes_to_blocks(sb, off + len); @@ -778,8 +778,8 @@ static int ocfs2_acquire_dquot(struct dquot *dquot) */ WARN_ON(journal_current_handle()); status = ocfs2_extend_no_holes(gqinode, NULL, - gqinode->i_size + (need_alloc << sb->s_blocksize_bits), - gqinode->i_size); + i_size_read(gqinode) + (need_alloc << sb->s_blocksize_bits), + i_size_read(gqinode)); if (status < 0) goto out_dq; } diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 27fe7ee4874c..2e4344be3b96 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -982,14 +982,14 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( /* We are protected by dqio_sem so no locking needed */ status = ocfs2_extend_no_holes(lqinode, NULL, - lqinode->i_size + 2 * sb->s_blocksize, - lqinode->i_size); + i_size_read(lqinode) + 2 * sb->s_blocksize, + i_size_read(lqinode)); if (status < 0) { mlog_errno(status); goto out; } status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh, - lqinode->i_size + 2 * sb->s_blocksize); + i_size_read(lqinode) + 2 * sb->s_blocksize); if (status < 0) { mlog_errno(status); goto out; @@ -1125,14 +1125,14 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( /* We are protected by dqio_sem so no locking needed */ status = ocfs2_extend_no_holes(lqinode, NULL, - lqinode->i_size + sb->s_blocksize, - lqinode->i_size); + i_size_read(lqinode) + sb->s_blocksize, + i_size_read(lqinode)); if (status < 0) { mlog_errno(status); goto out; } status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh, - lqinode->i_size + sb->s_blocksize); + i_size_read(lqinode) + sb->s_blocksize); if (status < 0) { mlog_errno(status); goto out; From 98ac9125c5afed8c5d2e4c5824988f8ad51814e1 Mon Sep 17 00:00:00 2001 From: Xue jiufei Date: Wed, 11 Sep 2013 14:19:46 -0700 Subject: [PATCH 009/303] ocfs2: dlm_request_all_locks() should deal with the status sent from target node dlm_request_all_locks() should deal with the status sent from target node if DLM_LOCK_REQUEST_MSG is sent successfully, or recovery master will fall into endless loop, waiting for other nodes to send locks and DLM_RECO_DATA_DONE_MSG to me. NodeA NodeB selected as recovery master dlm_remaster_locks() ->dlm_request_all_locks() send DLM_LOCK_REQUEST_MSG to nodeA It happened that NodeA cannot alloc memory when it processes this message. dlm_request_all_locks_handler() do not queue dlm_request_all_locks_worker and returns -ENOMEM. It will never send locks and DLM_RECO_DATA_DONE_MSG to NodeB. NodeB do not deal with the status sent from nodeA, and will fall in endless loop waiting for the recovery state of NodeA to be changed. 
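A minimal sketch of the convention at work, assuming o2net's usual
semantics (send_and_check is a made-up helper, not part of this patch):
o2net_send_message() returns a negative errno only for transport
failures, while the remote handler's own return code comes back through
the final status argument, so both must be folded into the result:

	static int send_and_check(struct dlm_ctxt *dlm, u8 node,
				  struct dlm_lock_request *lr)
	{
		int status = 0;
		int ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key,
					     lr, sizeof(*lr), node, &status);

		/* transport error first, then the handler's verdict */
		return ret < 0 ? ret : status;
	}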
Signed-off-by: joyce
Cc: Mark Fasheh
Cc: Jeff Liu
Cc: Joel Becker
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/dlm/dlmrecovery.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 773bd32bfd8c..f94550218152 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -787,6 +787,7 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
 {
	struct dlm_lock_request lr;
	int ret;
+	int status;

	mlog(0, "\n");

@@ -800,13 +801,15 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,

	// send message
	ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key,
-				 &lr, sizeof(lr), request_from, NULL);
+				 &lr, sizeof(lr), request_from, &status);

	/* negative status is handled by caller */
	if (ret < 0)
		mlog(ML_ERROR, "%s: Error %d send LOCK_REQUEST to node %u "
		     "to recover dead node %u\n", dlm->name, ret,
		     request_from, dead_node);
+	else
+		ret = status;
	// return from here, then
	// sleep until all received or error
	return ret;

From 7e9b79370733945b25c24e09d663b07c3936d10c Mon Sep 17 00:00:00 2001
From: Younger Liu
Date: Wed, 11 Sep 2013 14:19:47 -0700
Subject: [PATCH 010/303] ocfs2: ac_bits_wanted should be local_alloc_bits when returns -ENOSPC

There is an issue in reserving and claiming space for localalloc.  When
localalloc space is not enough, it claims space from global_bitmap.
And if there is not enough free space in global_bitmap, the claimed
size is set to half of the original size and the claim is retried.

The issue is as follows: osb->local_alloc_bits is set to half of the
original size in ocfs2_recalc_la_window(), but ac->ac_bits_wanted is
set to osb->local_alloc_default_bits, which is not changed.  So
localalloc always reserves and claims local_alloc_default_bits space
and returns ENOSPC.

So, ac->ac_bits_wanted should be osb->local_alloc_bits, which does get
updated.

Signed-off-by: Younger Liu
Cc: Joel Becker
Cc: Mark Fasheh
Cc: Jeff Liu
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/localalloc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index aebeacd807c3..cd5496b7a0a3 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -1082,7 +1082,7 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
	}

retry_enospc:
-	(*ac)->ac_bits_wanted = osb->local_alloc_default_bits;
+	(*ac)->ac_bits_wanted = osb->local_alloc_bits;
	status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
	if (status == -ENOSPC) {
		if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) ==
@@ -1154,7 +1154,7 @@ retry_enospc:
		    OCFS2_LA_DISABLED)
			goto bail;

-		ac->ac_bits_wanted = osb->local_alloc_default_bits;
+		ac->ac_bits_wanted = osb->local_alloc_bits;
		status = ocfs2_claim_clusters(handle, ac,
					      osb->local_alloc_bits,
					      &cluster_off,

From 8dd7903e48df3779bc424196c22dc73b66d0643e Mon Sep 17 00:00:00 2001
From: Sunil Mushran
Date: Wed, 11 Sep 2013 14:19:49 -0700
Subject: [PATCH 011/303] fs/ocfs2/cluster/tcp.c: fix possible null pointer dereferences

Fix some possible null pointer dereferences that were detected by the
static code analyser, smatch.
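The hazard smatch flags, in miniature (made-up names, a sketch rather
than a quote of the code below): the NULL test must dominate every
dereference, instead of coming after the pointer has already been used:

	/* buggy: dereferenced in the message, checked afterwards */
	if (err) {
		pr_err("widget %s failed\n", w->name);
		if (w)
			widget_shutdown(w);
	}

	/* fixed: the check guards the dereference */
	if (err && w) {
		pr_err("widget %s failed\n", w->name);
		widget_shutdown(w);
	}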
Signed-off-by: Sunil Mushran Reported-by: Dan Carpenter Reported-by: Guozhonghua Cc: Sunil Mushran Cc: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/tcp.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index d644dc611425..d04a3c2fad3c 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -543,8 +543,9 @@ static void o2net_set_nn_state(struct o2net_node *nn, } if (was_valid && !valid) { - printk(KERN_NOTICE "o2net: No longer connected to " - SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc)); + if (old_sc) + printk(KERN_NOTICE "o2net: No longer connected to " + SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc)); o2net_complete_nodes_nsw(nn); } @@ -1695,13 +1696,12 @@ static void o2net_start_connect(struct work_struct *work) ret = 0; out: - if (ret) { + if (ret && sc) { printk(KERN_NOTICE "o2net: Connect attempt to " SC_NODEF_FMT " failed with errno %d\n", SC_NODEF_ARGS(sc), ret); /* 0 err so that another will be queued and attempted * from set_nn_state */ - if (sc) - o2net_ensure_shutdown(nn, sc, 0); + o2net_ensure_shutdown(nn, sc, 0); } if (sc) sc_put(sc); From df53cd3b70712cd136f10ef79457623c5c3764a4 Mon Sep 17 00:00:00 2001 From: Dong Fang Date: Wed, 11 Sep 2013 14:19:50 -0700 Subject: [PATCH 012/303] ocfs2: use list_for_each_entry() instead of list_for_each() [dan.carpenter@oracle.com: fix up some NULL dereference bugs] Signed-off-by: Dong Fang Cc: Mark Fasheh Cc: Joel Becker Cc: Jeff Liu Signed-off-by: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/heartbeat.c | 14 +++++--------- fs/ocfs2/dlm/dlmast.c | 8 +++----- fs/ocfs2/dlm/dlmcommon.h | 4 +--- fs/ocfs2/dlm/dlmconvert.c | 18 +++++++----------- fs/ocfs2/dlm/dlmdebug.c | 15 ++++----------- fs/ocfs2/dlm/dlmdomain.c | 35 ++++++++++++----------------------- fs/ocfs2/dlm/dlmlock.c | 9 ++------- fs/ocfs2/dlm/dlmmaster.c | 18 +++++------------- fs/ocfs2/dlm/dlmthread.c | 19 +++++-------------- fs/ocfs2/dlm/dlmunlock.c | 4 +--- 10 files changed, 45 insertions(+), 99 deletions(-) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 5c1c864e81cc..25b72e82b8fa 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -628,11 +628,9 @@ static void o2hb_fire_callbacks(struct o2hb_callback *hbcall, struct o2nm_node *node, int idx) { - struct list_head *iter; struct o2hb_callback_func *f; - list_for_each(iter, &hbcall->list) { - f = list_entry(iter, struct o2hb_callback_func, hc_item); + list_for_each_entry(f, &hbcall->list, hc_item) { mlog(ML_HEARTBEAT, "calling funcs %p\n", f); (f->hc_func)(node, idx, f->hc_data); } @@ -2516,8 +2514,7 @@ unlock: int o2hb_register_callback(const char *region_uuid, struct o2hb_callback_func *hc) { - struct o2hb_callback_func *tmp; - struct list_head *iter; + struct o2hb_callback_func *f; struct o2hb_callback *hbcall; int ret; @@ -2540,10 +2537,9 @@ int o2hb_register_callback(const char *region_uuid, down_write(&o2hb_callback_sem); - list_for_each(iter, &hbcall->list) { - tmp = list_entry(iter, struct o2hb_callback_func, hc_item); - if (hc->hc_priority < tmp->hc_priority) { - list_add_tail(&hc->hc_item, iter); + list_for_each_entry(f, &hbcall->list, hc_item) { + if (hc->hc_priority < f->hc_priority) { + list_add_tail(&hc->hc_item, &f->hc_item); break; } } diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index fbec0be62326..b46278f9ae44 100644 --- 
a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -292,7 +292,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, struct dlm_lock *lock = NULL; struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf; char *name; - struct list_head *iter, *head=NULL; + struct list_head *head = NULL; __be64 cookie; u32 flags; u8 node; @@ -373,8 +373,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, /* try convert queue for both ast/bast */ head = &res->converting; lock = NULL; - list_for_each(iter, head) { - lock = list_entry (iter, struct dlm_lock, list); + list_for_each_entry(lock, head, list) { if (lock->ml.cookie == cookie) goto do_ast; } @@ -385,8 +384,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, else head = &res->granted; - list_for_each(iter, head) { - lock = list_entry (iter, struct dlm_lock, list); + list_for_each_entry(lock, head, list) { if (lock->ml.cookie == cookie) goto do_ast; } diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index de854cca12a2..e0517762fcc0 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -1079,11 +1079,9 @@ static inline int dlm_lock_compatible(int existing, int request) static inline int dlm_lock_on_list(struct list_head *head, struct dlm_lock *lock) { - struct list_head *iter; struct dlm_lock *tmplock; - list_for_each(iter, head) { - tmplock = list_entry(iter, struct dlm_lock, list); + list_for_each_entry(tmplock, head, list) { if (tmplock == lock) return 1; } diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index 29a886d1e82c..e36d63ff1783 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -123,7 +123,6 @@ static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm, int *kick_thread) { enum dlm_status status = DLM_NORMAL; - struct list_head *iter; struct dlm_lock *tmplock=NULL; assert_spin_locked(&res->spinlock); @@ -185,16 +184,14 @@ static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm, /* upconvert from here on */ status = DLM_NORMAL; - list_for_each(iter, &res->granted) { - tmplock = list_entry(iter, struct dlm_lock, list); + list_for_each_entry(tmplock, &res->granted, list) { if (tmplock == lock) continue; if (!dlm_lock_compatible(tmplock->ml.type, type)) goto switch_queues; } - list_for_each(iter, &res->converting) { - tmplock = list_entry(iter, struct dlm_lock, list); + list_for_each_entry(tmplock, &res->converting, list) { if (!dlm_lock_compatible(tmplock->ml.type, type)) goto switch_queues; /* existing conversion requests take precedence */ @@ -424,8 +421,8 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data, struct dlm_ctxt *dlm = data; struct dlm_convert_lock *cnv = (struct dlm_convert_lock *)msg->buf; struct dlm_lock_resource *res = NULL; - struct list_head *iter; struct dlm_lock *lock = NULL; + struct dlm_lock *tmp_lock; struct dlm_lockstatus *lksb; enum dlm_status status = DLM_NORMAL; u32 flags; @@ -471,14 +468,13 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data, dlm_error(status); goto leave; } - list_for_each(iter, &res->granted) { - lock = list_entry(iter, struct dlm_lock, list); - if (lock->ml.cookie == cnv->cookie && - lock->ml.node == cnv->node_idx) { + list_for_each_entry(tmp_lock, &res->granted, list) { + if (tmp_lock->ml.cookie == cnv->cookie && + tmp_lock->ml.node == cnv->node_idx) { + lock = tmp_lock; dlm_lock_get(lock); break; } - lock = NULL; } spin_unlock(&res->spinlock); if (!lock) { diff --git 
a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 0e28e242226d..e33cd7a3c582 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -96,7 +96,6 @@ static void __dlm_print_lock(struct dlm_lock *lock) void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) { - struct list_head *iter2; struct dlm_lock *lock; char buf[DLM_LOCKID_NAME_MAX]; @@ -118,18 +117,15 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) res->inflight_locks, atomic_read(&res->asts_reserved)); dlm_print_lockres_refmap(res); printk(" granted queue:\n"); - list_for_each(iter2, &res->granted) { - lock = list_entry(iter2, struct dlm_lock, list); + list_for_each_entry(lock, &res->granted, list) { __dlm_print_lock(lock); } printk(" converting queue:\n"); - list_for_each(iter2, &res->converting) { - lock = list_entry(iter2, struct dlm_lock, list); + list_for_each_entry(lock, &res->converting, list) { __dlm_print_lock(lock); } printk(" blocked queue:\n"); - list_for_each(iter2, &res->blocked) { - lock = list_entry(iter2, struct dlm_lock, list); + list_for_each_entry(lock, &res->blocked, list) { __dlm_print_lock(lock); } } @@ -446,7 +442,6 @@ static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len) { struct dlm_master_list_entry *mle; struct hlist_head *bucket; - struct hlist_node *list; int i, out = 0; unsigned long total = 0, longest = 0, bucket_count = 0; @@ -456,9 +451,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len) spin_lock(&dlm->master_lock); for (i = 0; i < DLM_HASH_BUCKETS; i++) { bucket = dlm_master_hash(dlm, i); - hlist_for_each(list, bucket) { - mle = hlist_entry(list, struct dlm_master_list_entry, - master_hash_node); + hlist_for_each_entry(mle, bucket, master_hash_node) { ++total; ++bucket_count; if (len - out < 200) diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index dbb17c07656a..8b3382abf840 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -193,7 +193,7 @@ struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, unsigned int hash) { struct hlist_head *bucket; - struct hlist_node *list; + struct dlm_lock_resource *res; mlog(0, "%.*s\n", len, name); @@ -201,9 +201,7 @@ struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, bucket = dlm_lockres_hash(dlm, hash); - hlist_for_each(list, bucket) { - struct dlm_lock_resource *res = hlist_entry(list, - struct dlm_lock_resource, hash_node); + hlist_for_each_entry(res, bucket, hash_node) { if (res->lockname.name[0] != name[0]) continue; if (unlikely(res->lockname.len != len)) @@ -262,22 +260,19 @@ struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, static struct dlm_ctxt * __dlm_lookup_domain_full(const char *domain, int len) { - struct dlm_ctxt *tmp = NULL; - struct list_head *iter; + struct dlm_ctxt *tmp; assert_spin_locked(&dlm_domain_lock); /* tmp->name here is always NULL terminated, * but domain may not be! */ - list_for_each(iter, &dlm_domains) { - tmp = list_entry (iter, struct dlm_ctxt, list); + list_for_each_entry(tmp, &dlm_domains, list) { if (strlen(tmp->name) == len && memcmp(tmp->name, domain, len)==0) - break; - tmp = NULL; + return tmp; } - return tmp; + return NULL; } /* For null terminated domain strings ONLY */ @@ -366,25 +361,22 @@ static void __dlm_get(struct dlm_ctxt *dlm) * you shouldn't trust your pointer. 
*/ struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm) { - struct list_head *iter; - struct dlm_ctxt *target = NULL; + struct dlm_ctxt *target; + struct dlm_ctxt *ret = NULL; spin_lock(&dlm_domain_lock); - list_for_each(iter, &dlm_domains) { - target = list_entry (iter, struct dlm_ctxt, list); - + list_for_each_entry(target, &dlm_domains, list) { if (target == dlm) { __dlm_get(target); + ret = target; break; } - - target = NULL; } spin_unlock(&dlm_domain_lock); - return target; + return ret; } int dlm_domain_fully_joined(struct dlm_ctxt *dlm) @@ -2296,13 +2288,10 @@ static DECLARE_RWSEM(dlm_callback_sem); void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm, int node_num) { - struct list_head *iter; struct dlm_eviction_cb *cb; down_read(&dlm_callback_sem); - list_for_each(iter, &dlm->dlm_eviction_callbacks) { - cb = list_entry(iter, struct dlm_eviction_cb, ec_item); - + list_for_each_entry(cb, &dlm->dlm_eviction_callbacks, ec_item) { cb->ec_func(node_num, cb->ec_data); } up_read(&dlm_callback_sem); diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 47e67c2d228f..5d32f7511f74 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -91,19 +91,14 @@ void dlm_destroy_lock_cache(void) static int dlm_can_grant_new_lock(struct dlm_lock_resource *res, struct dlm_lock *lock) { - struct list_head *iter; struct dlm_lock *tmplock; - list_for_each(iter, &res->granted) { - tmplock = list_entry(iter, struct dlm_lock, list); - + list_for_each_entry(tmplock, &res->granted, list) { if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type)) return 0; } - list_for_each(iter, &res->converting) { - tmplock = list_entry(iter, struct dlm_lock, list); - + list_for_each_entry(tmplock, &res->converting, list) { if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type)) return 0; if (!dlm_lock_compatible(tmplock->ml.convert_type, diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 33ecbe0e6734..cf0f103963b1 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -342,16 +342,13 @@ static int dlm_find_mle(struct dlm_ctxt *dlm, { struct dlm_master_list_entry *tmpmle; struct hlist_head *bucket; - struct hlist_node *list; unsigned int hash; assert_spin_locked(&dlm->master_lock); hash = dlm_lockid_hash(name, namelen); bucket = dlm_master_hash(dlm, hash); - hlist_for_each(list, bucket) { - tmpmle = hlist_entry(list, struct dlm_master_list_entry, - master_hash_node); + hlist_for_each_entry(tmpmle, bucket, master_hash_node) { if (!dlm_mle_equal(dlm, tmpmle, name, namelen)) continue; dlm_get_mle(tmpmle); @@ -3183,7 +3180,7 @@ void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) struct dlm_master_list_entry *mle; struct dlm_lock_resource *res; struct hlist_head *bucket; - struct hlist_node *list; + struct hlist_node *tmp; unsigned int i; mlog(0, "dlm=%s, dead node=%u\n", dlm->name, dead_node); @@ -3194,10 +3191,7 @@ top: spin_lock(&dlm->master_lock); for (i = 0; i < DLM_HASH_BUCKETS; i++) { bucket = dlm_master_hash(dlm, i); - hlist_for_each(list, bucket) { - mle = hlist_entry(list, struct dlm_master_list_entry, - master_hash_node); - + hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) { BUG_ON(mle->type != DLM_MLE_BLOCK && mle->type != DLM_MLE_MASTER && mle->type != DLM_MLE_MIGRATION); @@ -3378,7 +3372,7 @@ void dlm_force_free_mles(struct dlm_ctxt *dlm) int i; struct hlist_head *bucket; struct dlm_master_list_entry *mle; - struct hlist_node *tmp, *list; + struct hlist_node *tmp; /* * We notified all other nodes that we are exiting the 
domain and @@ -3394,9 +3388,7 @@ void dlm_force_free_mles(struct dlm_ctxt *dlm) for (i = 0; i < DLM_HASH_BUCKETS; i++) { bucket = dlm_master_hash(dlm, i); - hlist_for_each_safe(list, tmp, bucket) { - mle = hlist_entry(list, struct dlm_master_list_entry, - master_hash_node); + hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) { if (mle->type != DLM_MLE_BLOCK) { mlog(ML_ERROR, "bad mle: %p\n", mle); dlm_print_one_mle(mle); diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index e73c833fc2a1..9db869de829d 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -286,8 +286,6 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { struct dlm_lock *lock, *target; - struct list_head *iter; - struct list_head *head; int can_grant = 1; /* @@ -314,9 +312,7 @@ converting: dlm->name, res->lockname.len, res->lockname.name); BUG(); } - head = &res->granted; - list_for_each(iter, head) { - lock = list_entry(iter, struct dlm_lock, list); + list_for_each_entry(lock, &res->granted, list) { if (lock==target) continue; if (!dlm_lock_compatible(lock->ml.type, @@ -333,9 +329,8 @@ converting: target->ml.convert_type; } } - head = &res->converting; - list_for_each(iter, head) { - lock = list_entry(iter, struct dlm_lock, list); + + list_for_each_entry(lock, &res->converting, list) { if (lock==target) continue; if (!dlm_lock_compatible(lock->ml.type, @@ -384,9 +379,7 @@ blocked: goto leave; target = list_entry(res->blocked.next, struct dlm_lock, list); - head = &res->granted; - list_for_each(iter, head) { - lock = list_entry(iter, struct dlm_lock, list); + list_for_each_entry(lock, &res->granted, list) { if (lock==target) continue; if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) { @@ -400,9 +393,7 @@ blocked: } } - head = &res->converting; - list_for_each(iter, head) { - lock = list_entry(iter, struct dlm_lock, list); + list_for_each_entry(lock, &res->converting, list) { if (lock==target) continue; if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) { diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 850aa7e87537..5698b52cf5c9 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -388,7 +388,6 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, struct dlm_ctxt *dlm = data; struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf; struct dlm_lock_resource *res = NULL; - struct list_head *iter; struct dlm_lock *lock = NULL; enum dlm_status status = DLM_NORMAL; int found = 0, i; @@ -458,8 +457,7 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, } for (i=0; i<3; i++) { - list_for_each(iter, queue) { - lock = list_entry(iter, struct dlm_lock, list); + list_for_each_entry(lock, queue, list) { if (lock->ml.cookie == unlock->cookie && lock->ml.node == unlock->node_idx) { dlm_lock_get(lock); From 3d94ea51c1d8db6f41268a9d2aea5f5771e9a8d3 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Wed, 11 Sep 2013 14:19:51 -0700 Subject: [PATCH 013/303] ocfs2: clean up dead code in ocfs2_acl_from_xattr() In ocfs2_acl_from_xattr(), if size is less than sizeof(struct posix_acl_entry), it returns ERR_PTR(-EINVAL) directly. Then assign (size / sizeof(struct posix_acl_entry)) to count which will be at least 1, that means the following branch (count < 0) and (count == 0) will never be true. 
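Spelled out, assuming only the guard that already precedes the division
(a sketch of the surviving code path, not a further change):

	if (size < sizeof(struct posix_acl_entry))
		return ERR_PTR(-EINVAL);
	/* from here on, size >= sizeof(struct posix_acl_entry) */

	count = size / sizeof(struct posix_acl_entry);	/* always >= 1 */
	/* so "count < 0" and "count == 0" can never be taken */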
Signed-off-by: Joseph Qi Cc: Mark Fasheh Acked-by: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/acl.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 8a404576fb26..b4f788e0ca31 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -51,10 +51,6 @@ static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size) return ERR_PTR(-EINVAL); count = size / sizeof(struct posix_acl_entry); - if (count < 0) - return ERR_PTR(-EINVAL); - if (count == 0) - return NULL; acl = posix_acl_alloc(count, GFP_NOFS); if (!acl) From 2b0f6eae2dd2f7f21dbf93241938a687f6757dea Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Wed, 11 Sep 2013 14:19:52 -0700 Subject: [PATCH 014/303] ocfs2: add missing return value check of ocfs2_get_clusters() In ocfs2_attach_refcount_tree() and ocfs2_duplicate_extent_list(), if error occurs when calling ocfs2_get_clusters(), it will go with unexpected behavior as local variables p_cluster, num_clusters and ext_flags are declared without initialization. Signed-off-by: Joseph Qi Reviewed-by: Jie Liu Cc: Joel Becker Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/refcounttree.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index a70d604593b6..bf4dfc14bb2c 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -3854,7 +3854,10 @@ static int ocfs2_attach_refcount_tree(struct inode *inode, while (cpos < clusters) { ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters, &ext_flags); - + if (ret) { + mlog_errno(ret); + goto unlock; + } if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) { ret = ocfs2_add_refcount_flag(inode, &di_et, &ref_tree->rf_ci, @@ -4025,7 +4028,10 @@ static int ocfs2_duplicate_extent_list(struct inode *s_inode, while (cpos < clusters) { ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster, &num_clusters, &ext_flags); - + if (ret) { + mlog_errno(ret); + goto out; + } if (p_cluster) { ret = ocfs2_add_refcounted_extent(t_inode, &et, ref_ci, ref_root_bh, From 4704aa30fc35010dd9c3ce1d9d2e77af09c2c081 Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Wed, 11 Sep 2013 14:19:53 -0700 Subject: [PATCH 015/303] ocfs2: fix a memory leak in __ocfs2_move_extents() The ocfs2 path is not properly freed which leads to a memory leak at __ocfs2_move_extents(). This patch stops the leaks of the ocfs2_path structure. Signed-off-by: Jie Liu Reviewed-by: Younger Liu Cc: Joel Becker Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/move_extents.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 415928536c5e..3d3f3c83065c 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -152,6 +152,7 @@ static int __ocfs2_move_extent(handle_t *handle, } out: + ocfs2_free_path(path); return ret; } From 17caf9555edc27a0c6df512de0879b357ebacae4 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Wed, 11 Sep 2013 14:19:55 -0700 Subject: [PATCH 016/303] ocfs2: add the missing return value check of ocfs2_xattr_get_clusters In ocfs2_xattr_value_attach_refcount(), if error occurs when calling ocfs2_xattr_get_clusters(), it will go with unexpected behavior since local variables p_cluster, num_clusters and ext_flags are declared without initialization. 
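A minimal sketch of the failure mode, assuming the usual contract that
the out-parameters are only stored to on success:

	u32 p_cluster, num_clusters;	/* stack garbage until filled in */
	unsigned int ext_flags;

	ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
				       &num_clusters, el, &ext_flags);
	if (ret) {
		mlog_errno(ret);
		break;		/* bail out instead of consuming garbage */
	}
	cpos += num_clusters;	/* safe: the call succeeded */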
Signed-off-by: Joseph Qi
Cc: Joel Becker
Cc: Mark Fasheh
Acked-by: Jie Liu
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/xattr.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 317ef0abccbb..1cbc2231a9f2 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -5881,6 +5881,10 @@ static int ocfs2_xattr_value_attach_refcount(struct inode *inode,
	while (cpos < clusters) {
		ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
					       &num_clusters, el, &ext_flags);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}

		cpos += num_clusters;
		if ((ext_flags & OCFS2_EXT_REFCOUNTED))

From 6ea437a3639b15e312f81819bb20f737ff596194 Mon Sep 17 00:00:00 2001
From: Younger Liu
Date: Wed, 11 Sep 2013 14:19:56 -0700
Subject: [PATCH 017/303] ocfs2: free meta_ac and data_ac when ocfs2_start_trans fails in ocfs2_xattr_set()

In ocfs2_xattr_set(), if ocfs2_start_trans() fails, meta_ac and data_ac
should be freed.  Otherwise, it would lead to a memory leak.

Signed-off-by: Younger Liu
Cc: Joseph Qi
Reviewed-by: Jie Liu
Cc: Mark Fasheh
Cc: Joel Becker
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/xattr.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 1cbc2231a9f2..18330f5b57be 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -3505,7 +3505,7 @@ int ocfs2_xattr_set(struct inode *inode,
	int ret, credits, ref_meta = 0, ref_credits = 0;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct inode *tl_inode = osb->osb_tl_inode;
-	struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
+	struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, NULL, };
	struct ocfs2_refcount_tree *ref_tree = NULL;

	struct ocfs2_xattr_info xi = {
@@ -3609,13 +3609,14 @@ int ocfs2_xattr_set(struct inode *inode,
	if (IS_ERR(ctxt.handle)) {
		ret = PTR_ERR(ctxt.handle);
		mlog_errno(ret);
-		goto cleanup;
+		goto out_free_ac;
	}

	ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt);

	ocfs2_commit_trans(osb, ctxt.handle);

+out_free_ac:
	if (ctxt.data_ac)
		ocfs2_free_alloc_context(ctxt.data_ac);
	if (ctxt.meta_ac)

From 69b2bd16d9792085d57865fcaac55753803a4f5d Mon Sep 17 00:00:00 2001
From: Xue jiufei
Date: Wed, 11 Sep 2013 14:19:57 -0700
Subject: [PATCH 018/303] ocfs2/dlm: force clean refmap when doing local cleanup

dlm_do_local_recovery_cleanup() should force clean the refmap if the
owner of a lockres is UNKNOWN.  Otherwise a node may hang when
unmounting filesystems.  Here's the situation:

	Node1                                   Node2
	dlmlock()
	  -> dlm_get_lock_resource()
	send DLM_MASTER_REQUEST_MSG to
	other nodes.
	                                        trying to master this
	                                        lockres, return MAYBE.
	selected as the master of lockresA,
	set mle->master to Node1, and do
	assert_master, send
	DLM_ASSERT_MASTER_MSG to Node2.
	                                        Node2 has interest in
	                                        lockresA and returns
	                                        DLM_ASSERT_RESPONSE_MASTERY_REF;
	                                        then something happened
	                                        and Node2 crashed.
	Receiving DLM_ASSERT_RESPONSE_MASTERY_REF,
	set Node2 into refmap, and keep sending
	DLM_ASSERT_MASTER_MSG to other nodes.
	o2hb found Node2 down, calling
	dlm_hb_node_down() -->
	dlm_do_local_recovery_cleanup().
	The master of lockresA is still UNKNOWN,
	so no need to call dlm_free_dead_locks().
	Set the master of lockresA to Node1, but
	Node2 still remains in the refmap.

When Node1 umounts, it finds that the refmap of lockresA is not empty
and attempts to migrate it to Node2.  But Node2 is already down, so
umount hangs, trying to migrate lockresA again and again.
Signed-off-by: joyce Cc: Mark Fasheh Cc: Joel Becker Cc: Jie Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmrecovery.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index f94550218152..0b5adca1b178 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2331,6 +2331,14 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) } else if (res->owner == dlm->node_num) { dlm_free_dead_locks(dlm, res, dead_node); __dlm_lockres_calc_usage(dlm, res); + } else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { + if (test_bit(dead_node, res->refmap)) { + mlog(0, "%s:%.*s: dead node %u had a ref, but had " + "no locks and had not purged before dying\n", + dlm->name, res->lockname.len, + res->lockname.name, dead_node); + dlm_lockres_clear_refmap_bit(dlm, res, dead_node); + } } spin_unlock(&res->spinlock); } From 6cae6d3189ef34647bca9b9b1d240ebd760e5dea Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Wed, 11 Sep 2013 14:19:58 -0700 Subject: [PATCH 019/303] ocfs2: fix possible double free in ocfs2_reflink_xattr_rec In ocfs2_reflink_xattr_rec(), meta_ac and data_ac are allocated by calling ocfs2_lock_reflink_xattr_rec_allocators(). If an error occurs while allocating *data_ac, that function frees *meta_ac, which was allocated earlier, but it mistakenly sets the local meta_ac, rather than *meta_ac, to NULL. ocfs2_reflink_xattr_rec() will then try to free *meta_ac again, even though it is already invalid (a stand-alone sketch of this pattern follows after the next two patches). Signed-off-by: Joseph Qi Reviewed-by: Jie Liu Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 18330f5b57be..6ce0686eab72 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -6802,7 +6802,7 @@ out: if (ret) { if (*meta_ac) { ocfs2_free_alloc_context(*meta_ac); - meta_ac = NULL; + *meta_ac = NULL; } } From 7aebff18b91ebdefe15bb7d3f5d711df8312a7fb Mon Sep 17 00:00:00 2001 From: Younger Liu Date: Wed, 11 Sep 2013 14:19:59 -0700 Subject: [PATCH 020/303] ocfs2: free path in ocfs2_remove_inode_range() In ocfs2_remove_inode_range(), there is a memory leak: the variable path is allocated with ocfs2_new_path_from_et(), but it is never freed. Signed-off-by: Younger Liu Reviewed-by: Jie Liu Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 409c549ae02a..4f8197caa487 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1796,6 +1796,7 @@ static int ocfs2_remove_inode_range(struct inode *inode, ocfs2_truncate_cluster_pages(inode, byte_start, byte_len); out: + ocfs2_free_path(path); ocfs2_schedule_truncate_log_flush(osb, 1); ocfs2_run_deallocs(osb, &dealloc); From 9a239e4c68df78888f67b1d4e7d507e24ac6764f Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Wed, 11 Sep 2013 14:20:00 -0700 Subject: [PATCH 021/303] ocfs2: adjust code style for o2net_handler_tree_lookup() The indentation in o2net_handler_tree_lookup() appears to have been corrupted by mistake, so re-indent it to improve readability.
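A stand-alone C sketch of the double free fixed in ocfs2_reflink_xattr_rec() above; the allocator here is a stand-in, and only the pointer handling mirrors the commit:

#include <stdio.h>
#include <stdlib.h>

static int lock_allocators(int fail_data, void **meta_ac)
{
	*meta_ac = malloc(16);
	if (!fail_data)
		return 0;
	free(*meta_ac);		/* error path frees the earlier allocation... */
	*meta_ac = NULL;	/* ...and must clear *meta_ac, not a local copy */
	return -1;
}

int main(void)
{
	void *meta_ac = NULL;

	if (lock_allocators(1, &meta_ac) && meta_ac)
		free(meta_ac);	/* reached only if *meta_ac was left dangling */
	printf("meta_ac=%p (NULL means no double free)\n", meta_ac);
	return 0;
}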
Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/tcp.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index d04a3c2fad3c..8c50c238577a 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -766,32 +766,32 @@ static struct o2net_msg_handler * o2net_handler_tree_lookup(u32 msg_type, u32 key, struct rb_node ***ret_p, struct rb_node **ret_parent) { - struct rb_node **p = &o2net_handler_tree.rb_node; - struct rb_node *parent = NULL; + struct rb_node **p = &o2net_handler_tree.rb_node; + struct rb_node *parent = NULL; struct o2net_msg_handler *nmh, *ret = NULL; int cmp; - while (*p) { - parent = *p; - nmh = rb_entry(parent, struct o2net_msg_handler, nh_node); + while (*p) { + parent = *p; + nmh = rb_entry(parent, struct o2net_msg_handler, nh_node); cmp = o2net_handler_cmp(nmh, msg_type, key); - if (cmp < 0) - p = &(*p)->rb_left; - else if (cmp > 0) - p = &(*p)->rb_right; - else { + if (cmp < 0) + p = &(*p)->rb_left; + else if (cmp > 0) + p = &(*p)->rb_right; + else { ret = nmh; - break; + break; } - } + } - if (ret_p != NULL) - *ret_p = p; - if (ret_parent != NULL) - *ret_parent = parent; + if (ret_p != NULL) + *ret_p = p; + if (ret_parent != NULL) + *ret_parent = parent; - return ret; + return ret; } static void o2net_handler_kref_release(struct kref *kref) From 03dbe88aa9cd0d7b0a876b38bd75ce73b4522454 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Wed, 11 Sep 2013 14:20:01 -0700 Subject: [PATCH 022/303] ocfs2: avoid possible NULL pointer dereference in o2net_accept_one() Since o2nm_get_node_by_num() may return NULL, we add this check in o2net_accept_one() to avoid possible NULL pointer dereference. Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/tcp.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 8c50c238577a..2cd2406b4140 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1873,12 +1873,16 @@ static int o2net_accept_one(struct socket *sock) if (o2nm_this_node() >= node->nd_num) { local_node = o2nm_get_node_by_num(o2nm_this_node()); - printk(KERN_NOTICE "o2net: Unexpected connect attempt seen " - "at node '%s' (%u, %pI4:%d) from node '%s' (%u, " - "%pI4:%d)\n", local_node->nd_name, local_node->nd_num, - &(local_node->nd_ipv4_address), - ntohs(local_node->nd_ipv4_port), node->nd_name, - node->nd_num, &sin.sin_addr.s_addr, ntohs(sin.sin_port)); + if (local_node) + printk(KERN_NOTICE "o2net: Unexpected connect attempt " + "seen at node '%s' (%u, %pI4:%d) from " + "node '%s' (%u, %pI4:%d)\n", + local_node->nd_name, local_node->nd_num, + &(local_node->nd_ipv4_address), + ntohs(local_node->nd_ipv4_port), + node->nd_name, + node->nd_num, &sin.sin_addr.s_addr, + ntohs(sin.sin_port)); ret = -EINVAL; goto out; } From 6f8648e894498f769832b79399b1cfabd2973ea9 Mon Sep 17 00:00:00 2001 From: Joyce Date: Wed, 11 Sep 2013 14:20:03 -0700 Subject: [PATCH 023/303] ocfs2: fix a tiny race case when firing callbacks In o2hb_shutdown_slot() and o2hb_check_slot(), since event is defined as local, it is only valid during the call stack. 
So the following tiny race case may happen in an environment with multiple volumes mounted:

1) o2hb-vol1: o2hb_shutdown_slot allocates local event1
2) o2hb-vol1: queue_node_event adds event1 to the global o2hb_node_events
3) o2hb-vol2: o2hb_shutdown_slot allocates local event2
4) o2hb-vol2: queue_node_event adds event2 to the global o2hb_node_events
5) o2hb-vol2: o2hb_run_event_list deletes event1 from o2hb_node_events
6) o2hb-vol1: o2hb_run_event_list sees event1 already unlinked (empty) and returns
7) o2hb-vol1: o2hb_shutdown_slot returns, so event1's lifecycle ends
8) o2hb-vol2: o2hb_fire_callbacks uses event1, which is already *invalid*

This patch makes the queuing thread wait on o2hb_callback_sem while another thread is firing callbacks. As a performance consideration, o2hb_run_event_list is now called only when an event was actually queued. Signed-off-by: Joyce Signed-off-by: Joseph Qi Cc: Joel Becker Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/heartbeat.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 25b72e82b8fa..363f0dcc924f 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -639,16 +639,9 @@ static void o2hb_fire_callbacks(struct o2hb_callback *hbcall, /* Will run the list in order until we process the passed event */ static void o2hb_run_event_list(struct o2hb_node_event *queued_event) { - int empty; struct o2hb_callback *hbcall; struct o2hb_node_event *event; - spin_lock(&o2hb_live_lock); - empty = list_empty(&queued_event->hn_item); - spin_unlock(&o2hb_live_lock); - if (empty) - return; - /* Holding callback sem assures we don't alter the callback * lists when doing this, and serializes ourselves with other * processes wanting callbacks. */ @@ -707,6 +700,7 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot) struct o2hb_node_event event = { .hn_item = LIST_HEAD_INIT(event.hn_item), }; struct o2nm_node *node; + int queued = 0; node = o2nm_get_node_by_num(slot->ds_node_num); if (!node) @@ -724,11 +718,13 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot) o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node, slot->ds_node_num); + queued = 1; } } spin_unlock(&o2hb_live_lock); - o2hb_run_event_list(&event); + if (queued) + o2hb_run_event_list(&event); o2nm_node_put(node); } @@ -788,6 +784,7 @@ static int o2hb_check_slot(struct o2hb_region *reg, unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS; unsigned int slot_dead_ms; int tmp; + int queued = 0; memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes); @@ -881,6 +878,7 @@ fire_callbacks: slot->ds_node_num); changed = 1; + queued = 1; } list_add_tail(&slot->ds_live_item, @@ -932,6 +930,7 @@ fire_callbacks: node, slot->ds_node_num); changed = 1; + queued = 1; } /* We don't clear this because the node is still @@ -947,7 +946,8 @@ fire_callbacks: out: spin_unlock(&o2hb_live_lock); - o2hb_run_event_list(&event); + if (queued) + o2hb_run_event_list(&event); if (node) o2nm_node_put(node); From a72e27d3727b383be39498f8b5c9b944d30e0f9b Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Wed, 11 Sep 2013 14:20:04 -0700 Subject: [PATCH 024/303] ocfs2: remove unused variable ip in dlmfs_get_root_inode() Variable ip in dlmfs_get_root_inode() is defined but not used. So clean it up.
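A minimal user-space sketch of the stack-lifetime hazard behind the o2hb race fixed above; the structures are illustrative:

#include <stdio.h>

struct event { int node; };

static struct event *queued;	/* stands in for the global o2hb_node_events */

static void shutdown_slot(int node)
{
	struct event ev = { .node = node };	/* stack-local, like the o2hb events */

	queued = &ev;
	/* Returning ends ev's lifetime while the queue still points at it. */
}

int main(void)
{
	shutdown_slot(7);
	/* Dereferencing 'queued' here would be undefined behavior: the object
	 * died when shutdown_slot() returned. Printing the pointer is safe. */
	printf("dangling event pointer: %p\n", (void *)queued);
	return 0;
}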
Signed-off-by: Joseph Qi Reviewed-by: Jie Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmfs/dlmfs.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 12bafb7265ce..efa2b3d339e3 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -401,11 +401,8 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb) { struct inode *inode = new_inode(sb); umode_t mode = S_IFDIR | 0755; - struct dlmfs_inode_private *ip; if (inode) { - ip = DLMFS_I(inode); - inode->i_ino = get_next_ino(); inode_init_owner(inode, NULL, mode); inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; From 28e8be31803b19d0d8f76216cb11b480b8a98bec Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Wed, 11 Sep 2013 14:20:05 -0700 Subject: [PATCH 025/303] ocfs2: fix the end cluster offset of FIEMAP Calling the fiemap ioctl(2) with a given start offset and a desired mapping range should show extents if possible. However, we adjust the end offset of the mapping via 'mapping_end -= cpos' before iterating the extent records, which causes problems if the given fiemap length is smaller than a cluster size, e.g., Cluster size 4096: debugfs.ocfs2 1.6.3 Block Size Bits: 12 Cluster Size Bits: 12 The extended fiemap test utility from David: https://gist.github.com/anonymous/6172331 # dd if=/dev/urandom of=/ocfs2/test_file bs=1M count=1000 # ./fiemap /ocfs2/test_file 4096 10 start: 4096, length: 10 File /ocfs2/test_file has 0 extents: # Logical Physical Length Flags ^^^^^ <-- No extent is shown In this case, at ocfs2_fiemap(): cpos == mapping_end == 1. Hence the loop of searching extent records was not executed at all. This patch removes the 'mapping_end -= cpos' in question and loops until cpos is larger than mapping_end, as usual. # ./fiemap /ocfs2/test_file 4096 10 start: 4096, length: 10 File /ocfs2/test_file has 1 extents: # Logical Physical Length Flags 0: 0000000000000000 0000000056a01000 0000000006a00000 0000 Signed-off-by: Jie Liu Reported-by: David Weber Tested-by: David Weber Cc: Sunil Mushran Cc: Mark Fasheh Cc: Joel Becker Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/extent_map.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 4bf2b763467f..767370b656ca 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -781,7 +781,6 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, cpos = map_start >> osb->s_clustersize_bits; mapping_end = ocfs2_clusters_for_bytes(inode->i_sb, map_start + map_len); - mapping_end -= cpos; is_last = 0; while (cpos < mapping_end && !is_last) { u32 fe_flags; From e1403b8edf669ff49bbdf602cc97fefa2760cb15 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:20:06 -0700 Subject: [PATCH 026/303] include/linux/sched.h: don't use task->pid/tgid in same_thread_group/has_group_leader_pid task_struct->pid/tgid should go away. 1. Change same_thread_group() to use task->signal for comparison. 2. Change has_group_leader_pid(task) to compare task_pid(task) with signal->leader_pid. Signed-off-by: Oleg Nesterov Cc: Michal Hocko Cc: Sergey Dyasly Reviewed-by: "Eric W.
Biederman" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index ce1e1c0aaa33..45f254dddafc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2169,15 +2169,15 @@ static inline bool thread_group_leader(struct task_struct *p) * all we care about is that we have a task with the appropriate * pid, we don't actually care if we have the right task. */ -static inline int has_group_leader_pid(struct task_struct *p) +static inline bool has_group_leader_pid(struct task_struct *p) { - return p->pid == p->tgid; + return task_pid(p) == p->signal->leader_pid; } static inline -int same_thread_group(struct task_struct *p1, struct task_struct *p2) +bool same_thread_group(struct task_struct *p1, struct task_struct *p2) { - return p1->tgid == p2->tgid; + return p1->signal == p2->signal; } static inline struct task_struct *next_thread(const struct task_struct *p) From bb8e0e84b30afc9827931c9773d75d5c99fcddff Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Wed, 11 Sep 2013 14:20:07 -0700 Subject: [PATCH 027/303] block: replace strict_strtoul() with kstrtoul() The use of strict_strtoul() is not preferred, because strict_strtoul() is obsolete. Thus, kstrtoul() should be used. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/osdblk.c | 2 +- drivers/block/rbd.c | 2 +- drivers/block/xen-blkback/xenbus.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c index 1bbc681688e4..79aa179305b5 100644 --- a/drivers/block/osdblk.c +++ b/drivers/block/osdblk.c @@ -598,7 +598,7 @@ static ssize_t class_osdblk_remove(struct class *c, unsigned long ul; struct list_head *tmp; - rc = strict_strtoul(buf, 10, &ul); + rc = kstrtoul(buf, 10, &ul); if (rc) return rc; diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 39c51cc7fabc..b22a7d0fe5b7 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -5132,7 +5132,7 @@ static ssize_t rbd_remove(struct bus_type *bus, bool already = false; int ret; - ret = strict_strtoul(buf, 10, &ul); + ret = kstrtoul(buf, 10, &ul); if (ret) return ret; diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index fe5c3cd10c34..c2014a0aa206 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -620,7 +620,7 @@ static void backend_changed(struct xenbus_watch *watch, } /* Front end dir is a number, which is used as the handle. */ - err = strict_strtoul(strrchr(dev->otherend, '/') + 1, 0, &handle); + err = kstrtoul(strrchr(dev->otherend, '/') + 1, 0, &handle); if (err) return; From ed751e683c563be64322b9bfa0f0f7e5da9bd37c Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Wed, 11 Sep 2013 14:20:08 -0700 Subject: [PATCH 028/303] block/blk-sysfs.c: replace strict_strtoul() with kstrtoul() The usage of strict_strtoul() is not preferred, because strict_strtoul() is obsolete. Thus, kstrtoul() should be used. 
Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/blk-sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 5efc5a647183..3aa5b195f4dd 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -29,7 +29,7 @@ queue_var_store(unsigned long *var, const char *page, size_t count) int err; unsigned long v; - err = strict_strtoul(page, 10, &v); + err = kstrtoul(page, 10, &v); if (err || v > UINT_MAX) return -EINVAL; From bab55417b10c95e6bff8cea315c315adfa009487 Mon Sep 17 00:00:00 2001 From: Cai Zhiyong Date: Wed, 11 Sep 2013 14:20:09 -0700 Subject: [PATCH 029/303] block: support embedded device command line partition Read the block device partition table from the command line. The partition table is used for fixed block devices (eMMC) in embedded devices. There is no MBR, which saves storage space, and the bootloader can easily access data on the block device by absolute address. Users can easily change the partitioning. This code references the MTD partition parser, "drivers/mtd/cmdlinepart.c". For details on the partition format, see "Documentation/block/cmdline-partition.txt". [akpm@linux-foundation.org: fix printk text] [yongjun_wei@trendmicro.com.cn: fix error return code in parse_parts()] Signed-off-by: Cai Zhiyong Cc: Karel Zak Cc: "Wanglin (Albert)" Cc: Marius Groeger Cc: David Woodhouse Cc: Jens Axboe Cc: Brian Norris Cc: Artem Bityutskiy Signed-off-by: Wei Yongjun Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/block/cmdline-partition.txt | 39 ++++ block/Kconfig | 6 + block/Makefile | 1 + block/cmdline-parser.c | 250 ++++++++++++++++++++++ block/partitions/Kconfig | 7 + block/partitions/Makefile | 1 + block/partitions/check.c | 4 + block/partitions/cmdline.c | 99 +++++++++ block/partitions/cmdline.h | 2 + include/linux/cmdline-parser.h | 43 ++++ 10 files changed, 452 insertions(+) create mode 100644 Documentation/block/cmdline-partition.txt create mode 100644 block/cmdline-parser.c create mode 100644 block/partitions/cmdline.c create mode 100644 block/partitions/cmdline.h create mode 100644 include/linux/cmdline-parser.h diff --git a/Documentation/block/cmdline-partition.txt b/Documentation/block/cmdline-partition.txt new file mode 100644 index 000000000000..2bbf4cc40c3f --- /dev/null +++ b/Documentation/block/cmdline-partition.txt @@ -0,0 +1,39 @@ +Embedded device command line partition +===================================================================== + +Read block device partition table from command line. +The partition used for fixed block device (eMMC) embedded device. +It is no MBR, save storage space. Bootloader can be easily accessed +by absolute address of data on the block device. +Users can easily change the partition. + +The format for the command line is just like mtdparts: + +blkdevparts=<blkdev-def>[;<blkdev-def>] + <blkdev-def> := <blkdev-id>:<partdef>[,<partdef>] + <partdef> := <size>[@<offset>](part-name) + +<blkdev-id> + block device disk name, embedded device used fixed block device, + it's disk name also fixed. such as: mmcblk0, mmcblk1, mmcblk0boot0. + +<size> + partition size, in bytes, such as: 512, 1m, 1G. + +<offset> + partition start address, in bytes. + +(part-name) + partition name, kernel send uevent with "PARTNAME". application can create + a link to block device partition with the name "PARTNAME". + user space application can access partition by partition name.
+ +Example: + eMMC disk name is "mmcblk0" and "mmcblk0boot0" + + bootargs: + 'blkdevparts=mmcblk0:1G(data0),1G(data1),-;mmcblk0boot0:1m(boot),-(kernel)' + + dmesg: + mmcblk0: p1(data0) p2(data1) p3() + mmcblk0boot0: p1(boot) p2(kernel) diff --git a/block/Kconfig b/block/Kconfig index a7e40a7c8214..7f38e40fee08 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -99,6 +99,12 @@ config BLK_DEV_THROTTLING See Documentation/cgroups/blkio-controller.txt for more information. +config CMDLINE_PARSER + bool "Block device command line partition parser" + default n + ---help--- + Parsing command line, get the partitions information. + menu "Partition Types" source "block/partitions/Kconfig" diff --git a/block/Makefile b/block/Makefile index 39b76ba66ffd..4fa4be544ece 100644 --- a/block/Makefile +++ b/block/Makefile @@ -18,3 +18,4 @@ obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o +obj-$(CONFIG_CMDLINE_PARSER) += cmdline-parser.o diff --git a/block/cmdline-parser.c b/block/cmdline-parser.c new file mode 100644 index 000000000000..cc2637f8674e --- /dev/null +++ b/block/cmdline-parser.c @@ -0,0 +1,250 @@ +/* + * Parse command line, get partition information + * + * Written by Cai Zhiyong + * + */ +#include +#include +#include + +static int parse_subpart(struct cmdline_subpart **subpart, char *partdef) +{ + int ret = 0; + struct cmdline_subpart *new_subpart; + + *subpart = NULL; + + new_subpart = kzalloc(sizeof(struct cmdline_subpart), GFP_KERNEL); + if (!new_subpart) + return -ENOMEM; + + if (*partdef == '-') { + new_subpart->size = (sector_t)(~0ULL); + partdef++; + } else { + new_subpart->size = (sector_t)memparse(partdef, &partdef); + if (new_subpart->size < (sector_t)PAGE_SIZE) { + pr_warn("cmdline partition size is invalid."); + ret = -EINVAL; + goto fail; + } + } + + if (*partdef == '@') { + partdef++; + new_subpart->from = (sector_t)memparse(partdef, &partdef); + } else { + new_subpart->from = (sector_t)(~0ULL); + } + + if (*partdef == '(') { + int length; + char *next = strchr(++partdef, ')'); + + if (!next) { + pr_warn("cmdline partition format is invalid."); + ret = -EINVAL; + goto fail; + } + + length = min_t(int, next - partdef, + sizeof(new_subpart->name) - 1); + strncpy(new_subpart->name, partdef, length); + new_subpart->name[length] = '\0'; + + partdef = ++next; + } else + new_subpart->name[0] = '\0'; + + new_subpart->flags = 0; + + if (!strncmp(partdef, "ro", 2)) { + new_subpart->flags |= PF_RDONLY; + partdef += 2; + } + + if (!strncmp(partdef, "lk", 2)) { + new_subpart->flags |= PF_POWERUP_LOCK; + partdef += 2; + } + + *subpart = new_subpart; + return 0; +fail: + kfree(new_subpart); + return ret; +} + +static void free_subpart(struct cmdline_parts *parts) +{ + struct cmdline_subpart *subpart; + + while (parts->subpart) { + subpart = parts->subpart; + parts->subpart = subpart->next_subpart; + kfree(subpart); + } +} + +static int parse_parts(struct cmdline_parts **parts, const char *bdevdef) +{ + int ret = -EINVAL; + char *next; + int length; + struct cmdline_subpart **next_subpart; + struct cmdline_parts *newparts; + char buf[BDEVNAME_SIZE + 32 + 4]; + + *parts = NULL; + + newparts = kzalloc(sizeof(struct cmdline_parts), GFP_KERNEL); + if (!newparts) + return -ENOMEM; + + next = strchr(bdevdef, ':'); + if (!next) { + pr_warn("cmdline partition has no block device."); + goto fail; + } + + length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1); + strncpy(newparts->name, bdevdef, length); + 
newparts->name[length] = '\0'; + newparts->nr_subparts = 0; + + next_subpart = &newparts->subpart; + + while (next && *(++next)) { + bdevdef = next; + next = strchr(bdevdef, ','); + + length = (!next) ? (sizeof(buf) - 1) : + min_t(int, next - bdevdef, sizeof(buf) - 1); + + strncpy(buf, bdevdef, length); + buf[length] = '\0'; + + ret = parse_subpart(next_subpart, buf); + if (ret) + goto fail; + + newparts->nr_subparts++; + next_subpart = &(*next_subpart)->next_subpart; + } + + if (!newparts->subpart) { + pr_warn("cmdline partition has no valid partition."); + ret = -EINVAL; + goto fail; + } + + *parts = newparts; + + return 0; +fail: + free_subpart(newparts); + kfree(newparts); + return ret; +} + +void cmdline_parts_free(struct cmdline_parts **parts) +{ + struct cmdline_parts *next_parts; + + while (*parts) { + next_parts = (*parts)->next_parts; + free_subpart(*parts); + kfree(*parts); + *parts = next_parts; + } +} + +int cmdline_parts_parse(struct cmdline_parts **parts, const char *cmdline) +{ + int ret; + char *buf; + char *pbuf; + char *next; + struct cmdline_parts **next_parts; + + *parts = NULL; + + next = pbuf = buf = kstrdup(cmdline, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + next_parts = parts; + + while (next && *pbuf) { + next = strchr(pbuf, ';'); + if (next) + *next = '\0'; + + ret = parse_parts(next_parts, pbuf); + if (ret) + goto fail; + + if (next) + pbuf = ++next; + + next_parts = &(*next_parts)->next_parts; + } + + if (!*parts) { + pr_warn("cmdline partition has no valid partition."); + ret = -EINVAL; + goto fail; + } + + ret = 0; +done: + kfree(buf); + return ret; + +fail: + cmdline_parts_free(parts); + goto done; +} + +struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts, + const char *bdev) +{ + while (parts && strncmp(bdev, parts->name, sizeof(parts->name))) + parts = parts->next_parts; + return parts; +} + +/* + * add_part() + * 0 success. + * 1 can not add so many partitions. + */ +void cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size, + int slot, + int (*add_part)(int, struct cmdline_subpart *, void *), + void *param) + +{ + sector_t from = 0; + struct cmdline_subpart *subpart; + + for (subpart = parts->subpart; subpart; + subpart = subpart->next_subpart, slot++) { + if (subpart->from == (sector_t)(~0ULL)) + subpart->from = from; + else + from = subpart->from; + + if (from >= disk_size) + break; + + if (subpart->size > (disk_size - from)) + subpart->size = disk_size - from; + + from += subpart->size; + + if (add_part(slot, subpart, param)) + break; + } +} diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig index 4cebb2f0d2f4..87a32086535d 100644 --- a/block/partitions/Kconfig +++ b/block/partitions/Kconfig @@ -260,3 +260,10 @@ config SYSV68_PARTITION partition table format used by Motorola Delta machines (using sysv68). Otherwise, say N. + +config CMDLINE_PARTITION + bool "Command line partition support" if PARTITION_ADVANCED + select CMDLINE_PARSER + help + Say Y here if you would read the partitions table from bootargs. + The format for the command line is just like mtdparts. 
diff --git a/block/partitions/Makefile b/block/partitions/Makefile index 2be4d7ba4e3a..37a95270503c 100644 --- a/block/partitions/Makefile +++ b/block/partitions/Makefile @@ -8,6 +8,7 @@ obj-$(CONFIG_ACORN_PARTITION) += acorn.o obj-$(CONFIG_AMIGA_PARTITION) += amiga.o obj-$(CONFIG_ATARI_PARTITION) += atari.o obj-$(CONFIG_AIX_PARTITION) += aix.o +obj-$(CONFIG_CMDLINE_PARTITION) += cmdline.o obj-$(CONFIG_MAC_PARTITION) += mac.o obj-$(CONFIG_LDM_PARTITION) += ldm.o obj-$(CONFIG_MSDOS_PARTITION) += msdos.o diff --git a/block/partitions/check.c b/block/partitions/check.c index 19ba207ea7d1..9ac1df74f699 100644 --- a/block/partitions/check.c +++ b/block/partitions/check.c @@ -34,6 +34,7 @@ #include "efi.h" #include "karma.h" #include "sysv68.h" +#include "cmdline.h" int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/ @@ -65,6 +66,9 @@ static int (*check_part[])(struct parsed_partitions *) = { adfspart_check_ADFS, #endif +#ifdef CONFIG_CMDLINE_PARTITION + cmdline_partition, +#endif #ifdef CONFIG_EFI_PARTITION efi_partition, /* this must come before msdos */ #endif diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c new file mode 100644 index 000000000000..56cf4ffad51e --- /dev/null +++ b/block/partitions/cmdline.c @@ -0,0 +1,99 @@ +/* + * Copyright (C) 2013 HUAWEI + * Author: Cai Zhiyong + * + * Read block device partition table from command line. + * The partition used for fixed block device (eMMC) embedded device. + * It is no MBR, save storage space. Bootloader can be easily accessed + * by absolute address of data on the block device. + * Users can easily change the partition. + * + * The format for the command line is just like mtdparts. + * + * Verbose config please reference "Documentation/block/cmdline-partition.txt" + * + */ + +#include + +#include "check.h" +#include "cmdline.h" + +static char *cmdline; +static struct cmdline_parts *bdev_parts; + +static int add_part(int slot, struct cmdline_subpart *subpart, void *param) +{ + int label_min; + struct partition_meta_info *info; + char tmp[sizeof(info->volname) + 4]; + struct parsed_partitions *state = (struct parsed_partitions *)param; + + if (slot >= state->limit) + return 1; + + put_partition(state, slot, subpart->from >> 9, + subpart->size >> 9); + + info = &state->parts[slot].info; + + label_min = min_t(int, sizeof(info->volname) - 1, + sizeof(subpart->name)); + strncpy(info->volname, subpart->name, label_min); + info->volname[label_min] = '\0'; + + snprintf(tmp, sizeof(tmp), "(%s)", info->volname); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + + state->parts[slot].has_info = true; + + return 0; +} + +static int __init cmdline_parts_setup(char *s) +{ + cmdline = s; + return 1; +} +__setup("blkdevparts=", cmdline_parts_setup); + +/* + * Purpose: allocate cmdline partitions. 
+ * Returns: + * -1 if unable to read the partition table + * 0 if this isn't our partition table + * 1 if successful + */ +int cmdline_partition(struct parsed_partitions *state) +{ + sector_t disk_size; + char bdev[BDEVNAME_SIZE]; + struct cmdline_parts *parts; + + if (cmdline) { + if (bdev_parts) + cmdline_parts_free(&bdev_parts); + + if (cmdline_parts_parse(&bdev_parts, cmdline)) { + cmdline = NULL; + return -1; + } + cmdline = NULL; + } + + if (!bdev_parts) + return 0; + + bdevname(state->bdev, bdev); + parts = cmdline_parts_find(bdev_parts, bdev); + if (!parts) + return 0; + + disk_size = get_capacity(state->bdev->bd_disk) << 9; + + cmdline_parts_set(parts, disk_size, 1, add_part, (void *)state); + + strlcat(state->pp_buf, "\n", PAGE_SIZE); + + return 1; +} diff --git a/block/partitions/cmdline.h b/block/partitions/cmdline.h new file mode 100644 index 000000000000..26e0f8da1414 --- /dev/null +++ b/block/partitions/cmdline.h @@ -0,0 +1,2 @@ + +int cmdline_partition(struct parsed_partitions *state); diff --git a/include/linux/cmdline-parser.h b/include/linux/cmdline-parser.h new file mode 100644 index 000000000000..98e892ef6d5a --- /dev/null +++ b/include/linux/cmdline-parser.h @@ -0,0 +1,43 @@ +/* + * Parsing command line, get the partitions information. + * + * Written by Cai Zhiyong + * + */ +#ifndef CMDLINEPARSEH +#define CMDLINEPARSEH + +#include + +/* partition flags */ +#define PF_RDONLY 0x01 /* Device is read only */ +#define PF_POWERUP_LOCK 0x02 /* Always locked after reset */ + +struct cmdline_subpart { + char name[BDEVNAME_SIZE]; /* partition name, such as 'rootfs' */ + sector_t from; + sector_t size; + int flags; + struct cmdline_subpart *next_subpart; +}; + +struct cmdline_parts { + char name[BDEVNAME_SIZE]; /* block device, such as 'mmcblk0' */ + unsigned int nr_subparts; + struct cmdline_subpart *subpart; + struct cmdline_parts *next_parts; +}; + +void cmdline_parts_free(struct cmdline_parts **parts); + +int cmdline_parts_parse(struct cmdline_parts **parts, const char *cmdline); + +struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts, + const char *bdev); + +void cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size, + int slot, + int (*add_part)(int, struct cmdline_subpart *, void *), + void *param); + +#endif /* CMDLINEPARSEH */ From c86db975c87976a234f13a0c2d7f931d8ede493b Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Wed, 11 Sep 2013 14:20:10 -0700 Subject: [PATCH 030/303] drivers/block/mg_disk.c: make mg_times_out() static mg_times_out() is used only in this file. Fix the following sparse warning: drivers/block/mg_disk.c:639:6: warning: symbol 'mg_times_out' was not declared. Should it be static? Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/mg_disk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index a56cfcd5d648..77a60bedd7a3 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -636,7 +636,7 @@ ok_to_write: mg_request(host->breq); } -void mg_times_out(unsigned long data) +static void mg_times_out(unsigned long data) { struct mg_host *host = (struct mg_host *)data; char *name; From e7b18ede4443c0207b9fd849cf604e67c6f38fc9 Mon Sep 17 00:00:00 2001 From: Mike Miller Date: Wed, 11 Sep 2013 14:20:11 -0700 Subject: [PATCH 031/303] cciss: set max scatter gather entries to 32 on P600 At one time we used to set the maximum number of scatter gather elements on all Smart Array controllers to 32. 
At some point in time the firmware began to write the "appropriate" value for each controller into the config table. The cciss driver would then read that and set h->maxsgentries: h->maxsgentries = readl(&(h->cfgtable->MaxSGElements)); On the P600 that value is 544. Under some workloads a significant performance reduction may result. This patch forces the P600 to use only 32 scatter gather elements. Other controllers are not affected. Signed-off-by: Mike Miller Signed-off-by: Dwight (Bud) Brown Signed-off-by: Tomas Henzl Acked-by: Stephen M. Cameron Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/cciss.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 62b6c2cc80b5..d2d95ff5353b 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -4257,6 +4257,13 @@ static void cciss_find_board_params(ctlr_info_t *h) cciss_get_max_perf_mode_cmds(h); h->nr_cmds = h->max_commands - 4 - cciss_tape_cmds; h->maxsgentries = readl(&(h->cfgtable->MaxSGElements)); + /* + * The P600 may exhibit poor performance under some workloads + * if we use the value in the configuration table. Limit this + * controller to MAXSGENTRIES (32) instead. + */ + if (h->board_id == 0x3225103C) + h->maxsgentries = MAXSGENTRIES; /* * Limit in-command s/g elements to 32 save dma'able memory. * Howvever spec says if 0, use 31 From c07303c0af38ffb1e5fd9b5ff37d0798298a7acf Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Wed, 11 Sep 2013 14:20:13 -0700 Subject: [PATCH 032/303] drivers/block/swim.c: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure. Thus, there is no need to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Cc: Jean Delvare Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/swim.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/block/swim.c b/drivers/block/swim.c index 8ed6ccb748cf..b02d53a399f3 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -924,7 +924,6 @@ static int swim_probe(struct platform_device *dev) return 0; out_kfree: - platform_set_drvdata(dev, NULL); kfree(swd); out_iounmap: iounmap(swim_base); @@ -962,7 +961,6 @@ static int swim_remove(struct platform_device *dev) if (res) release_mem_region(res->start, resource_size(res)); - platform_set_drvdata(dev, NULL); kfree(swd); return 0; From ef0855d334e1e4af7c3e0c42146a8479ea14a5ab Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:20:14 -0700 Subject: [PATCH 033/303] mm: mempolicy: turn vma_set_policy() into vma_dup_policy() Simple cleanup. Every user of vma_set_policy() does the same work, which looks a bit annoying. Add a new trivial helper that does mpol_dup() + vma_set_policy() to simplify the callers.
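The shape of the cleanup, as a stand-alone C sketch; policy_dup()/area_dup_policy() are illustrative stand-ins for mpol_dup()/vma_dup_policy(), with malloc failure in place of IS_ERR():

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct policy { int mode; };
struct area { struct policy *pol; };

static struct policy *policy_dup(const struct policy *src)
{
	struct policy *p = malloc(sizeof(*p));

	if (p)
		*p = *src;
	return p;
}

/* Each caller used to open-code dup + error check + assignment;
 * the helper keeps that in one place. */
static int area_dup_policy(const struct area *src, struct area *dst)
{
	struct policy *pol = policy_dup(src->pol);

	if (!pol)
		return -ENOMEM;
	dst->pol = pol;
	return 0;
}

int main(void)
{
	struct policy base = { .mode = 1 };
	struct area src = { .pol = &base }, dst = { 0 };

	if (area_dup_policy(&src, &dst))
		return 1;
	printf("duplicated policy mode: %d\n", dst.pol->mode);
	free(dst.pol);
	return 0;
}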
Signed-off-by: Oleg Nesterov Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 9 +++++++-- kernel/fork.c | 9 +++------ mm/mempolicy.c | 10 ++++++++++ mm/mmap.c | 17 +++++------------ 4 files changed, 25 insertions(+), 20 deletions(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 0d7df39a5885..b2f897789838 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -91,7 +91,6 @@ static inline struct mempolicy *mpol_dup(struct mempolicy *pol) } #define vma_policy(vma) ((vma)->vm_policy) -#define vma_set_policy(vma, pol) ((vma)->vm_policy = (pol)) static inline void mpol_get(struct mempolicy *pol) { @@ -126,6 +125,7 @@ struct shared_policy { spinlock_t lock; }; +int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst); void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol); int mpol_set_shared_policy(struct shared_policy *info, struct vm_area_struct *vma, @@ -240,7 +240,12 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) } #define vma_policy(vma) NULL -#define vma_set_policy(vma, pol) do {} while(0) + +static inline int +vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) +{ + return 0; +} static inline void numa_policy_init(void) { diff --git a/kernel/fork.c b/kernel/fork.c index 84703db06cf3..81ccb4f010c2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -351,7 +351,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) struct rb_node **rb_link, *rb_parent; int retval; unsigned long charge; - struct mempolicy *pol; uprobe_start_dup_mmap(); down_write(&oldmm->mmap_sem); @@ -400,11 +399,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) goto fail_nomem; *tmp = *mpnt; INIT_LIST_HEAD(&tmp->anon_vma_chain); - pol = mpol_dup(vma_policy(mpnt)); - retval = PTR_ERR(pol); - if (IS_ERR(pol)) + retval = vma_dup_policy(mpnt, tmp); + if (retval) goto fail_nomem_policy; - vma_set_policy(tmp, pol); tmp->vm_mm = mm; if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; @@ -472,7 +469,7 @@ out: uprobe_end_dup_mmap(); return retval; fail_nomem_anon_vma_fork: - mpol_put(pol); + mpol_put(vma_policy(tmp)); fail_nomem_policy: kmem_cache_free(vm_area_cachep, tmp); fail_nomem: diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4baf12e534d1..6b1d426731ae 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2065,6 +2065,16 @@ retry_cpuset: } EXPORT_SYMBOL(alloc_pages_current); +int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) +{ + struct mempolicy *pol = mpol_dup(vma_policy(src)); + + if (IS_ERR(pol)) + return PTR_ERR(pol); + dst->vm_policy = pol; + return 0; +} + /* * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it * rebinds the mempolicy its copying by calling mpol_rebind_policy() diff --git a/mm/mmap.c b/mm/mmap.c index f9c97d10b873..14f6bb4830f7 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2380,7 +2380,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long addr, int new_below) { - struct mempolicy *pol; struct vm_area_struct *new; int err = -ENOMEM; @@ -2404,12 +2403,9 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); } - pol = mpol_dup(vma_policy(vma)); - if (IS_ERR(pol)) { - 
err = PTR_ERR(pol); + err = vma_dup_policy(vma, new); + if (err) goto out_free_vma; - } - vma_set_policy(new, pol); if (anon_vma_clone(new, vma)) goto out_free_mpol; @@ -2437,7 +2433,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, fput(new->vm_file); unlink_anon_vmas(new); out_free_mpol: - mpol_put(pol); + mpol_put(vma_policy(new)); out_free_vma: kmem_cache_free(vm_area_cachep, new); out_err: @@ -2780,7 +2776,6 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma, *prev; struct rb_node **rb_link, *rb_parent; - struct mempolicy *pol; bool faulted_in_anon_vma = true; /* @@ -2825,10 +2820,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, new_vma->vm_start = addr; new_vma->vm_end = addr + len; new_vma->vm_pgoff = pgoff; - pol = mpol_dup(vma_policy(vma)); - if (IS_ERR(pol)) + if (vma_dup_policy(vma, new_vma)) goto out_free_vma; - vma_set_policy(new_vma, pol); INIT_LIST_HEAD(&new_vma->anon_vma_chain); if (anon_vma_clone(new_vma, vma)) goto out_free_mempol; @@ -2843,7 +2836,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, return new_vma; out_free_mempol: - mpol_put(pol); + mpol_put(vma_policy(new_vma)); out_free_vma: kmem_cache_free(vm_area_cachep, new_vma); return NULL; From ec9bed9d385fd094b20fa0809c50741710afdc74 Mon Sep 17 00:00:00 2001 From: Vladimir Cernov Date: Wed, 11 Sep 2013 14:20:15 -0700 Subject: [PATCH 034/303] mm/madvise.c: fix coding-style errors This fixes following errors: - ERROR: "(foo*)" should be "(foo *)" - ERROR: "foo ** bar" should be "foo **bar" Signed-off-by: Vladimir Cernov Reviewed-by: Pekka Enberg Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 7055883e6e25..936799f042cc 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -42,11 +42,11 @@ static int madvise_need_mmap_write(int behavior) * We can potentially split a vm area into separate * areas, each area with its own behavior. */ -static long madvise_behavior(struct vm_area_struct * vma, +static long madvise_behavior(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, int behavior) { - struct mm_struct * mm = vma->vm_mm; + struct mm_struct *mm = vma->vm_mm; int error = 0; pgoff_t pgoff; unsigned long new_flags = vma->vm_flags; @@ -215,8 +215,8 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma, /* * Schedule all required I/O operations. Do not wait for completion. */ -static long madvise_willneed(struct vm_area_struct * vma, - struct vm_area_struct ** prev, +static long madvise_willneed(struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, unsigned long end) { struct file *file = vma->vm_file; @@ -270,8 +270,8 @@ static long madvise_willneed(struct vm_area_struct * vma, * An interface that causes the system to free clean pages and flush * dirty pages is already available as msync(MS_INVALIDATE). 
*/ -static long madvise_dontneed(struct vm_area_struct * vma, - struct vm_area_struct ** prev, +static long madvise_dontneed(struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, unsigned long end) { *prev = vma; @@ -459,7 +459,7 @@ madvise_behavior_valid(int behavior) SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) { unsigned long end, tmp; - struct vm_area_struct * vma, *prev; + struct vm_area_struct *vma, *prev; int unmapped_error = 0; int error = -EINVAL; int write; From d6bbbd29b1de2da807753be8a3a992a72aef42de Mon Sep 17 00:00:00 2001 From: Raymond Jennings Date: Wed, 11 Sep 2013 14:20:16 -0700 Subject: [PATCH 035/303] swap: warn when a swap area overflows the maximum size It is possible to swapon a swap area that is too big for the pte width to handle. Presently this failure happens silently. Instead, emit a diagnostic to warn the user. Testing results, root prompt commands and kernel log messages: # lvresize /dev/system/swap --size 16G # mkswap /dev/system/swap # swapon /dev/system/swap Jul 7 04:27:22 warfang kernel: Adding 16777212k swap on /dev/mapper/system-swap. Priority:-1 extents:1 across:16777212k # lvresize /dev/system/swap --size 64G # mkswap /dev/system/swap # swapon /dev/system/swap Jul 7 04:27:22 warfang kernel: Truncating oversized swap area, only using 33554432k out of 67108860k Jul 7 04:27:22 warfang kernel: Adding 33554428k swap on /dev/mapper/system-swap. Priority:-1 extents:1 across:33554428k [akpm@linux-foundation.org: fix warning] Signed-off-by: Raymond Jennings Acked-by: Valdis Kletnieks Reviewed-by: Rik van Riel Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 6cf2e60983b7..b5212eea6c3c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1926,6 +1926,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p, int i; unsigned long maxpages; unsigned long swapfilepages; + unsigned long last_page; if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { printk(KERN_ERR "Unable to find swap-space signature\n"); @@ -1968,8 +1969,15 @@ static unsigned long read_swap_header(struct swap_info_struct *p, */ maxpages = swp_offset(pte_to_swp_entry( swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; - if (maxpages > swap_header->info.last_page) { - maxpages = swap_header->info.last_page + 1; + last_page = swap_header->info.last_page; + if (last_page > maxpages) { + printk(KERN_WARNING + "Truncating oversized swap area, only using %luk out of %luk\n", + maxpages << (PAGE_SHIFT - 10), + last_page << (PAGE_SHIFT - 10)); + } + if (maxpages > last_page) { + maxpages = last_page + 1; /* p->max is an unsigned int: don't overflow it */ if ((unsigned int)maxpages == 0) maxpages = UINT_MAX; From 465c47fd8dc44302fed6c4eab8927464744ce08c Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 11 Sep 2013 14:20:17 -0700 Subject: [PATCH 036/303] mm/swapfile.c: convert to pr_foo() A few 80-col gymnastics were cleaned up as a result. 
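A user-space sketch of what the pr_*() family buys over raw printk(KERN_...): the level prefix is attached in one place, so call sites get shorter. These macros only approximate the kernel's, which expand to printk(KERN_ERR pr_fmt(fmt), ...):

#include <stdio.h>

#define pr_fmt(fmt) fmt
/* "<3>" and "<4>" mirror the KERN_ERR and KERN_WARNING level prefixes. */
#define pr_err(fmt, ...)  fprintf(stderr, "<3>" pr_fmt(fmt), ##__VA_ARGS__)
#define pr_warn(fmt, ...) fprintf(stderr, "<4>" pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
	pr_err("swap_free: %s%08lx\n", "Bad_file ", 0x1234UL);
	pr_warn("Empty swap-file\n");
	return 0;
}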
Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index b5212eea6c3c..6ef2d15c5fe3 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -527,16 +527,16 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry) return p; bad_free: - printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val); + pr_err("swap_free: %s%08lx\n", Unused_offset, entry.val); goto out; bad_offset: - printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val); + pr_err("swap_free: %s%08lx\n", Bad_offset, entry.val); goto out; bad_device: - printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val); + pr_err("swap_free: %s%08lx\n", Unused_file, entry.val); goto out; bad_nofile: - printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); + pr_err("swap_free: %s%08lx\n", Bad_file, entry.val); out: return NULL; } @@ -1929,7 +1929,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p, unsigned long last_page; if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { - printk(KERN_ERR "Unable to find swap-space signature\n"); + pr_err("Unable to find swap-space signature\n"); return 0; } @@ -1943,9 +1943,8 @@ static unsigned long read_swap_header(struct swap_info_struct *p, } /* Check the swap header's sub-version */ if (swap_header->info.version != 1) { - printk(KERN_WARNING - "Unable to handle swap header version %d\n", - swap_header->info.version); + pr_warn("Unable to handle swap header version %d\n", + swap_header->info.version); return 0; } @@ -1971,8 +1970,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p, swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; last_page = swap_header->info.last_page; if (last_page > maxpages) { - printk(KERN_WARNING - "Truncating oversized swap area, only using %luk out of %luk\n", + pr_warn("Truncating oversized swap area, only using %luk out of %luk\n", maxpages << (PAGE_SHIFT - 10), last_page << (PAGE_SHIFT - 10)); } @@ -1988,8 +1986,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p, return 0; swapfilepages = i_size_read(inode) >> PAGE_SHIFT; if (swapfilepages && maxpages > swapfilepages) { - printk(KERN_WARNING - "Swap area shorter than signature indicates\n"); + pr_warn("Swap area shorter than signature indicates\n"); return 0; } if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) @@ -2032,7 +2029,7 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, nr_good_pages = p->pages; } if (!nr_good_pages) { - printk(KERN_WARNING "Empty swap-file\n"); + pr_warn("Empty swap-file\n"); return -EINVAL; } @@ -2186,8 +2183,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (p->flags & SWP_AREA_DISCARD) { int err = discard_swap(p); if (unlikely(err)) - printk(KERN_ERR - "swapon: discard_swap(%p): %d\n", + pr_err("swapon: discard_swap(%p): %d\n", p, err); } } @@ -2200,7 +2196,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; enable_swap_info(p, prio, swap_map, frontswap_map); - printk(KERN_INFO "Adding %uk swap on %s. " + pr_info("Adding %uk swap on %s. 
" "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", p->pages<<(PAGE_SHIFT-10), name->name, p->prio, nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), @@ -2334,7 +2330,7 @@ out: return err; bad_file: - printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); + pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val); goto out; } From b2c56e4f7d93be3f33a82ec66f0d0f46713ff5f1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:20:18 -0700 Subject: [PATCH 037/303] mm: shift VM_GROWS* check from mmap_region() to do_mmap_pgoff() mmap() doesn't allow the non-anonymous mappings with VM_GROWS* bit set. In particular this means that mmap_region()->vma_merge(file, vm_flags) must always fail if "vm_flags & VM_GROWS" is set incorrectly. So it does not make sense to check VM_GROWS* after we already allocated the new vma, the only caller, do_mmap_pgoff(), which can pass this flag can do the check itself. And this looks a bit more correct, mmap_region() already unmapped the old mapping at this stage. But if mmap() is going to fail, it should avoid do_munmap() if possible. Note: we check VM_GROWS at the end to ensure that do_mmap_pgoff() won't return EINVAL in the case when it currently returns another error code. Many thanks to Hugh who nacked the buggy v1. Signed-off-by: Oleg Nesterov Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 14f6bb4830f7..6cff7ba24a34 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1302,6 +1302,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, if (!file->f_op || !file->f_op->mmap) return -ENODEV; + if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) + return -EINVAL; break; default: @@ -1310,6 +1312,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, } else { switch (flags & MAP_TYPE) { case MAP_SHARED: + if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) + return -EINVAL; /* * Ignore pgoff. */ @@ -1544,11 +1548,7 @@ munmap_back: vma->vm_pgoff = pgoff; INIT_LIST_HEAD(&vma->anon_vma_chain); - error = -EINVAL; /* when rejecting VM_GROWSDOWN|VM_GROWSUP */ - if (file) { - if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) - goto free_vma; if (vm_flags & VM_DENYWRITE) { error = deny_write_access(file); if (error) @@ -1573,8 +1573,6 @@ munmap_back: pgoff = vma->vm_pgoff; vm_flags = vma->vm_flags; } else if (vm_flags & VM_SHARED) { - if (unlikely(vm_flags & (VM_GROWSDOWN|VM_GROWSUP))) - goto free_vma; error = shmem_zero_setup(vma); if (error) goto free_vma; From 077bf22b5cf233863826afbfa4af9b18650a832d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:20:19 -0700 Subject: [PATCH 038/303] mm: do_mmap_pgoff: cleanup the usage of file_inode() Simple cleanup. Move "struct inode *inode" variable into "if (file)" block to simplify the code and avoid the unnecessary check. 
Signed-off-by: Oleg Nesterov Cc: Hugh Dickins Cc: Al Viro Cc: Colin Cross Cc: David Rientjes Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 6cff7ba24a34..1e7a3ea23f1a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1202,7 +1202,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long *populate) { struct mm_struct * mm = current->mm; - struct inode *inode; vm_flags_t vm_flags; *populate = 0; @@ -1265,9 +1264,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, return -EAGAIN; } - inode = file ? file_inode(file) : NULL; - if (file) { + struct inode *inode = file_inode(file); + switch (flags & MAP_TYPE) { case MAP_SHARED: if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE)) From e86867720e617774b560dfbc169b7f3d0d490950 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:20:20 -0700 Subject: [PATCH 039/303] mm: mmap_region: kill correct_wcount/inode, use allow_write_access() correct_wcount and inode in mmap_region() just complicate the code. This boolean was needed previously, when deny_write_access() was called before vma_merge(); now we can simply check VM_DENYWRITE and call allow_write_access() if it is set. allow_write_access() checks file != NULL, so this is safe even if it was possible to use VM_DENYWRITE && !file. We just need to ensure we use the same file that was deny_write_access()'ed, so the patch also moves "file = vma->vm_file" down after allow_write_access(). Signed-off-by: Oleg Nesterov Cc: Hugh Dickins Cc: Al Viro Cc: Colin Cross Cc: David Rientjes Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 1e7a3ea23f1a..13926a5a6901 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1479,11 +1479,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; - int correct_wcount = 0; int error; struct rb_node **rb_link, *rb_parent; unsigned long charged = 0; - struct inode *inode = file ? file_inode(file) : NULL; /* Check against address space limit. */ if (!may_expand_vm(mm, len >> PAGE_SHIFT)) { @@ -1552,7 +1550,6 @@ munmap_back: error = deny_write_access(file); if (error) goto free_vma; - correct_wcount = 1; } vma->vm_file = get_file(file); error = file->f_op->mmap(file, vma); @@ -1593,11 +1590,10 @@ munmap_back: } vma_link(mm, vma, prev, rb_link, rb_parent); - file = vma->vm_file; - /* Once vma denies write, undo our temporary denial count */ - if (correct_wcount) - atomic_inc(&inode->i_writecount); + if (vm_flags & VM_DENYWRITE) + allow_write_access(file); + file = vma->vm_file; out: perf_event_mmap(vma); @@ -1616,8 +1612,8 @@ out: return addr; unmap_and_free_vma: - if (correct_wcount) - atomic_inc(&inode->i_writecount); + if (vm_flags & VM_DENYWRITE) + allow_write_access(file); vma->vm_file = NULL; fput(file); From 822518dc56810a0de44cff0f85a227268818749c Mon Sep 17 00:00:00 2001 From: Sunghan Suh Date: Wed, 11 Sep 2013 14:20:22 -0700 Subject: [PATCH 040/303] mm/zswap.c: get swapper address_space by using macro There is a proper macro to get the corresponding swapper address space from a swap entry. Instead of directly accessing the "swapper_spaces" array, use the "swap_address_space" macro.
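A toy sketch of that accessor pattern; the entry layout and shift below are invented for illustration, and only the idea — a single macro owning the swapper_spaces[] subscript — mirrors the kernel:

#include <stdio.h>

#define MAX_SWAPFILES 32

struct address_space { int id; };
static struct address_space swapper_spaces[MAX_SWAPFILES];

typedef struct { unsigned long val; } swp_entry_t;

/* Illustrative only: pretend the swap type lives in bits 5 and up. */
#define swp_type(e)           ((e).val >> 5)
#define swap_address_space(e) (&swapper_spaces[swp_type(e)])

int main(void)
{
	swp_entry_t entry = { .val = 3UL << 5 };

	printf("entry maps to swapper space %ld\n",
	       (long)(swap_address_space(entry) - swapper_spaces));
	return 0;
}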
Signed-off-by: Sunghan Suh Reviewed-by: Bob Liu Reviewed-by: Wanpeng Li Acked-by: Seth Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zswap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/zswap.c b/mm/zswap.c index deda2b671e12..efed4c8b7f5b 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -409,7 +409,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry, struct page **retpage) { struct page *found_page, *new_page = NULL; - struct address_space *swapper_space = &swapper_spaces[swp_type(entry)]; + struct address_space *swapper_space = swap_address_space(entry); int err; *retpage = NULL; From 9824cf9753ecbe8f5b47aa9b2f218207defea211 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 11 Sep 2013 14:20:23 -0700 Subject: [PATCH 041/303] mm: vmstats: tlb flush counters I was investigating some TLB flush scaling issues and realized that we do not have any good methods for figuring out how many TLB flushes we are doing. It would be nice to be able to do these in generic code, but the arch-independent calls don't explicitly specify whether we actually need to do remote flushes or not. In the end, we really need to know if we actually _did_ global vs. local invalidations, so that leaves us with few options other than to muck with the counters from arch-specific code. Signed-off-by: Dave Hansen Cc: Peter Zijlstra Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/tlb.c | 18 ++++++++++++++---- include/linux/vm_event_item.h | 5 +++++ mm/vmstat.c | 5 +++++ 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 282375f13c7e..f030cbe669a5 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -103,6 +103,7 @@ static void flush_tlb_func(void *info) if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) return; + count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED); if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { if (f->flush_end == TLB_FLUSH_ALL) local_flush_tlb(); @@ -130,6 +131,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask, info.flush_start = start; info.flush_end = end; + count_vm_event(NR_TLB_REMOTE_FLUSH); if (is_uv_system()) { unsigned int cpu; @@ -149,6 +151,7 @@ void flush_tlb_current_task(void) preempt_disable(); + count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); @@ -211,16 +214,19 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, act_entries = mm->total_vm > tlb_entries ? 
tlb_entries : mm->total_vm; /* tlb_flushall_shift is on balance point, details in commit log */ - if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) + if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) { + count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); - else { + } else { if (has_large_page(mm, start, end)) { local_flush_tlb(); goto flush_all; } /* flush range by one by one 'invlpg' */ - for (addr = start; addr < end; addr += PAGE_SIZE) + for (addr = start; addr < end; addr += PAGE_SIZE) { + count_vm_event(NR_TLB_LOCAL_FLUSH_ONE); __flush_tlb_single(addr); + } if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) @@ -256,6 +262,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) static void do_flush_tlb_all(void *info) { + count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED); __flush_tlb_all(); if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) leave_mm(smp_processor_id()); @@ -263,6 +270,7 @@ static void do_flush_tlb_all(void *info) void flush_tlb_all(void) { + count_vm_event(NR_TLB_REMOTE_FLUSH); on_each_cpu(do_flush_tlb_all, NULL, 1); } @@ -272,8 +280,10 @@ static void do_kernel_range_flush(void *info) unsigned long addr; /* flush range by one by one 'invlpg' */ - for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE) + for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE) { + count_vm_event(NR_TLB_LOCAL_FLUSH_ONE_KERNEL); __flush_tlb_single(addr); + } } void flush_tlb_kernel_range(unsigned long start, unsigned long end) diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index bd6cf61142be..dc2cdf07ac14 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -70,6 +70,11 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_ZERO_PAGE_ALLOC, THP_ZERO_PAGE_ALLOC_FAILED, #endif + NR_TLB_REMOTE_FLUSH, /* cpu tried to flush others' tlbs */ + NR_TLB_REMOTE_FLUSH_RECEIVED,/* cpu received ipi for flush */ + NR_TLB_LOCAL_FLUSH_ALL, + NR_TLB_LOCAL_FLUSH_ONE, + NR_TLB_LOCAL_FLUSH_ONE_KERNEL, NR_VM_EVENT_ITEMS }; diff --git a/mm/vmstat.c b/mm/vmstat.c index 20c2ef4458fa..00382c53f582 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -817,6 +817,11 @@ const char * const vmstat_text[] = { "thp_zero_page_alloc", "thp_zero_page_alloc_failed", #endif + "nr_tlb_remote_flush", + "nr_tlb_remote_flush_received", + "nr_tlb_local_flush_all", + "nr_tlb_local_flush_one", + "nr_tlb_local_flush_one_kernel", #endif /* CONFIG_VM_EVENTS_COUNTERS */ }; From 6df46865ff8715932e7d42e52cac17e8461758cb Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 11 Sep 2013 14:20:24 -0700 Subject: [PATCH 042/303] mm: vmstats: track TLB flush stats on UP too The previous patch doing vmstats for TLB flushes ("mm: vmstats: tlb flush counters") effectively missed UP since arch/x86/mm/tlb.c is only compiled for SMP. UP systems do not do remote TLB flushes, so compile those counters out on UP. arch/x86/kernel/cpu/mtrr/generic.c calls __flush_tlb() directly. This is probably an optimization since both the mtrr code and __flush_tlb() write cr4. It would probably be safe to make that a flush_tlb_all() (and then get these statistics), but the mtrr code is ancient and I'm hesitant to touch it other than to just stick in the counters. [akpm@linux-foundation.org: tweak comments] Signed-off-by: Dave Hansen Cc: Peter Zijlstra Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/tlbflush.h | 37 +++++++++++++++++++++++++----- arch/x86/kernel/cpu/mtrr/generic.c | 2 ++ arch/x86/mm/tlb.c | 4 +--- include/linux/vm_event_item.h | 3 ++- mm/vmstat.c | 3 ++- 5 files changed, 38 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index cf512003e663..e6d90babc245 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -62,6 +62,7 @@ static inline void __flush_tlb_all(void) static inline void __flush_tlb_one(unsigned long addr) { + count_vm_event(NR_TLB_LOCAL_FLUSH_ONE); __flush_tlb_single(addr); } @@ -84,14 +85,38 @@ static inline void __flush_tlb_one(unsigned long addr) #ifndef CONFIG_SMP -#define flush_tlb() __flush_tlb() -#define flush_tlb_all() __flush_tlb_all() -#define local_flush_tlb() __flush_tlb() +/* "_up" is for UniProcessor. + * + * This is a helper for other header functions. *Not* intended to be called + * directly. All global TLB flushes need to either call this, or to bump the + * vm statistics themselves. + */ +static inline void __flush_tlb_up(void) +{ + count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + __flush_tlb(); +} + +static inline void flush_tlb_all(void) +{ + count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + __flush_tlb_all(); +} + +static inline void flush_tlb(void) +{ + __flush_tlb_up(); +} + +static inline void local_flush_tlb(void) +{ + __flush_tlb_up(); +} static inline void flush_tlb_mm(struct mm_struct *mm) { if (mm == current->active_mm) - __flush_tlb(); + __flush_tlb_up(); } static inline void flush_tlb_page(struct vm_area_struct *vma, @@ -105,14 +130,14 @@ static inline void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { if (vma->vm_mm == current->active_mm) - __flush_tlb(); + __flush_tlb_up(); } static inline void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long vmflag) { if (mm == current->active_mm) - __flush_tlb(); + __flush_tlb_up(); } static inline void native_flush_tlb_others(const struct cpumask *cpumask, diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index d4cdfa67509e..ce2d0a2c3e4f 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -683,6 +683,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) } /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ + count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); __flush_tlb(); /* Save MTRR state */ @@ -696,6 +697,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) static void post_set(void) __releases(set_atomicity_lock) { /* Flush TLBs (no need to flush caches - they are disabled) */ + count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); __flush_tlb(); /* Intel (P6) standard MTRRs */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index f030cbe669a5..ae699b3bbac8 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -280,10 +280,8 @@ static void do_kernel_range_flush(void *info) unsigned long addr; /* flush range by one by one 'invlpg' */ - for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE) { - count_vm_event(NR_TLB_LOCAL_FLUSH_ONE_KERNEL); + for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE) __flush_tlb_single(addr); - } } void flush_tlb_kernel_range(unsigned long start, unsigned long end) diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 
dc2cdf07ac14..1855f0a22add 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -70,11 +70,12 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_ZERO_PAGE_ALLOC, THP_ZERO_PAGE_ALLOC_FAILED, #endif +#ifdef CONFIG_SMP NR_TLB_REMOTE_FLUSH, /* cpu tried to flush others' tlbs */ NR_TLB_REMOTE_FLUSH_RECEIVED,/* cpu received ipi for flush */ +#endif NR_TLB_LOCAL_FLUSH_ALL, NR_TLB_LOCAL_FLUSH_ONE, - NR_TLB_LOCAL_FLUSH_ONE_KERNEL, NR_VM_EVENT_ITEMS }; diff --git a/mm/vmstat.c b/mm/vmstat.c index 00382c53f582..ca06e9653827 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -817,11 +817,12 @@ const char * const vmstat_text[] = { "thp_zero_page_alloc", "thp_zero_page_alloc_failed", #endif +#ifdef CONFIG_SMP "nr_tlb_remote_flush", "nr_tlb_remote_flush_received", +#endif "nr_tlb_local_flush_all", "nr_tlb_local_flush_one", - "nr_tlb_local_flush_one_kernel", #endif /* CONFIG_VM_EVENTS_COUNTERS */ }; From 3dbb95f7895e378514ffefa93cc887fb1bc9df94 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Wed, 11 Sep 2013 14:20:25 -0700 Subject: [PATCH 043/303] mm: replace strict_strtoul() with kstrtoul() The use of strict_strtoul() is not preferred, because strict_strtoul() is obsolete. Thus, kstrtoul() should be used. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 8 ++++---- mm/hugetlb.c | 4 ++-- mm/kmemleak.c | 2 +- mm/ksm.c | 6 +++--- mm/slub.c | 8 ++++---- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a92012a71702..8b7fc2025e04 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -417,7 +417,7 @@ static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, unsigned long msecs; int err; - err = strict_strtoul(buf, 10, &msecs); + err = kstrtoul(buf, 10, &msecs); if (err || msecs > UINT_MAX) return -EINVAL; @@ -444,7 +444,7 @@ static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, unsigned long msecs; int err; - err = strict_strtoul(buf, 10, &msecs); + err = kstrtoul(buf, 10, &msecs); if (err || msecs > UINT_MAX) return -EINVAL; @@ -470,7 +470,7 @@ static ssize_t pages_to_scan_store(struct kobject *kobj, int err; unsigned long pages; - err = strict_strtoul(buf, 10, &pages); + err = kstrtoul(buf, 10, &pages); if (err || !pages || pages > UINT_MAX) return -EINVAL; @@ -538,7 +538,7 @@ static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, int err; unsigned long max_ptes_none; - err = strict_strtoul(buf, 10, &max_ptes_none); + err = kstrtoul(buf, 10, &max_ptes_none); if (err || max_ptes_none > HPAGE_PMD_NR-1) return -EINVAL; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b60f33080a28..6e514831bda5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1526,7 +1526,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, struct hstate *h; NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); - err = strict_strtoul(buf, 10, &count); + err = kstrtoul(buf, 10, &count); if (err) goto out; @@ -1617,7 +1617,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, if (h->order >= MAX_ORDER) return -EINVAL; - err = strict_strtoul(buf, 10, &input); + err = kstrtoul(buf, 10, &input); if (err) return err; diff --git a/mm/kmemleak.c b/mm/kmemleak.c index c8d7f3110fd0..e126b0ef9ad2 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1639,7 +1639,7 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, else if (strncmp(buf, "scan=", 5) == 0) { unsigned long secs; - ret = strict_strtoul(buf + 5, 
0, &secs); + ret = kstrtoul(buf + 5, 0, &secs); if (ret < 0) goto out; stop_scan_thread(); diff --git a/mm/ksm.c b/mm/ksm.c index b6afe0c440d8..0bea2b262a47 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2194,7 +2194,7 @@ static ssize_t sleep_millisecs_store(struct kobject *kobj, unsigned long msecs; int err; - err = strict_strtoul(buf, 10, &msecs); + err = kstrtoul(buf, 10, &msecs); if (err || msecs > UINT_MAX) return -EINVAL; @@ -2217,7 +2217,7 @@ static ssize_t pages_to_scan_store(struct kobject *kobj, int err; unsigned long nr_pages; - err = strict_strtoul(buf, 10, &nr_pages); + err = kstrtoul(buf, 10, &nr_pages); if (err || nr_pages > UINT_MAX) return -EINVAL; @@ -2239,7 +2239,7 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, int err; unsigned long flags; - err = strict_strtoul(buf, 10, &flags); + err = kstrtoul(buf, 10, &flags); if (err || flags > UINT_MAX) return -EINVAL; if (flags > KSM_RUN_UNMERGE) diff --git a/mm/slub.c b/mm/slub.c index e3ba1f2cf60c..51df8272cfaf 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4420,7 +4420,7 @@ static ssize_t order_store(struct kmem_cache *s, unsigned long order; int err; - err = strict_strtoul(buf, 10, &order); + err = kstrtoul(buf, 10, &order); if (err) return err; @@ -4448,7 +4448,7 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, unsigned long min; int err; - err = strict_strtoul(buf, 10, &min); + err = kstrtoul(buf, 10, &min); if (err) return err; @@ -4468,7 +4468,7 @@ static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, unsigned long objects; int err; - err = strict_strtoul(buf, 10, &objects); + err = kstrtoul(buf, 10, &objects); if (err) return err; if (objects && !kmem_cache_has_cpu_partial(s)) @@ -4784,7 +4784,7 @@ static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s, unsigned long ratio; int err; - err = strict_strtoul(buf, 10, &ratio); + err = kstrtoul(buf, 10, &ratio); if (err) return err; From a7e833182a926ae5bc03204dbd00b0bb5539088b Mon Sep 17 00:00:00 2001 From: Jerry Zhou Date: Wed, 11 Sep 2013 14:20:26 -0700 Subject: [PATCH 044/303] mm: fix negative left shift count when PAGE_SHIFT > 20 When PAGE_SHIFT > 20, the result of "20 - PAGE_SHIFT" is negative. The previous calculating here will generate an unexpected result. In addition, if PAGE_SIZE >= 1MB, The memory size of "numentries" was already integral multiple of 1MB. Signed-off-by: Jerry Zhou Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c2b59dbda196..116bab1c2cf5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5745,9 +5745,10 @@ void *__init alloc_large_system_hash(const char *tablename, if (!numentries) { /* round applicable memory size up to nearest megabyte */ numentries = nr_kernel_pages; - numentries += (1UL << (20 - PAGE_SHIFT)) - 1; - numentries >>= 20 - PAGE_SHIFT; - numentries <<= 20 - PAGE_SHIFT; + + /* It isn't necessary when PAGE_SIZE >= 1MB */ + if (PAGE_SHIFT < 20) + numentries = round_up(numentries, (1<<20)/PAGE_SIZE); /* limit to 1 bucket per 2^scale bytes of low memory */ if (scale > PAGE_SHIFT) From 15ca220e1a63af06e000691e4ae1beaba5430c32 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Wed, 11 Sep 2013 14:20:27 -0700 Subject: [PATCH 045/303] mm/page_alloc.c: use '__paginginit' instead of '__init' set_pageblock_order() may be called when memory hotplug, so need use '__paginginit' instead of '__init'. 
The related warning: The function __meminit .free_area_init_node() references a function __init .set_pageblock_order(). If .set_pageblock_order is only used by .free_area_init_node then annotate .set_pageblock_order with a matching annotation. Signed-off-by: Chen Gang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 116bab1c2cf5..6cf157637df3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4586,7 +4586,7 @@ static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ -void __init set_pageblock_order(void) +void __paginginit set_pageblock_order(void) { unsigned int order; @@ -4614,7 +4614,7 @@ void __init set_pageblock_order(void) * include/linux/pageblock-flags.h for the values of pageblock_order based on * the kernel config */ -void __init set_pageblock_order(void) +void __paginginit set_pageblock_order(void) { } From 2a8f9449343260373398d59228a62a4332ea513a Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 11 Sep 2013 14:20:28 -0700 Subject: [PATCH 046/303] swap: change block allocation algorithm for SSD I'm using a fast SSD to do swap. scan_swap_map() sometimes uses up to 20~30% CPU time (when a cluster is hard to find, the CPU time can be up to 80%), which becomes a bottleneck. scan_swap_map() scans a byte array to search for a 256-page cluster, which is very slow. Here I introduce a simple algorithm to search for a cluster. Since we only care about 256-page clusters, we can just use a counter to track whether a cluster is free. Every 256 pages use one int to store the counter. If the counter of a cluster is 0, the cluster is free. All free clusters are added to a list, so searching for a cluster is very efficient. With this, the scan_swap_map() overhead disappears. This might help low-end SD card swap too, because if the cluster is aligned, SD firmware can do flash erase more efficiently. We only enable the algorithm for SSD. Hard disk swap isn't fast enough, and the algorithm has downsides that might introduce a regression there (see below). The patch slightly changes which cluster is chosen. It always adds a free cluster to the list tail, which can help wear leveling for low-end SSD too. And if no free cluster is found, scan_swap_map() will search from the end of the last free cluster, which is effectively random. For SSD, this isn't a problem at all. Another downside is that the cluster must be aligned to 256 pages, which reduces the chance of finding a cluster. I would expect this isn't a big problem for SSD because there is no seek penalty. (And this is the reason I only enable the algorithm for SSD.)
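To make the counter-plus-free-list idea concrete, here is a minimal userspace model. It is only a sketch of the bookkeeping (the kernel packs the count and the list link into a single swap_cluster_info word, and NR_CLUSTERS and the helper names below are invented for the example); cluster indices stand in for 256-page ranges.

#include <assert.h>
#include <stdio.h>

#define NR_CLUSTERS 8
#define CLUSTER_NULL (-1)

static int usage[NR_CLUSTERS];      /* page-usage counter per cluster */
static int next_free[NR_CLUSTERS];  /* singly linked free-cluster list */
static int free_head = CLUSTER_NULL, free_tail = CLUSTER_NULL;

/* A cluster whose counter reached zero goes to the list tail; always
 * appending at the tail spreads allocations for wear leveling. */
static void cluster_free(int idx)
{
	next_free[idx] = CLUSTER_NULL;
	if (free_head == CLUSTER_NULL)
		free_head = idx;
	else
		next_free[free_tail] = idx;
	free_tail = idx;
}

/* O(1): pop the list head instead of scanning a byte map for a run of
 * 256 free slots. */
static int cluster_alloc(void)
{
	int idx = free_head;

	if (idx == CLUSTER_NULL)
		return CLUSTER_NULL;
	free_head = next_free[idx];
	if (free_head == CLUSTER_NULL)
		free_tail = CLUSTER_NULL;
	return idx;
}

int main(void)
{
	int idx;

	for (idx = 0; idx < NR_CLUSTERS; idx++)
		cluster_free(idx);

	idx = cluster_alloc();
	assert(idx == 0);
	usage[idx]++;			/* one page allocated in cluster 0 */
	if (--usage[idx] == 0)		/* last page freed: recycle cluster */
		cluster_free(idx);
	assert(free_tail == 0);		/* recycled to the tail, not the head */
	printf("ok\n");
	return 0;
}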
Signed-off-by: Shaohua Li Cc: Rik van Riel Cc: Minchan Kim Cc: Kyungmin Park Cc: Hugh Dickins Cc: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 20 +++ mm/swapfile.c | 286 +++++++++++++++++++++++++++++++++++++------ 2 files changed, 269 insertions(+), 37 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index d95cde5e257d..cb5baebf31d6 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -181,6 +181,23 @@ enum { #define COUNT_CONTINUED 0x80 /* See swap_map continuation for full count */ #define SWAP_MAP_SHMEM 0xbf /* Owned by shmem/tmpfs, in first swap_map */ +/* + * We use this to track usage of a cluster. A cluster is a block of swap disk + * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All + * free clusters are organized into a list. We fetch an entry from the list to + * get a free cluster. + * + * The data field stores next cluster if the cluster is free or cluster usage + * counter otherwise. The flags field determines if a cluster is free. This is + * protected by swap_info_struct.lock. + */ +struct swap_cluster_info { + unsigned int data:24; + unsigned int flags:8; +}; +#define CLUSTER_FLAG_FREE 1 /* This cluster is free */ +#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ + /* * The in-memory structure used to track swap areas. */ @@ -191,6 +208,9 @@ struct swap_info_struct { signed char next; /* next type on the swap list */ unsigned int max; /* extent of the swap_map */ unsigned char *swap_map; /* vmalloc'ed array of usage counts */ + struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ + struct swap_cluster_info free_cluster_head; /* free cluster list head */ + struct swap_cluster_info free_cluster_tail; /* free cluster list tail */ unsigned int lowest_bit; /* index of first free in swap_map */ unsigned int highest_bit; /* index of last free in swap_map */ unsigned int pages; /* total of usable pages of swap */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 6ef2d15c5fe3..d1fbeb486de5 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -184,6 +184,134 @@ static int wait_for_discard(void *word) #define SWAPFILE_CLUSTER 256 #define LATENCY_LIMIT 256 +static inline void cluster_set_flag(struct swap_cluster_info *info, + unsigned int flag) +{ + info->flags = flag; +} + +static inline unsigned int cluster_count(struct swap_cluster_info *info) +{ + return info->data; +} + +static inline void cluster_set_count(struct swap_cluster_info *info, + unsigned int c) +{ + info->data = c; +} + +static inline void cluster_set_count_flag(struct swap_cluster_info *info, + unsigned int c, unsigned int f) +{ + info->flags = f; + info->data = c; +} + +static inline unsigned int cluster_next(struct swap_cluster_info *info) +{ + return info->data; +} + +static inline void cluster_set_next(struct swap_cluster_info *info, + unsigned int n) +{ + info->data = n; +} + +static inline void cluster_set_next_flag(struct swap_cluster_info *info, + unsigned int n, unsigned int f) +{ + info->flags = f; + info->data = n; +} + +static inline bool cluster_is_free(struct swap_cluster_info *info) +{ + return info->flags & CLUSTER_FLAG_FREE; +} + +static inline bool cluster_is_null(struct swap_cluster_info *info) +{ + return info->flags & CLUSTER_FLAG_NEXT_NULL; +} + +static inline void cluster_set_null(struct swap_cluster_info *info) +{ + info->flags = CLUSTER_FLAG_NEXT_NULL; + info->data = 0; +} + +/* + * The cluster corresponding to page_nr will be used. 
The cluster will be + * removed from free cluster list and its usage counter will be increased. + */ +static void inc_cluster_info_page(struct swap_info_struct *p, + struct swap_cluster_info *cluster_info, unsigned long page_nr) +{ + unsigned long idx = page_nr / SWAPFILE_CLUSTER; + + if (!cluster_info) + return; + if (cluster_is_free(&cluster_info[idx])) { + VM_BUG_ON(cluster_next(&p->free_cluster_head) != idx); + cluster_set_next_flag(&p->free_cluster_head, + cluster_next(&cluster_info[idx]), 0); + if (cluster_next(&p->free_cluster_tail) == idx) { + cluster_set_null(&p->free_cluster_tail); + cluster_set_null(&p->free_cluster_head); + } + cluster_set_count_flag(&cluster_info[idx], 0, 0); + } + + VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER); + cluster_set_count(&cluster_info[idx], + cluster_count(&cluster_info[idx]) + 1); +} + +/* + * The cluster corresponding to page_nr decreases one usage. If the usage + * counter becomes 0, which means no page in the cluster is in using, we can + * optionally discard the cluster and add it to free cluster list. + */ +static void dec_cluster_info_page(struct swap_info_struct *p, + struct swap_cluster_info *cluster_info, unsigned long page_nr) +{ + unsigned long idx = page_nr / SWAPFILE_CLUSTER; + + if (!cluster_info) + return; + + VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0); + cluster_set_count(&cluster_info[idx], + cluster_count(&cluster_info[idx]) - 1); + + if (cluster_count(&cluster_info[idx]) == 0) { + cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); + if (cluster_is_null(&p->free_cluster_head)) { + cluster_set_next_flag(&p->free_cluster_head, idx, 0); + cluster_set_next_flag(&p->free_cluster_tail, idx, 0); + } else { + unsigned int tail = cluster_next(&p->free_cluster_tail); + cluster_set_next(&cluster_info[tail], idx); + cluster_set_next_flag(&p->free_cluster_tail, idx, 0); + } + } +} + +/* + * It's possible scan_swap_map() uses a free cluster in the middle of free + * cluster list. Avoiding such abuse to avoid list corruption. 
+ */ +static inline bool scan_swap_map_recheck_cluster(struct swap_info_struct *si, + unsigned long offset) +{ + offset /= SWAPFILE_CLUSTER; + return !cluster_is_null(&si->free_cluster_head) && + offset != cluster_next(&si->free_cluster_head) && + cluster_is_free(&si->cluster_info[offset]); +} + static unsigned long scan_swap_map(struct swap_info_struct *si, unsigned char usage) { @@ -225,6 +353,25 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, si->lowest_alloc = si->max; si->highest_alloc = 0; } +check_cluster: + if (!cluster_is_null(&si->free_cluster_head)) { + offset = cluster_next(&si->free_cluster_head) * + SWAPFILE_CLUSTER; + last_in_cluster = offset + SWAPFILE_CLUSTER - 1; + si->cluster_next = offset; + si->cluster_nr = SWAPFILE_CLUSTER - 1; + found_free_cluster = 1; + goto checks; + } else if (si->cluster_info) { + /* + * Checking free cluster is fast enough, we can do the + * check every time + */ + si->cluster_nr = 0; + si->lowest_alloc = 0; + goto checks; + } + spin_unlock(&si->lock); /* @@ -285,6 +432,8 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, } checks: + if (scan_swap_map_recheck_cluster(si, offset)) + goto check_cluster; if (!(si->flags & SWP_WRITEOK)) goto no_page; if (!si->highest_bit) @@ -317,6 +466,7 @@ checks: si->highest_bit = 0; } si->swap_map[offset] = usage; + inc_cluster_info_page(si, si->cluster_info, offset); si->cluster_next = offset + 1; si->flags -= SWP_SCANNING; @@ -600,6 +750,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, /* free if no reference */ if (!usage) { + dec_cluster_info_page(p, p->cluster_info, offset); if (offset < p->lowest_bit) p->lowest_bit = offset; if (offset > p->highest_bit) @@ -1524,7 +1675,8 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) } static void _enable_swap_info(struct swap_info_struct *p, int prio, - unsigned char *swap_map) + unsigned char *swap_map, + struct swap_cluster_info *cluster_info) { int i, prev; @@ -1533,6 +1685,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, else p->prio = --least_priority; p->swap_map = swap_map; + p->cluster_info = cluster_info; p->flags |= SWP_WRITEOK; atomic_long_add(p->pages, &nr_swap_pages); total_swap_pages += p->pages; @@ -1553,12 +1706,13 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, static void enable_swap_info(struct swap_info_struct *p, int prio, unsigned char *swap_map, + struct swap_cluster_info *cluster_info, unsigned long *frontswap_map) { frontswap_init(p->type, frontswap_map); spin_lock(&swap_lock); spin_lock(&p->lock); - _enable_swap_info(p, prio, swap_map); + _enable_swap_info(p, prio, swap_map, cluster_info); spin_unlock(&p->lock); spin_unlock(&swap_lock); } @@ -1567,7 +1721,7 @@ static void reinsert_swap_info(struct swap_info_struct *p) { spin_lock(&swap_lock); spin_lock(&p->lock); - _enable_swap_info(p, p->prio, p->swap_map); + _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info); spin_unlock(&p->lock); spin_unlock(&swap_lock); } @@ -1576,6 +1730,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct *p = NULL; unsigned char *swap_map; + struct swap_cluster_info *cluster_info; unsigned long *frontswap_map; struct file *swap_file, *victim; struct address_space *mapping; @@ -1675,6 +1830,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->max = 0; swap_map = p->swap_map; p->swap_map = NULL; + cluster_info = p->cluster_info; + p->cluster_info = NULL; p->flags = 0; frontswap_map 
= frontswap_map_get(p); frontswap_map_set(p, NULL); @@ -1683,6 +1840,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) frontswap_invalidate_area(type); mutex_unlock(&swapon_mutex); vfree(swap_map); + vfree(cluster_info); vfree(frontswap_map); /* Destroy swap account informatin */ swap_cgroup_swapoff(type); @@ -2000,15 +2158,21 @@ static unsigned long read_swap_header(struct swap_info_struct *p, static int setup_swap_map_and_extents(struct swap_info_struct *p, union swap_header *swap_header, unsigned char *swap_map, + struct swap_cluster_info *cluster_info, unsigned long maxpages, sector_t *span) { int i; unsigned int nr_good_pages; int nr_extents; + unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); + unsigned long idx = p->cluster_next / SWAPFILE_CLUSTER; nr_good_pages = maxpages - 1; /* omit header page */ + cluster_set_null(&p->free_cluster_head); + cluster_set_null(&p->free_cluster_tail); + for (i = 0; i < swap_header->info.nr_badpages; i++) { unsigned int page_nr = swap_header->info.badpages[i]; if (page_nr == 0 || page_nr > swap_header->info.last_page) @@ -2016,11 +2180,25 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, if (page_nr < maxpages) { swap_map[page_nr] = SWAP_MAP_BAD; nr_good_pages--; + /* + * Haven't marked the cluster free yet, no list + * operation involved + */ + inc_cluster_info_page(p, cluster_info, page_nr); } } + /* Haven't marked the cluster free yet, no list operation involved */ + for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) + inc_cluster_info_page(p, cluster_info, i); + if (nr_good_pages) { swap_map[0] = SWAP_MAP_BAD; + /* + * Not mark the cluster free yet, no list + * operation involved + */ + inc_cluster_info_page(p, cluster_info, 0); p->max = maxpages; p->pages = nr_good_pages; nr_extents = setup_swap_extents(p, span); @@ -2033,6 +2211,30 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, return -EINVAL; } + if (!cluster_info) + return nr_extents; + + for (i = 0; i < nr_clusters; i++) { + if (!cluster_count(&cluster_info[idx])) { + cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); + if (cluster_is_null(&p->free_cluster_head)) { + cluster_set_next_flag(&p->free_cluster_head, + idx, 0); + cluster_set_next_flag(&p->free_cluster_tail, + idx, 0); + } else { + unsigned int tail; + + tail = cluster_next(&p->free_cluster_tail); + cluster_set_next(&cluster_info[tail], idx); + cluster_set_next_flag(&p->free_cluster_tail, + idx, 0); + } + } + idx++; + if (idx == nr_clusters) + idx = 0; + } return nr_extents; } @@ -2064,6 +2266,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) sector_t span; unsigned long maxpages; unsigned char *swap_map = NULL; + struct swap_cluster_info *cluster_info = NULL; unsigned long *frontswap_map = NULL; struct page *page = NULL; struct inode *inode = NULL; @@ -2137,13 +2340,28 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = -ENOMEM; goto bad_swap; } + if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { + p->flags |= SWP_SOLIDSTATE; + /* + * select a random position to start with to help wear leveling + * SSD + */ + p->cluster_next = 1 + (prandom_u32() % p->highest_bit); + + cluster_info = vzalloc(DIV_ROUND_UP(maxpages, + SWAPFILE_CLUSTER) * sizeof(*cluster_info)); + if (!cluster_info) { + error = -ENOMEM; + goto bad_swap; + } + } error = swap_cgroup_swapon(p->type, maxpages); if (error) goto bad_swap; nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map, - 
maxpages, &span); + cluster_info, maxpages, &span); if (unlikely(nr_extents < 0)) { error = nr_extents; goto bad_swap; @@ -2152,40 +2370,33 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (frontswap_enabled) frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long)); - if (p->bdev) { - if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { - p->flags |= SWP_SOLIDSTATE; - p->cluster_next = 1 + (prandom_u32() % p->highest_bit); - } + if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { + /* + * When discard is enabled for swap with no particular + * policy flagged, we set all swap discard flags here in + * order to sustain backward compatibility with older + * swapon(8) releases. + */ + p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | + SWP_PAGE_DISCARD); - if ((swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { - /* - * When discard is enabled for swap with no particular - * policy flagged, we set all swap discard flags here in - * order to sustain backward compatibility with older - * swapon(8) releases. - */ - p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | - SWP_PAGE_DISCARD); + /* + * By flagging sys_swapon, a sysadmin can tell us to + * either do single-time area discards only, or to just + * perform discards for released swap page-clusters. + * Now it's time to adjust the p->flags accordingly. + */ + if (swap_flags & SWAP_FLAG_DISCARD_ONCE) + p->flags &= ~SWP_PAGE_DISCARD; + else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) + p->flags &= ~SWP_AREA_DISCARD; - /* - * By flagging sys_swapon, a sysadmin can tell us to - * either do single-time area discards only, or to just - * perform discards for released swap page-clusters. - * Now it's time to adjust the p->flags accordingly. - */ - if (swap_flags & SWAP_FLAG_DISCARD_ONCE) - p->flags &= ~SWP_PAGE_DISCARD; - else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) - p->flags &= ~SWP_AREA_DISCARD; - - /* issue a swapon-time discard if it's still required */ - if (p->flags & SWP_AREA_DISCARD) { - int err = discard_swap(p); - if (unlikely(err)) - pr_err("swapon: discard_swap(%p): %d\n", - p, err); - } + /* issue a swapon-time discard if it's still required */ + if (p->flags & SWP_AREA_DISCARD) { + int err = discard_swap(p); + if (unlikely(err)) + pr_err("swapon: discard_swap(%p): %d\n", + p, err); } } @@ -2194,7 +2405,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (swap_flags & SWAP_FLAG_PREFER) prio = (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; - enable_swap_info(p, prio, swap_map, frontswap_map); + enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map); pr_info("Adding %uk swap on %s. " "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", @@ -2226,6 +2437,7 @@ bad_swap: p->flags = 0; spin_unlock(&swap_lock); vfree(swap_map); + vfree(cluster_info); if (swap_file) { if (inode && S_ISREG(inode->i_mode)) { mutex_unlock(&inode->i_mutex); From 815c2c543d3aeb914a361f981440ece552778724 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 11 Sep 2013 14:20:30 -0700 Subject: [PATCH 047/303] swap: make swap discard async swap can do cluster discard for SSD, which is good, but there are some problems here: 1. swap do the discard just before page reclaim gets a swap entry and writes the disk sectors. This is useless for high end SSD, because an overwrite to a sector implies a discard to original sector too. A discard + overwrite == overwrite. 2. the purpose of doing discard is to improve SSD firmware garbage collection. 
Ideally we should send discard as early as possible, so firmware can do something smart. Sending discard just after a swap entry is freed is considered early compared to sending discard before write. Of course, if the workload is already bound to gc speed, sending discard earlier or later doesn't make much difference. 3. block discard is a sync API, which will delay scan_swap_map() significantly. 4. Write and discard commands can be executed in parallel on PCIe SSD. Making swap discard async makes execution more efficient. This patch makes swap discard async and moves discard to where the swap entry is freed. Discard and write have no dependency now, so the above issues can be avoided. Ideally we should do discard for any freed sectors, but some SSD discard is very slow. This patch still does discard for a whole cluster. My test does several rounds of 'mmap, write, unmap', which triggers a lot of swap discard. In a fusionio card, with this patch, the test runtime is reduced to 18% of the time without it, so around 5.5x faster. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Shaohua Li Cc: Rik van Riel Cc: Minchan Kim Cc: Kyungmin Park Cc: Hugh Dickins Cc: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 20 +++-- mm/swapfile.c | 192 +++++++++++++++++++++++++------------ 2 files changed, 125 insertions(+), 87 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index cb5baebf31d6..8a3c4a1caa14 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -217,8 +217,6 @@ struct swap_info_struct { unsigned int inuse_pages; /* number of those currently in use */ unsigned int cluster_next; /* likely index for next allocation */ unsigned int cluster_nr; /* countdown to next cluster search */ - unsigned int lowest_alloc; /* while preparing discard cluster */ - unsigned int highest_alloc; /* while preparing discard cluster */ struct swap_extent *curr_swap_extent; struct swap_extent first_swap_extent; struct block_device *bdev; /* swap device or bdev of swap file */ @@ -232,14 +230,18 @@ struct swap_info_struct { * protect map scan related fields like * swap_map, lowest_bit, highest_bit, * inuse_pages, cluster_next, - * cluster_nr, lowest_alloc and - * highest_alloc. other fields are only - * changed at swapon/swapoff, so are - * protected by swap_lock. changing - * flags need hold this lock and - * swap_lock. If both locks need hold, - * hold swap_lock first. + * cluster_nr, lowest_alloc, + * highest_alloc, free/discard cluster + * list. other fields are only changed + * at swapon/swapoff, so are protected + * by swap_lock. changing flags need + * hold this lock and swap_lock. If + * both locks need hold, hold swap_lock + * first.
*/ + struct work_struct discard_work; /* discard worker */ + struct swap_cluster_info discard_cluster_head; /* list head of discard clusters */ + struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */ }; struct swap_list_t { diff --git a/mm/swapfile.c b/mm/swapfile.c index d1fbeb486de5..dac47c66055c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -175,12 +175,6 @@ static void discard_swap_cluster(struct swap_info_struct *si, } } -static int wait_for_discard(void *word) -{ - schedule(); - return 0; -} - #define SWAPFILE_CLUSTER 256 #define LATENCY_LIMIT 256 @@ -242,6 +236,90 @@ static inline void cluster_set_null(struct swap_cluster_info *info) info->data = 0; } +/* Add a cluster to discard list and schedule it to do discard */ +static void swap_cluster_schedule_discard(struct swap_info_struct *si, + unsigned int idx) +{ + /* + * If scan_swap_map() can't find a free cluster, it will check + * si->swap_map directly. To make sure the discarding cluster isn't + * taken by scan_swap_map(), mark the swap entries bad (occupied). It + * will be cleared after discard + */ + memset(si->swap_map + idx * SWAPFILE_CLUSTER, + SWAP_MAP_BAD, SWAPFILE_CLUSTER); + + if (cluster_is_null(&si->discard_cluster_head)) { + cluster_set_next_flag(&si->discard_cluster_head, + idx, 0); + cluster_set_next_flag(&si->discard_cluster_tail, + idx, 0); + } else { + unsigned int tail = cluster_next(&si->discard_cluster_tail); + cluster_set_next(&si->cluster_info[tail], idx); + cluster_set_next_flag(&si->discard_cluster_tail, + idx, 0); + } + + schedule_work(&si->discard_work); +} + +/* + * Doing discard actually. After a cluster discard is finished, the cluster + * will be added to free cluster list. caller should hold si->lock. +*/ +static void swap_do_scheduled_discard(struct swap_info_struct *si) +{ + struct swap_cluster_info *info; + unsigned int idx; + + info = si->cluster_info; + + while (!cluster_is_null(&si->discard_cluster_head)) { + idx = cluster_next(&si->discard_cluster_head); + + cluster_set_next_flag(&si->discard_cluster_head, + cluster_next(&info[idx]), 0); + if (cluster_next(&si->discard_cluster_tail) == idx) { + cluster_set_null(&si->discard_cluster_head); + cluster_set_null(&si->discard_cluster_tail); + } + spin_unlock(&si->lock); + + discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, + SWAPFILE_CLUSTER); + + spin_lock(&si->lock); + cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE); + if (cluster_is_null(&si->free_cluster_head)) { + cluster_set_next_flag(&si->free_cluster_head, + idx, 0); + cluster_set_next_flag(&si->free_cluster_tail, + idx, 0); + } else { + unsigned int tail; + + tail = cluster_next(&si->free_cluster_tail); + cluster_set_next(&info[tail], idx); + cluster_set_next_flag(&si->free_cluster_tail, + idx, 0); + } + memset(si->swap_map + idx * SWAPFILE_CLUSTER, + 0, SWAPFILE_CLUSTER); + } +} + +static void swap_discard_work(struct work_struct *work) +{ + struct swap_info_struct *si; + + si = container_of(work, struct swap_info_struct, discard_work); + + spin_lock(&si->lock); + swap_do_scheduled_discard(si); + spin_unlock(&si->lock); +} + /* * The cluster corresponding to page_nr will be used. The cluster will be * removed from free cluster list and its usage counter will be increased. @@ -287,6 +365,16 @@ static void dec_cluster_info_page(struct swap_info_struct *p, cluster_count(&cluster_info[idx]) - 1); if (cluster_count(&cluster_info[idx]) == 0) { + /* + * If the swap is discardable, prepare discard the cluster + * instead of free it immediately. 
The cluster will be freed + * after discard. + */ + if (p->flags & SWP_PAGE_DISCARD) { + swap_cluster_schedule_discard(p, idx); + return; + } + cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); if (cluster_is_null(&p->free_cluster_head)) { cluster_set_next_flag(&p->free_cluster_head, idx, 0); @@ -319,7 +407,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, unsigned long scan_base; unsigned long last_in_cluster = 0; int latency_ration = LATENCY_LIMIT; - int found_free_cluster = 0; /* * We try to cluster swap pages by allocating them sequentially @@ -340,19 +427,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, si->cluster_nr = SWAPFILE_CLUSTER - 1; goto checks; } - if (si->flags & SWP_PAGE_DISCARD) { - /* - * Start range check on racing allocations, in case - * they overlap the cluster we eventually decide on - * (we scan without swap_lock to allow preemption). - * It's hardly conceivable that cluster_nr could be - * wrapped during our scan, but don't depend on it. - */ - if (si->lowest_alloc) - goto checks; - si->lowest_alloc = si->max; - si->highest_alloc = 0; - } check_cluster: if (!cluster_is_null(&si->free_cluster_head)) { offset = cluster_next(&si->free_cluster_head) * @@ -360,15 +434,27 @@ check_cluster: last_in_cluster = offset + SWAPFILE_CLUSTER - 1; si->cluster_next = offset; si->cluster_nr = SWAPFILE_CLUSTER - 1; - found_free_cluster = 1; goto checks; } else if (si->cluster_info) { + /* + * we don't have free cluster but have some clusters in + * discarding, do discard now and reclaim them + */ + if (!cluster_is_null(&si->discard_cluster_head)) { + si->cluster_nr = 0; + swap_do_scheduled_discard(si); + scan_base = offset = si->cluster_next; + if (!si->cluster_nr) + goto check_cluster; + si->cluster_nr--; + goto checks; + } + /* * Checking free cluster is fast enough, we can do the * check every time */ si->cluster_nr = 0; - si->lowest_alloc = 0; goto checks; } @@ -395,7 +481,6 @@ check_cluster: offset -= SWAPFILE_CLUSTER - 1; si->cluster_next = offset; si->cluster_nr = SWAPFILE_CLUSTER - 1; - found_free_cluster = 1; goto checks; } if (unlikely(--latency_ration < 0)) { @@ -416,7 +501,6 @@ check_cluster: offset -= SWAPFILE_CLUSTER - 1; si->cluster_next = offset; si->cluster_nr = SWAPFILE_CLUSTER - 1; - found_free_cluster = 1; goto checks; } if (unlikely(--latency_ration < 0)) { @@ -428,7 +512,6 @@ check_cluster: offset = scan_base; spin_lock(&si->lock); si->cluster_nr = SWAPFILE_CLUSTER - 1; - si->lowest_alloc = 0; } checks: @@ -470,59 +553,6 @@ checks: si->cluster_next = offset + 1; si->flags -= SWP_SCANNING; - if (si->lowest_alloc) { - /* - * Only set when SWP_PAGE_DISCARD, and there's a scan - * for a free cluster in progress or just completed. - */ - if (found_free_cluster) { - /* - * To optimize wear-levelling, discard the - * old data of the cluster, taking care not to - * discard any of its pages that have already - * been allocated by racing tasks (offset has - * already stepped over any at the beginning). 
- */ - if (offset < si->highest_alloc && - si->lowest_alloc <= last_in_cluster) - last_in_cluster = si->lowest_alloc - 1; - si->flags |= SWP_DISCARDING; - spin_unlock(&si->lock); - - if (offset < last_in_cluster) - discard_swap_cluster(si, offset, - last_in_cluster - offset + 1); - - spin_lock(&si->lock); - si->lowest_alloc = 0; - si->flags &= ~SWP_DISCARDING; - - smp_mb(); /* wake_up_bit advises this */ - wake_up_bit(&si->flags, ilog2(SWP_DISCARDING)); - - } else if (si->flags & SWP_DISCARDING) { - /* - * Delay using pages allocated by racing tasks - * until the whole discard has been issued. We - * could defer that delay until swap_writepage, - * but it's easier to keep this self-contained. - */ - spin_unlock(&si->lock); - wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), - wait_for_discard, TASK_UNINTERRUPTIBLE); - spin_lock(&si->lock); - } else { - /* - * Note pages allocated by racing tasks while - * scan for a free cluster is in progress, so - * that its final discard can exclude them. - */ - if (offset < si->lowest_alloc) - si->lowest_alloc = offset; - if (offset > si->highest_alloc) - si->highest_alloc = offset; - } - } return offset; scan: @@ -1806,6 +1836,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) goto out_dput; } + flush_work(&p->discard_work); + destroy_swap_extents(p); if (p->flags & SWP_CONTINUED) free_swap_count_continuations(p); @@ -2172,6 +2204,8 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, cluster_set_null(&p->free_cluster_head); cluster_set_null(&p->free_cluster_tail); + cluster_set_null(&p->discard_cluster_head); + cluster_set_null(&p->discard_cluster_tail); for (i = 0; i < swap_header->info.nr_badpages; i++) { unsigned int page_nr = swap_header->info.badpages[i]; @@ -2281,6 +2315,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (IS_ERR(p)) return PTR_ERR(p); + INIT_WORK(&p->discard_work, swap_discard_work); + name = getname(specialfile); if (IS_ERR(name)) { error = PTR_ERR(name); From edfe23dac3e2981277087b05bec7fec7790d1835 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 11 Sep 2013 14:20:31 -0700 Subject: [PATCH 048/303] swap: fix races exposed by swap discard The previous patch can expose races, according to Hugh: swapoff was sometimes failing with "Cannot allocate memory", coming from try_to_unuse()'s -ENOMEM: it needs to allow for swap_duplicate() failing on a free entry temporarily SWAP_MAP_BAD while being discarded. We should use ACCESS_ONCE() there, and whenever accessing swap_map locklessly; but rather than peppering it throughout try_to_unuse(), just declare *swap_map with volatile. try_to_unuse() is accustomed to *swap_map going down racily, but not necessarily to it jumping up from 0 to SWAP_MAP_BAD: we'll be safer to prevent that transition once SWP_WRITEOK is switched off, when it's a waste of time to issue discards anyway (swapon can do a whole discard). Another issue is: In swapin_readahead(), read_swap_cache_async() can read a bad swap entry, because we don't check if readahead swap entry is bad. This doesn't break anything but such swapin page is wasteful and can only be freed at page reclaim. We should avoid read such swap entry. And in discard, we mark swap entry SWAP_MAP_BAD and then switch it to normal when discard is finished. If readahead reads such swap entry, we have the same issue, so we much check if swap entry is bad too. Thanks Hugh to inspire swapin_readahead could use bad swap entry. 
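The single-snapshot rule behind the ACCESS_ONCE()/volatile change can be sketched in portable C11. This is a model only: the kernel code here uses a volatile pointer rather than <stdatomic.h>, and should_skip() is an invented helper. The idea is to read the shared count exactly once and then test that one snapshot for both "free" and "temporarily SWAP_MAP_BAD"; without the forced single load, the compiler may legally re-read swap_map[i] between the two tests and observe two different states.

#include <stdatomic.h>
#include <stdio.h>

#define SWAP_MAP_BAD 0x3f	/* kernel's marker for an unusable slot */

/* One usage count per swap slot, updated concurrently by other threads
 * (e.g. the discard worker flipping a cluster to/from SWAP_MAP_BAD). */
static _Atomic unsigned char swap_map[4096];

/* Returns 1 if the unuse loop should simply skip this slot for now. */
static int should_skip(unsigned long offset)
{
	/* Single snapshot of the shared byte. */
	unsigned char count = atomic_load_explicit(&swap_map[offset],
						   memory_order_relaxed);

	/* Free, or marked bad while its cluster is being discarded:
	 * either way there is nothing to unuse right now. */
	return count == 0 || count == SWAP_MAP_BAD;
}

int main(void)
{
	atomic_store(&swap_map[1], SWAP_MAP_BAD);
	printf("%d %d\n", should_skip(0), should_skip(1));	/* 1 1 */
	return 0;
}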
[include Hugh's patch 'swap: fix swapoff ENOMEMs from discard'] Signed-off-by: Shaohua Li Signed-off-by: Hugh Dickins Cc: Rik van Riel Cc: Minchan Kim Cc: Kyungmin Park Cc: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index dac47c66055c..98e52e373bd8 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -370,7 +370,8 @@ static void dec_cluster_info_page(struct swap_info_struct *p, * instead of free it immediately. The cluster will be freed * after discard. */ - if (p->flags & SWP_PAGE_DISCARD) { + if ((p->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == + (SWP_WRITEOK | SWP_PAGE_DISCARD)) { swap_cluster_schedule_discard(p, idx); return; } @@ -1288,7 +1289,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, else continue; } - count = si->swap_map[i]; + count = ACCESS_ONCE(si->swap_map[i]); if (count && swap_count(count) != SWAP_MAP_BAD) break; } @@ -1308,7 +1309,11 @@ int try_to_unuse(unsigned int type, bool frontswap, { struct swap_info_struct *si = swap_info[type]; struct mm_struct *start_mm; - unsigned char *swap_map; + volatile unsigned char *swap_map; /* swap_map is accessed without + * locking. Mark it as volatile + * to prevent compiler doing + * something odd. + */ unsigned char swcount; struct page *page; swp_entry_t entry; @@ -1359,7 +1364,15 @@ int try_to_unuse(unsigned int type, bool frontswap, * reused since sys_swapoff() already disabled * allocation from here, or alloc_page() failed. */ - if (!*swap_map) + swcount = *swap_map; + /* + * We don't hold lock here, so the swap entry could be + * SWAP_MAP_BAD (when the cluster is discarding). + * Instead of fail out, We can just skip the swap + * entry because swapoff will wait for discarding + * finish anyway. + */ + if (!swcount || swcount == SWAP_MAP_BAD) continue; retval = -ENOMEM; break; @@ -2543,6 +2556,16 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) goto unlock_out; count = p->swap_map[offset]; + + /* + * swapin_readahead() doesn't check if a swap entry is valid, so the + * swap entry could be SWAP_MAP_BAD. Check here with lock held. + */ + if (unlikely(swap_count(count) == SWAP_MAP_BAD)) { + err = -ENOENT; + goto unlock_out; + } + has_cache = count & SWAP_HAS_CACHE; count &= ~SWAP_HAS_CACHE; err = 0; From ebc2a1a69111eadfeda8487e577f1a5d42ef0dae Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 11 Sep 2013 14:20:32 -0700 Subject: [PATCH 049/303] swap: make cluster allocation per-cpu swap cluster allocation is to get better request merge to improve performance. But the cluster is shared globally, if multiple tasks are doing swap, this will cause interleave disk access. While multiple tasks swap is quite common, for example, each numa node has a kswapd thread doing swap and multiple threads/processes doing direct page reclaim. ioscheduler can't help too much here, because tasks don't send swapout IO down to block layer in the meantime. Block layer does merge some IOs, but a lot not, depending on how many tasks are doing swapout concurrently. In practice, I've seen a lot of small size IO in swapout workloads. We makes the cluster allocation per-cpu here. The interleave disk access issue goes away. All tasks swapout to their own cluster, so swapout will become sequential, which can be easily merged to big size IO. 
If one CPU can't get its per-cpu cluster (for example, there is no free cluster anymore in the swap), it will fall back to scanning swap_map. The CPU can still continue to swap. We don't need to recycle free swap entries of other CPUs. In my test (swap to a 2-disk raid0 partition), this improves swapout throughput by around 10%, and request size is increased significantly. How this impacts swap readahead is uncertain though. On one side, page reclaim always isolates and swaps several adjacent pages, and this will make page reclaim write the pages sequentially and benefit readahead. On the other side, several CPUs writing pages in an interleaved way means the pages don't live _sequentially_ but relatively _near_. In the per-cpu allocation case, if adjacent pages are written by different CPUs, they will live relatively _far_ apart. So how this impacts swap readahead depends on how many pages page reclaim isolates and swaps at one time. If the number is big, this patch will benefit swap readahead. Of course, this is about the sequential access pattern. The patch has no impact for a random access pattern, because the new cluster allocation algorithm is just for SSD. An alternative solution is organizing the swap layout to be per-mm instead of this per-cpu approach. In the per-mm layout, we allocate a disk range for each mm, so pages of one mm live adjacently on the swap disk. The per-mm layout has potential lock contention issues if multiple reclaimers are swapping pages from one mm. For a sequential workload, the per-mm layout is better for implementing swap readahead, because pages from the mm are adjacent on disk. But the per-cpu layout isn't very bad in this workload, as page reclaim always isolates and swaps several pages at one time; such pages will still live on disk sequentially and readahead can utilize this. For a random workload, the per-mm layout isn't beneficial for request merging, because it's quite possible pages from different mms are swapped out at the same time and IO can't be merged in the per-mm layout, while with the per-cpu layout we can merge requests from any mm. Considering that the random workload is more common in workloads with swap (and the per-cpu approach isn't too bad for the sequential workload either), I'm choosing the per-cpu layout. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Shaohua Li Cc: Rik van Riel Cc: Minchan Kim Cc: Kyungmin Park Cc: Hugh Dickins Cc: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 11 ++++ mm/swapfile.c | 125 +++++++++++++++++++++++++++++++------------ 2 files changed, 102 insertions(+), 34 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 8a3c4a1caa14..24db9142e93b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -198,6 +198,16 @@ struct swap_cluster_info { #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ #define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ +/* + * We assign a cluster to each CPU, so each CPU can allocate swap entry from + * its own cluster and swapout sequentially. The purpose is to optimize swapout + * throughput. + */ +struct percpu_cluster { + struct swap_cluster_info index; /* Current cluster index */ + unsigned int next; /* Likely next allocation offset */ +}; + /* + * The in-memory structure used to track swap areas.
*/ @@ -217,6 +227,7 @@ struct swap_info_struct { unsigned int inuse_pages; /* number of those currently in use */ unsigned int cluster_next; /* likely index for next allocation */ unsigned int cluster_nr; /* countdown to next cluster search */ + struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ struct swap_extent *curr_swap_extent; struct swap_extent first_swap_extent; struct block_device *bdev; /* swap device or bdev of swap file */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 98e52e373bd8..3963fc24fcc1 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -392,13 +392,78 @@ static void dec_cluster_info_page(struct swap_info_struct *p, * It's possible scan_swap_map() uses a free cluster in the middle of free * cluster list. Avoiding such abuse to avoid list corruption. */ -static inline bool scan_swap_map_recheck_cluster(struct swap_info_struct *si, +static bool +scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, unsigned long offset) { + struct percpu_cluster *percpu_cluster; + bool conflict; + offset /= SWAPFILE_CLUSTER; - return !cluster_is_null(&si->free_cluster_head) && + conflict = !cluster_is_null(&si->free_cluster_head) && offset != cluster_next(&si->free_cluster_head) && cluster_is_free(&si->cluster_info[offset]); + + if (!conflict) + return false; + + percpu_cluster = this_cpu_ptr(si->percpu_cluster); + cluster_set_null(&percpu_cluster->index); + return true; +} + +/* + * Try to get a swap entry from current cpu's swap entry pool (a cluster). This + * might involve allocating a new cluster for current CPU too. + */ +static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, + unsigned long *offset, unsigned long *scan_base) +{ + struct percpu_cluster *cluster; + bool found_free; + unsigned long tmp; + +new_cluster: + cluster = this_cpu_ptr(si->percpu_cluster); + if (cluster_is_null(&cluster->index)) { + if (!cluster_is_null(&si->free_cluster_head)) { + cluster->index = si->free_cluster_head; + cluster->next = cluster_next(&cluster->index) * + SWAPFILE_CLUSTER; + } else if (!cluster_is_null(&si->discard_cluster_head)) { + /* + * we don't have free cluster but have some clusters in + * discarding, do discard now and reclaim them + */ + swap_do_scheduled_discard(si); + *scan_base = *offset = si->cluster_next; + goto new_cluster; + } else + return; + } + + found_free = false; + + /* + * Other CPUs can use our cluster if they can't find a free cluster, + * check if there is still free entry in the cluster + */ + tmp = cluster->next; + while (tmp < si->max && tmp < (cluster_next(&cluster->index) + 1) * + SWAPFILE_CLUSTER) { + if (!si->swap_map[tmp]) { + found_free = true; + break; + } + tmp++; + } + if (!found_free) { + cluster_set_null(&cluster->index); + goto new_cluster; + } + cluster->next = tmp + 1; + *offset = tmp; + *scan_base = tmp; } static unsigned long scan_swap_map(struct swap_info_struct *si, @@ -423,41 +488,17 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, si->flags += SWP_SCANNING; scan_base = offset = si->cluster_next; + /* SSD algorithm */ + if (si->cluster_info) { + scan_swap_map_try_ssd_cluster(si, &offset, &scan_base); + goto checks; + } + if (unlikely(!si->cluster_nr--)) { if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { si->cluster_nr = SWAPFILE_CLUSTER - 1; goto checks; } -check_cluster: - if (!cluster_is_null(&si->free_cluster_head)) { - offset = cluster_next(&si->free_cluster_head) * - SWAPFILE_CLUSTER; - last_in_cluster = offset + SWAPFILE_CLUSTER - 1; - si->cluster_next = offset; - 
si->cluster_nr = SWAPFILE_CLUSTER - 1; - goto checks; - } else if (si->cluster_info) { - /* - * we don't have free cluster but have some clusters in - * discarding, do discard now and reclaim them - */ - if (!cluster_is_null(&si->discard_cluster_head)) { - si->cluster_nr = 0; - swap_do_scheduled_discard(si); - scan_base = offset = si->cluster_next; - if (!si->cluster_nr) - goto check_cluster; - si->cluster_nr--; - goto checks; - } - - /* - * Checking free cluster is fast enough, we can do the - * check every time - */ - si->cluster_nr = 0; - goto checks; - } spin_unlock(&si->lock); @@ -516,8 +557,10 @@ check_cluster: } checks: - if (scan_swap_map_recheck_cluster(si, offset)) - goto check_cluster; + if (si->cluster_info) { + while (scan_swap_map_ssd_cluster_conflict(si, offset)) + scan_swap_map_try_ssd_cluster(si, &offset, &scan_base); + } if (!(si->flags & SWP_WRITEOK)) goto no_page; if (!si->highest_bit) @@ -1884,6 +1927,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); frontswap_invalidate_area(type); mutex_unlock(&swapon_mutex); + free_percpu(p->percpu_cluster); + p->percpu_cluster = NULL; vfree(swap_map); vfree(cluster_info); vfree(frontswap_map); @@ -2403,6 +2448,16 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = -ENOMEM; goto bad_swap; } + p->percpu_cluster = alloc_percpu(struct percpu_cluster); + if (!p->percpu_cluster) { + error = -ENOMEM; + goto bad_swap; + } + for_each_possible_cpu(i) { + struct percpu_cluster *cluster; + cluster = per_cpu_ptr(p->percpu_cluster, i); + cluster_set_null(&cluster->index); + } } error = swap_cgroup_swapon(p->type, maxpages); @@ -2475,6 +2530,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = 0; goto out; bad_swap: + free_percpu(p->percpu_cluster); + p->percpu_cluster = NULL; if (inode && S_ISBLK(inode->i_mode) && p->bdev) { set_blocksize(p->bdev, p->old_block_size); blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); From b8af29418a0269d2c12f563add54a95cc19471fb Mon Sep 17 00:00:00 2001 From: Pintu Kumar Date: Wed, 11 Sep 2013 14:20:34 -0700 Subject: [PATCH 050/303] mm/page_alloc.c: fix coding style and spelling Fix all errors reported by checkpatch and some small spelling mistakes. Signed-off-by: Pintu Kumar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6cf157637df3..2ca3e9bd739c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -721,7 +721,8 @@ static bool free_pages_prepare(struct page *page, unsigned int order) return false; if (!PageHighMem(page)) { - debug_check_no_locks_freed(page_address(page),PAGE_SIZE<wait_table) return -ENOMEM; - for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) + for (i = 0; i < zone->wait_table_hash_nr_entries; ++i) init_waitqueue_head(zone->wait_table + i); return 0; @@ -4930,7 +4931,7 @@ static unsigned long __init early_calculate_totalpages(void) if (pages) node_set_state(nid, N_MEMORY); } - return totalpages; + return totalpages; } /* @@ -5047,7 +5048,7 @@ restart: /* * Some kernelcore has been met, update counts and * break if the kernelcore for this node has been - * satisified + * satisfied */ required_kernelcore -= min(required_kernelcore, size_pages); @@ -5061,7 +5062,7 @@ restart: * If there is still required_kernelcore, we do another pass with one * less node in the count. 
This will push zone_movable_pfn[nid] further * along on the nodes that still have memory until kernelcore is - * satisified + * satisfied */ usable_nodes--; if (usable_nodes && required_kernelcore > usable_nodes) @@ -5286,8 +5287,10 @@ void __init mem_init_print_info(const char *str) * 3) .rodata.* may be embedded into .text or .data sections. */ #define adj_init_size(start, end, size, pos, adj) \ - if (start <= pos && pos < end && size > adj) \ - size -= adj; + do { \ + if (start <= pos && pos < end && size > adj) \ + size -= adj; \ + } while (0) adj_init_size(__init_begin, __init_end, init_data_size, _sinittext, init_code_size); @@ -5570,7 +5573,7 @@ static void __meminit setup_per_zone_inactive_ratio(void) * we want it large (64MB max). But it is not linear, because network * bandwidth does not increase linearly with machine size. We use * - * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: + * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: * min_free_kbytes = sqrt(lowmem_kbytes * 16) * * which yields @@ -5614,11 +5617,11 @@ int __meminit init_per_zone_wmark_min(void) module_init(init_per_zone_wmark_min) /* - * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so + * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so * that we can call two helper functions whenever min_free_kbytes * changes. */ -int min_free_kbytes_sysctl_handler(ctl_table *table, int write, +int min_free_kbytes_sysctl_handler(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec(table, write, buffer, length, ppos); @@ -5682,8 +5685,8 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, /* * percpu_pagelist_fraction - changes the pcp->high for each zone on each - * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist - * can have before it gets flushed back to buddy allocator. + * cpu. It is the fraction of total pages in each zone that a hot per cpu + * pagelist can have before it gets flushed back to buddy allocator. */ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) @@ -5901,7 +5904,7 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, * This function checks whether pageblock includes unmovable pages or not. * If @count is not zero, it is okay to include less @count unmovable pages * - * PageLRU check wihtout isolation or lru_lock could race so that + * PageLRU check without isolation or lru_lock could race so that * MIGRATE_MOVABLE block might include unmovable pages. It means you can't * expect this function should be exact. */ From fef903efcf0cb9721f3f2da719daec9bbc26f12b Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Wed, 11 Sep 2013 14:20:35 -0700 Subject: [PATCH 051/303] mm/page_alloc.c: restructure free-page stealing code and fix a bug The free-page stealing code in __rmqueue_fallback() is somewhat hard to follow, and has an incredible amount of subtlety hidden inside! First off, there is a minor bug in the reporting of change-of-ownership of pageblocks. Under some conditions, we try to move up to 'pageblock_nr_pages' pages to the preferred allocation list. But we change the ownership of that pageblock to the preferred type only if we manage to successfully move at least half of that pageblock (or if page_group_by_mobility_disabled is set).
However, the current code ignores the latter part and sets the 'migratetype' variable to the preferred type, irrespective of whether we actually changed the pageblock migratetype of that block or not. So, the page_alloc_extfrag tracepoint can end up printing incorrect info (i.e., 'change_ownership' might be shown as 1 when it must have been 0). So fixing this involves moving the update of the 'migratetype' variable to the right place. But looking closer, we observe that the 'migratetype' variable is used subsequently for checks such as "is_migrate_cma()". Obviously the intent there is to check if the *fallback* type is MIGRATE_CMA, but since we already set the 'migratetype' variable to start_migratetype, we end up checking if the *preferred* type is MIGRATE_CMA!! To make things more interesting, this actually doesn't cause a bug in practice, because we never change *anything* if the fallback type is CMA. So, restructure the code in such a way that it is trivial to understand what is going on, and also fix the above mentioned bug. And while at it, also add a comment explaining the subtlety behind the migratetype used in the call to expand(). [akpm@linux-foundation.org: remove unneeded `inline', small coding-style fix] Signed-off-by: Srivatsa S. Bhat Cc: Mel Gorman Cc: Minchan Kim Cc: Cody P Schafer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 95 ++++++++++++++++++++++++++++++------------------- 1 file changed, 59 insertions(+), 36 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2ca3e9bd739c..b09ce5fe0cd2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1008,6 +1008,52 @@ static void change_pageblock_range(struct page *pageblock_page, } } +/* + * If breaking a large block of pages, move all free pages to the preferred + * allocation list. If falling back for a reclaimable kernel allocation, be + * more aggressive about taking ownership of free pages. + * + * On the other hand, never change migration type of MIGRATE_CMA pageblocks + * nor move CMA pages to different free lists. We don't want unmovable pages + * to be allocated from MIGRATE_CMA areas. + * + * Returns the new migratetype of the pageblock (or the same old migratetype + * if it was unchanged). 
+ */ +static int try_to_steal_freepages(struct zone *zone, struct page *page, + int start_type, int fallback_type) +{ + int current_order = page_order(page); + + if (is_migrate_cma(fallback_type)) + return fallback_type; + + /* Take ownership for orders >= pageblock_order */ + if (current_order >= pageblock_order) { + change_pageblock_range(page, current_order, start_type); + return start_type; + } + + if (current_order >= pageblock_order / 2 || + start_type == MIGRATE_RECLAIMABLE || + page_group_by_mobility_disabled) { + int pages; + + pages = move_freepages_block(zone, page, start_type); + + /* Claim the whole block if over half of it is free */ + if (pages >= (1 << (pageblock_order-1)) || + page_group_by_mobility_disabled) { + + set_pageblock_migratetype(page, start_type); + return start_type; + } + + } + + return fallback_type; +} + /* Remove an element from the buddy allocator from the fallback list */ static inline struct page * __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) @@ -1015,7 +1061,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) struct free_area *area; int current_order; struct page *page; - int migratetype, i; + int migratetype, new_type, i; /* Find the largest possible block of pages in the other list */ for (current_order = MAX_ORDER-1; current_order >= order; @@ -1035,51 +1081,28 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) struct page, lru); area->nr_free--; - /* - * If breaking a large block of pages, move all free - * pages to the preferred allocation list. If falling - * back for a reclaimable kernel allocation, be more - * aggressive about taking ownership of free pages - * - * On the other hand, never change migration - * type of MIGRATE_CMA pageblocks nor move CMA - * pages on different free lists. We don't - * want unmovable pages to be allocated from - * MIGRATE_CMA areas. - */ - if (!is_migrate_cma(migratetype) && - (current_order >= pageblock_order / 2 || - start_migratetype == MIGRATE_RECLAIMABLE || - page_group_by_mobility_disabled)) { - int pages; - pages = move_freepages_block(zone, page, - start_migratetype); - - /* Claim the whole block if over half of it is free */ - if (pages >= (1 << (pageblock_order-1)) || - page_group_by_mobility_disabled) - set_pageblock_migratetype(page, - start_migratetype); - - migratetype = start_migratetype; - } + new_type = try_to_steal_freepages(zone, page, + start_migratetype, + migratetype); /* Remove the page from the freelists */ list_del(&page->lru); rmv_page_order(page); - /* Take ownership for orders >= pageblock_order */ - if (current_order >= pageblock_order && - !is_migrate_cma(migratetype)) - change_pageblock_range(page, current_order, - start_migratetype); - + /* + * Borrow the excess buddy pages as well, irrespective + * of whether we stole freepages, or took ownership of + * the pageblock or not. + * + * Exception: When borrowing from MIGRATE_CMA, release + * the excess buddy pages to CMA itself. + */ expand(zone, page, order, current_order, area, is_migrate_cma(migratetype) ? migratetype : start_migratetype); trace_mm_page_alloc_extfrag(page, order, current_order, - start_migratetype, migratetype); + start_migratetype, new_type); return page; } From f92310c1877fc73470bdcd9228758fa3713c191b Mon Sep 17 00:00:00 2001 From: "Srivatsa S. 
Bhat" Date: Wed, 11 Sep 2013 14:20:36 -0700 Subject: [PATCH 052/303] mm/page_alloc.c: fix the value of fallback_migratetype in alloc_extfrag tracepoint() In the current code, the value of fallback_migratetype that is printed using the mm_page_alloc_extfrag tracepoint, is the value of the migratetype *after* it has been set to the preferred migratetype (if the ownership was changed). Obviously that wouldn't have been the original intent. (We already have a separate 'change_ownership' field to tell whether the ownership of the pageblock was changed from the fallback_migratetype to the preferred type.) The intent of the fallback_migratetype field is to show the migratetype from which we borrowed pages in order to satisfy the allocation request. So fix the code to print that value correctly. Signed-off-by: Srivatsa S. Bhat Cc: Mel Gorman Cc: Minchan Kim Cc: Cody P Schafer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/kmem.h | 10 +++++++--- mm/page_alloc.c | 5 +++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 6bc943ecb841..d0c613476620 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -268,11 +268,13 @@ TRACE_EVENT(mm_page_alloc_extfrag, TP_PROTO(struct page *page, int alloc_order, int fallback_order, - int alloc_migratetype, int fallback_migratetype), + int alloc_migratetype, int fallback_migratetype, + int change_ownership), TP_ARGS(page, alloc_order, fallback_order, - alloc_migratetype, fallback_migratetype), + alloc_migratetype, fallback_migratetype, + change_ownership), TP_STRUCT__entry( __field( struct page *, page ) @@ -280,6 +282,7 @@ TRACE_EVENT(mm_page_alloc_extfrag, __field( int, fallback_order ) __field( int, alloc_migratetype ) __field( int, fallback_migratetype ) + __field( int, change_ownership ) ), TP_fast_assign( @@ -288,6 +291,7 @@ TRACE_EVENT(mm_page_alloc_extfrag, __entry->fallback_order = fallback_order; __entry->alloc_migratetype = alloc_migratetype; __entry->fallback_migratetype = fallback_migratetype; + __entry->change_ownership = change_ownership; ), TP_printk("page=%p pfn=%lu alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d", @@ -299,7 +303,7 @@ TRACE_EVENT(mm_page_alloc_extfrag, __entry->alloc_migratetype, __entry->fallback_migratetype, __entry->fallback_order < pageblock_order, - __entry->alloc_migratetype == __entry->fallback_migratetype) + __entry->change_ownership) ); #endif /* _TRACE_KMEM_H */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b09ce5fe0cd2..2748fc6a9003 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1101,8 +1101,9 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) is_migrate_cma(migratetype) ? migratetype : start_migratetype); - trace_mm_page_alloc_extfrag(page, order, current_order, - start_migratetype, new_type); + trace_mm_page_alloc_extfrag(page, order, + current_order, start_migratetype, migratetype, + new_type == start_migratetype); return page; } From e2d0bd2b924d74d5e0d4f395f8f4730d125e198c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 11 Sep 2013 14:20:37 -0700 Subject: [PATCH 053/303] mm: kill one if loop in __free_pages_bootmem() We should not check loop+1 with loop end in loop body. Just duplicate two lines code to avoid it. That will help a bit when we have huge amount of pages on system with 16TiB memory. 
Signed-off-by: Yinghai Lu Cc: Mel Gorman Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2748fc6a9003..8c68ef13cefa 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -751,19 +751,19 @@ static void __free_pages_ok(struct page *page, unsigned int order) void __init __free_pages_bootmem(struct page *page, unsigned int order) { unsigned int nr_pages = 1 << order; + struct page *p = page; unsigned int loop; - prefetchw(page); - for (loop = 0; loop < nr_pages; loop++) { - struct page *p = &page[loop]; - - if (loop + 1 < nr_pages) - prefetchw(p + 1); + prefetchw(p); + for (loop = 0; loop < (nr_pages - 1); loop++, p++) { + prefetchw(p + 1); __ClearPageReserved(p); set_page_count(p, 0); } + __ClearPageReserved(p); + set_page_count(p, 0); - page_zone(page)->managed_pages += 1 << order; + page_zone(page)->managed_pages += nr_pages; set_page_refcounted(page); __free_pages(page, order); } From a8f531ebc33052642b4bd7b812eedf397108ce64 Mon Sep 17 00:00:00 2001 From: Libin Date: Wed, 11 Sep 2013 14:20:38 -0700 Subject: [PATCH 054/303] mm/huge_memory.c: fix potential NULL pointer dereference In collapse_huge_page() there is a race window between releasing the mmap_sem read lock and taking the mmap_sem write lock, so find_vma() may return NULL. So check the return value to avoid NULL pointer dereference. collapse_huge_page khugepaged_alloc_page up_read(&mm->mmap_sem) down_write(&mm->mmap_sem) vma = find_vma(mm, address) Signed-off-by: Libin Acked-by: Kirill A. Shutemov Reviewed-by: Wanpeng Li Reviewed-by: Michal Hocko Cc: # v3.0+ Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 8b7fc2025e04..963e14c0486f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2296,6 +2296,8 @@ static void collapse_huge_page(struct mm_struct *mm, goto out; vma = find_vma(mm, address); + if (!vma) + goto out; hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (address < hstart || address + HPAGE_PMD_SIZE > hend) From 892f795df1eb119b560a3ee5a1ca3f385a852e84 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 11 Sep 2013 14:20:39 -0700 Subject: [PATCH 055/303] mm: vmscan: fix numa reclaim balance problem in kswapd The way the page allocator interacts with kswapd creates aging imbalances, where the amount of time a userspace page gets in memory under reclaim pressure is dependent on which zone, which node the allocator took the page frame from. #1 fixes missed kswapd wakeups on NUMA systems, which lead to some nodes falling behind for a full reclaim cycle relative to the other nodes in the system #3 fixes an interaction where kswapd and a continuous stream of page allocations keep the preferred zone of a task between the high and low watermark (allocations succeed + kswapd does not go to sleep) indefinitely, completely underutilizing the lower zones and thrashing on the preferred zone These patches are the aging fairness part of the thrash-detection based file LRU balancing. Andrea recommended to submit them separately as they are bugfixes in their own right. 
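Before the numbers, a minimal model of the wakeup-condition change made by the first patch (stand-in struct and values, not kernel code; the real zone_balanced() additionally accounts for compaction's order-0 needs):

    #include <stdio.h>

    struct zone_model {
            long free_pages;
            long low_wmark;
            long high_wmark;
    };

    /* old condition: skip the wakeup once free pages creep above low */
    static int skip_wakeup_old(const struct zone_model *z)
    {
            return z->free_pages >= z->low_wmark;
    }

    /* new condition: keep waking kswapd until high is restored */
    static int skip_wakeup_new(const struct zone_model *z)
    {
            return z->free_pages >= z->high_wmark;
    }

    int main(void)
    {
            /* a zone hovering between the two watermarks */
            struct zone_model z = { 110, 100, 150 };

            printf("old skips wakeup: %d, new skips wakeup: %d\n",
                   skip_wakeup_old(&z), skip_wakeup_new(&z)); /* 1, 0 */
            return 0;
    }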
The following test ran a foreground workload (memcachetest) with background IO of various sizes on a 4 node 8G system (similar results were observed with single-node 4G systems): parallelio BAS FAIRALLO BASE FAIRALLOC Ops memcachetest-0M 5170.00 ( 0.00%) 5283.00 ( 2.19%) Ops memcachetest-791M 4740.00 ( 0.00%) 5293.00 ( 11.67%) Ops memcachetest-2639M 2551.00 ( 0.00%) 4950.00 ( 94.04%) Ops memcachetest-4487M 2606.00 ( 0.00%) 3922.00 ( 50.50%) Ops io-duration-0M 0.00 ( 0.00%) 0.00 ( 0.00%) Ops io-duration-791M 55.00 ( 0.00%) 18.00 ( 67.27%) Ops io-duration-2639M 235.00 ( 0.00%) 103.00 ( 56.17%) Ops io-duration-4487M 278.00 ( 0.00%) 173.00 ( 37.77%) Ops swaptotal-0M 0.00 ( 0.00%) 0.00 ( 0.00%) Ops swaptotal-791M 245184.00 ( 0.00%) 0.00 ( 0.00%) Ops swaptotal-2639M 468069.00 ( 0.00%) 108778.00 ( 76.76%) Ops swaptotal-4487M 452529.00 ( 0.00%) 76623.00 ( 83.07%) Ops swapin-0M 0.00 ( 0.00%) 0.00 ( 0.00%) Ops swapin-791M 108297.00 ( 0.00%) 0.00 ( 0.00%) Ops swapin-2639M 169537.00 ( 0.00%) 50031.00 ( 70.49%) Ops swapin-4487M 167435.00 ( 0.00%) 34178.00 ( 79.59%) Ops minorfaults-0M 1518666.00 ( 0.00%) 1503993.00 ( 0.97%) Ops minorfaults-791M 1676963.00 ( 0.00%) 1520115.00 ( 9.35%) Ops minorfaults-2639M 1606035.00 ( 0.00%) 1799717.00 (-12.06%) Ops minorfaults-4487M 1612118.00 ( 0.00%) 1583825.00 ( 1.76%) Ops majorfaults-0M 6.00 ( 0.00%) 0.00 ( 0.00%) Ops majorfaults-791M 13836.00 ( 0.00%) 10.00 ( 99.93%) Ops majorfaults-2639M 22307.00 ( 0.00%) 6490.00 ( 70.91%) Ops majorfaults-4487M 21631.00 ( 0.00%) 4380.00 ( 79.75%) BAS FAIRALLO BASE FAIRALLOC User 287.78 460.97 System 2151.67 3142.51 Elapsed 9737.00 8879.34 BAS FAIRALLO BASE FAIRALLOC Minor Faults 53721925 57188551 Major Faults 392195 15157 Swap Ins 2994854 112770 Swap Outs 4907092 134982 Direct pages scanned 0 41824 Kswapd pages scanned 32975063 8128269 Kswapd pages reclaimed 6323069 7093495 Direct pages reclaimed 0 41824 Kswapd efficiency 19% 87% Kswapd velocity 3386.573 915.414 Direct efficiency 100% 100% Direct velocity 0.000 4.710 Percentage direct scans 0% 0% Zone normal velocity 2011.338 550.661 Zone dma32 velocity 1365.623 369.221 Zone dma velocity 9.612 0.242 Page writes by reclaim 18732404.000 614807.000 Page writes file 13825312 479825 Page writes anon 4907092 134982 Page reclaim immediate 85490 5647 Sector Reads 12080532 483244 Sector Writes 88740508 65438876 Page rescued immediate 0 0 Slabs scanned 82560 12160 Direct inode steals 0 0 Kswapd inode steals 24401 40013 Kswapd skipped wait 0 0 THP fault alloc 6 8 THP collapse alloc 5481 5812 THP splits 75 22 THP fault fallback 0 0 THP collapse fail 0 0 Compaction stalls 0 54 Compaction success 0 45 Compaction failures 0 9 Page migrate success 881492 82278 Page migrate failure 0 0 Compaction pages isolated 0 60334 Compaction migrate scanned 0 53505 Compaction free scanned 0 1537605 Compaction cost 914 86 NUMA PTE updates 46738231 41988419 NUMA hint faults 31175564 24213387 NUMA hint local faults 10427393 6411593 NUMA pages migrated 881492 55344 AutoNUMA cost 156221 121361 The overall runtime was reduced, throughput for both the foreground workload as well as the background IO improved, major faults, swapping and reclaim activity shrunk significantly, reclaim efficiency more than quadrupled. This patch: When the page allocator fails to get a page from all zones in its given zonelist, it wakes up the per-node kswapds for all zones that are at their low watermark. 
However, with a system under load the free pages in a zone can fluctuate enough that the allocation fails but the kswapd wakeup is also skipped while the zone is still really close to the low watermark. When one node misses a wakeup like this, it won't be aged before all the other node's zones are down to their low watermarks again. And skipping a full aging cycle is an obvious fairness problem. Kswapd runs until the high watermarks are restored, so it should also be woken when the high watermarks are not met. This ages nodes more equally and creates a safety margin for the page counter fluctuation. By using zone_balanced(), it will now check, in addition to the watermark, if compaction requires more order-0 pages to create a higher order page. Signed-off-by: Johannes Weiner Cc: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Paul Bolle Tested-by: Zlatko Calusic Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 2cff0d491c6d..758540d3ca83 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3237,7 +3237,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) } if (!waitqueue_active(&pgdat->kswapd_wait)) return; - if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) + if (zone_balanced(zone, order, 0, 0)) return; trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); From e085dbc52fad8d79fa2245339c84bf3ef0b3a802 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 11 Sep 2013 14:20:46 -0700 Subject: [PATCH 056/303] mm: page_alloc: rearrange watermark checking in get_page_from_freelist Allocations that do not have to respect the watermarks are rare high-priority events. Reorder the code such that per-zone dirty limits and future checks important only to regular page allocations are ignored in these extraordinary situations. 
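A compressed sketch of the resulting check order (stand-in flag values and returns, not the kernel function): the rare no-watermark case branches out before the dirty-limit and watermark logic is even reached:

    #include <stdio.h>

    #define ALLOC_WMARK_LOW         0x01
    #define ALLOC_NO_WATERMARKS     0x04

    static int zone_usable(int alloc_flags, long free, long mark, int dirty_ok)
    {
            /* rare high-priority allocations skip every regular check */
            if (alloc_flags & ALLOC_NO_WATERMARKS)
                    return 1;               /* "goto try_this_zone" */

            if (!dirty_ok)
                    return 0;               /* "goto this_zone_full" */

            return free >= mark;            /* regular watermark test */
    }

    int main(void)
    {
            printf("%d\n", zone_usable(ALLOC_NO_WATERMARKS, 0, 100, 0)); /* 1 */
            printf("%d\n", zone_usable(ALLOC_WMARK_LOW, 50, 100, 1));    /* 0 */
            return 0;
    }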
Signed-off-by: Johannes Weiner Cc: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Paul Bolle Tested-by: Zlatko Calusic Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8c68ef13cefa..9884aa0f233a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1889,12 +1889,17 @@ zonelist_scan: */ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) { + unsigned long mark; + if (IS_ENABLED(CONFIG_NUMA) && zlc_active && !zlc_zone_worth_trying(zonelist, z, allowednodes)) continue; if ((alloc_flags & ALLOC_CPUSET) && !cpuset_zone_allowed_softwall(zone, gfp_mask)) continue; + BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); + if (alloc_flags & ALLOC_NO_WATERMARKS) + goto try_this_zone; /* * When allocating a page cache page for writing, we * want to get it from a zone that is within its dirty @@ -1925,16 +1930,11 @@ zonelist_scan: (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) goto this_zone_full; - BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); - if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { - unsigned long mark; + mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; + if (!zone_watermark_ok(zone, order, mark, + classzone_idx, alloc_flags)) { int ret; - mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; - if (zone_watermark_ok(zone, order, mark, - classzone_idx, alloc_flags)) - goto try_this_zone; - if (IS_ENABLED(CONFIG_NUMA) && !did_zlc_setup && nr_online_nodes > 1) { /* From 81c0a2bb515fd4daae8cab64352877480792b515 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 11 Sep 2013 14:20:47 -0700 Subject: [PATCH 057/303] mm: page_alloc: fair zone allocator policy Each zone that holds userspace pages of one workload must be aged at a speed proportional to the zone size. Otherwise, the time an individual page gets to stay in memory depends on the zone it happened to be allocated in. Asymmetry in the zone aging creates rather unpredictable aging behavior and results in the wrong pages being reclaimed, activated etc. But exactly this happens right now because of the way the page allocator and kswapd interact. The page allocator uses per-node lists of all zones in the system, ordered by preference, when allocating a new page. When the first iteration does not yield any results, kswapd is woken up and the allocator retries. Due to the way kswapd reclaims zones below the high watermark while a zone can be allocated from when it is above the low watermark, the allocator may keep kswapd running while kswapd reclaim ensures that the page allocator can keep allocating from the first zone in the zonelist for extended periods of time. Meanwhile the other zones rarely see new allocations and thus get aged much slower in comparison. The result is that the occasional page placed in lower zones gets relatively more time in memory, even gets promoted to the active list after its peers have long been evicted. Meanwhile, the bulk of the working set may be thrashing on the preferred zone even though there may be significant amounts of memory available in the lower zones. Even the most basic test -- repeatedly reading a file slightly bigger than memory -- shows how broken the zone aging is. 
In this scenario, no single page should be able stay in memory long enough to get referenced twice and activated, but activation happens in spades: $ grep active_file /proc/zoneinfo nr_inactive_file 0 nr_active_file 0 nr_inactive_file 0 nr_active_file 8 nr_inactive_file 1582 nr_active_file 11994 $ cat data data data data >/dev/null $ grep active_file /proc/zoneinfo nr_inactive_file 0 nr_active_file 70 nr_inactive_file 258753 nr_active_file 443214 nr_inactive_file 149793 nr_active_file 12021 Fix this with a very simple round robin allocator. Each zone is allowed a batch of allocations that is proportional to the zone's size, after which it is treated as full. The batch counters are reset when all zones have been tried and the allocator enters the slowpath and kicks off kswapd reclaim. Allocation and reclaim is now fairly spread out to all available/allowable zones: $ grep active_file /proc/zoneinfo nr_inactive_file 0 nr_active_file 0 nr_inactive_file 174 nr_active_file 4865 nr_inactive_file 53 nr_active_file 860 $ cat data data data data >/dev/null $ grep active_file /proc/zoneinfo nr_inactive_file 0 nr_active_file 0 nr_inactive_file 666622 nr_active_file 4988 nr_inactive_file 190969 nr_active_file 937 When zone_reclaim_mode is enabled, allocations will now spread out to all zones on the local node, not just the first preferred zone (which on a 4G node might be a tiny Normal zone). Signed-off-by: Johannes Weiner Acked-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Paul Bolle Cc: Zlatko Calusic Tested-by: Kevin Hilman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 1 + mm/page_alloc.c | 72 ++++++++++++++++++++++++++++++++++++------ mm/vmstat.c | 1 + 3 files changed, 64 insertions(+), 10 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index af4a3b77a8de..ac1ea796ec0f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -105,6 +105,7 @@ struct zone_padding { enum zone_stat_item { /* First 128 byte cacheline (assuming 64 bit words) */ NR_FREE_PAGES, + NR_ALLOC_BATCH, NR_LRU_BASE, NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */ NR_ACTIVE_ANON, /* " " " " " */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9884aa0f233a..544d19d681a2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1551,6 +1551,7 @@ again: get_pageblock_migratetype(page)); } + __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(preferred_zone, zone, gfp_flags); local_irq_restore(flags); @@ -1817,6 +1818,11 @@ static void zlc_clear_zones_full(struct zonelist *zonelist) bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); } +static bool zone_local(struct zone *local_zone, struct zone *zone) +{ + return node_distance(local_zone->node, zone->node) == LOCAL_DISTANCE; +} + static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); @@ -1854,6 +1860,11 @@ static void zlc_clear_zones_full(struct zonelist *zonelist) { } +static bool zone_local(struct zone *local_zone, struct zone *zone) +{ + return true; +} + static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { return true; @@ -1900,6 +1911,26 @@ zonelist_scan: BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); if (alloc_flags & ALLOC_NO_WATERMARKS) goto try_this_zone; + /* + * Distribute pages in proportion to the individual + * zone size to ensure fair page aging. 
The zone a + * page was allocated in should have no effect on the + * time the page has in memory before being reclaimed. + * + * When zone_reclaim_mode is enabled, try to stay in + * local zones in the fastpath. If that fails, the + * slowpath is entered, which will do another pass + * starting with the local zones, but ultimately fall + * back to remote zones that do not partake in the + * fairness round-robin cycle of this zonelist. + */ + if (alloc_flags & ALLOC_WMARK_LOW) { + if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) + continue; + if (zone_reclaim_mode && + !zone_local(preferred_zone, zone)) + continue; + } /* * When allocating a page cache page for writing, we * want to get it from a zone that is within its dirty @@ -2346,16 +2377,30 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, return page; } -static inline -void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, - enum zone_type high_zoneidx, - enum zone_type classzone_idx) +static void prepare_slowpath(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, + enum zone_type high_zoneidx, + struct zone *preferred_zone) { struct zoneref *z; struct zone *zone; - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) - wakeup_kswapd(zone, order, classzone_idx); + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + if (!(gfp_mask & __GFP_NO_KSWAPD)) + wakeup_kswapd(zone, order, zone_idx(preferred_zone)); + /* + * Only reset the batches of zones that were actually + * considered in the fast path, we don't want to + * thrash fairness information for zones that are not + * actually part of this zonelist's round-robin cycle. + */ + if (zone_reclaim_mode && !zone_local(preferred_zone, zone)) + continue; + mod_zone_page_state(zone, NR_ALLOC_BATCH, + high_wmark_pages(zone) - + low_wmark_pages(zone) - + zone_page_state(zone, NR_ALLOC_BATCH)); + } } static inline int @@ -2451,9 +2496,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; restart: - if (!(gfp_mask & __GFP_NO_KSWAPD)) - wake_all_kswapd(order, zonelist, high_zoneidx, - zone_idx(preferred_zone)); + prepare_slowpath(gfp_mask, order, zonelist, + high_zoneidx, preferred_zone); /* * OK, we're below the kswapd watermark and have kicked background @@ -4753,8 +4797,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, spin_lock_init(&zone->lru_lock); zone_seqlock_init(zone); zone->zone_pgdat = pgdat; - zone_pcp_init(zone); + + /* For bootup, initialized properly in watermark setup */ + mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages); + lruvec_init(&zone->lruvec); if (!size) continue; @@ -5525,6 +5572,11 @@ static void __setup_per_zone_wmarks(void) zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); + __mod_zone_page_state(zone, NR_ALLOC_BATCH, + high_wmark_pages(zone) - + low_wmark_pages(zone) - + zone_page_state(zone, NR_ALLOC_BATCH)); + setup_zone_migrate_reserve(zone); spin_unlock_irqrestore(&zone->lock, flags); } diff --git a/mm/vmstat.c b/mm/vmstat.c index ca06e9653827..8a8da1f9b044 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -703,6 +703,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, const char * const vmstat_text[] = { /* Zoned VM counters */ "nr_free_pages", + "nr_alloc_batch", "nr_inactive_anon", "nr_active_anon", "nr_inactive_file", From 72457c0a05ed06f978d3a8a7c9d5ad527db88b4c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 11 Sep 2013 14:20:49 
-0700 Subject: [PATCH 058/303] mm: revert "page-writeback.c: subtract min_free_kbytes from dirtyable memory" This reverts commit 75f7ad8e043d. It was the result of a problem observed with a 3.2 kernel and merged in 3.9, while the issue had been resolved upstream in 3.3 (commit ab8fabd46f81: "mm: exclude reserved pages from dirtyable memory"). The "reserved pages" are a superset of min_free_kbytes, thus this change is redundant and confusing. Revert it. Signed-off-by: Johannes Weiner Cc: Paul Szabo Cc: Rik van Riel Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3f0c895c71fe..d374b29296dd 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -241,9 +241,6 @@ static unsigned long global_dirtyable_memory(void) if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); - /* Subtract min_free_kbytes */ - x -= min_t(unsigned long, x, min_free_kbytes >> (PAGE_SHIFT - 10)); - return x + 1; /* Ensure that we never return 0 */ } From 9966c4bbb110003ee218c5c4df583041b57027c4 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:20:50 -0700 Subject: [PATCH 059/303] mm, hugetlb: move up the code which checks availability of free huge pages At this point we are holding the hugetlb_lock, so the hstate values can't change. If we don't have any usable free huge page at this point, we don't need to proceed with the rest of the processing. So move this check up. Signed-off-by: Joonsoo Kim Acked-by: Michal Hocko Reviewed-by: Wanpeng Li Reviewed-by: Aneesh Kumar K.V Cc: Aneesh Kumar K.V Acked-by: Hillf Danton Cc: Naoya Horiguchi Cc: Rik van Riel Cc: Mel Gorman Cc: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Davidlohr Bueso Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6e514831bda5..a87903578810 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -539,10 +539,6 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, struct zoneref *z; unsigned int cpuset_mems_cookie; -retry_cpuset: - cpuset_mems_cookie = get_mems_allowed(); - zonelist = huge_zonelist(vma, address, - htlb_alloc_mask, &mpol, &nodemask); /* * A child process with MAP_PRIVATE mappings created by their parent * have no page reserves. This check ensures that reservations are @@ -556,6 +552,11 @@ retry_cpuset: if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) goto err; +retry_cpuset: + cpuset_mems_cookie = get_mems_allowed(); + zonelist = huge_zonelist(vma, address, + htlb_alloc_mask, &mpol, &nodemask); + for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) { @@ -574,7 +575,6 @@ retry_cpuset: return page; err: - mpol_cond_put(mpol); return NULL; } From c748c26294600d7e4f0d1f2f61449d3a740f102f Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:20:57 -0700 Subject: [PATCH 060/303] mm, hugetlb: trivial commenting fix The name of the mutex written in the comment is wrong. Fix it.
Signed-off-by: Joonsoo Kim Acked-by: Michal Hocko Acked-by: Hillf Danton Reviewed-by: Aneesh Kumar K.V Cc: Aneesh Kumar K.V Cc: Naoya Horiguchi Cc: Wanpeng Li Cc: Rik van Riel Cc: Mel Gorman Cc: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Davidlohr Bueso Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a87903578810..bfca1b00b09b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -135,9 +135,9 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) * across the pages in a mapping. * * The region data structures are protected by a combination of the mmap_sem - * and the hugetlb_instantion_mutex. To access or modify a region the caller + * and the hugetlb_instantiation_mutex. To access or modify a region the caller * must either hold the mmap_sem for write, or the mmap_sem for read and - * the hugetlb_instantiation mutex: + * the hugetlb_instantiation_mutex: * * down_write(&mm->mmap_sem); * or From 81a6fcae3ff3f6af1c9d7e31499e68fda2b3f58d Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:20:58 -0700 Subject: [PATCH 061/303] mm, hugetlb: clean-up alloc_huge_page() Unify successful allocation paths to make the code more readable. There are no functional changes. Signed-off-by: Joonsoo Kim Acked-by: Michal Hocko Reviewed-by: Wanpeng Li Reviewed-by: Aneesh Kumar K.V Cc: Hillf Danton Cc: Naoya Horiguchi Cc: Wanpeng Li Cc: Rik van Riel Cc: Mel Gorman Cc: "Aneesh Kumar K.V" Cc: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Davidlohr Bueso Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bfca1b00b09b..a698d40d1c3e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1166,12 +1166,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, } spin_lock(&hugetlb_lock); page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); - if (page) { - /* update page cgroup details */ - hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), - h_cg, page); - spin_unlock(&hugetlb_lock); - } else { + if (!page) { spin_unlock(&hugetlb_lock); page = alloc_buddy_huge_page(h, NUMA_NO_NODE); if (!page) { @@ -1182,11 +1177,11 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, return ERR_PTR(-ENOSPC); } spin_lock(&hugetlb_lock); - hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), - h_cg, page); list_move(&page->lru, &h->hugepage_activelist); - spin_unlock(&hugetlb_lock); + /* Fall through */ } + hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); + spin_unlock(&hugetlb_lock); set_page_private(page, (unsigned long)spool); From b2261026825ed34066b24069359d118098bb1876 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:21:00 -0700 Subject: [PATCH 062/303] mm, hugetlb: fix and clean-up node iteration code to alloc or free The current node iteration code has a minor problem: it does one more node rotation if the allocation fails. For example, if we start allocating at node 0, we stop iterating at node 0, but we then start at node 1 for the next allocation. I introduce new macros "for_each_node_mask_to_[alloc|free]" and fix and clean up the node iteration code for alloc and free. This makes the code more understandable.
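The iteration idiom these macros rely on can be shown in a self-contained form (a simplified round-robin over a fixed node count, with hypothetical helper names). Note how `|| 1` keeps the for-condition true even when the returned node is 0:

    #include <stdio.h>

    #define NR_NODES 4

    /* advance the saved position and return the node to try next */
    static int next_node_rr(int *start)
    {
            int node = *start;

            *start = (*start + 1) % NR_NODES;
            return node;
    }

    /* visit exactly NR_NODES nodes; "|| 1" tolerates node == 0 */
    #define for_each_node_rr(nr, node, start)                       \
            for (nr = NR_NODES;                                     \
                 nr > 0 && ((node = next_node_rr(start)) || 1);     \
                 nr--)

    int main(void)
    {
            int start = 2, nr, node;

            for_each_node_rr(nr, node, &start)
                    printf("try node %d\n", node);  /* 2 3 0 1 */
            return 0;
    }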
Signed-off-by: Joonsoo Kim Reviewed-by: Aneesh Kumar K.V Acked-by: Hillf Danton Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Wanpeng Li Cc: Rik van Riel Cc: Mel Gorman Cc: "Aneesh Kumar K.V" Cc: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Davidlohr Bueso Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 143 ++++++++++++++++++++++----------------------------- 1 file changed, 61 insertions(+), 82 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a698d40d1c3e..08b7595fe3c1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -772,33 +772,6 @@ static int hstate_next_node_to_alloc(struct hstate *h, return nid; } -static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) -{ - struct page *page; - int start_nid; - int next_nid; - int ret = 0; - - start_nid = hstate_next_node_to_alloc(h, nodes_allowed); - next_nid = start_nid; - - do { - page = alloc_fresh_huge_page_node(h, next_nid); - if (page) { - ret = 1; - break; - } - next_nid = hstate_next_node_to_alloc(h, nodes_allowed); - } while (next_nid != start_nid); - - if (ret) - count_vm_event(HTLB_BUDDY_PGALLOC); - else - count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); - - return ret; -} - /* * helper for free_pool_huge_page() - return the previously saved * node ["this node"] from which to free a huge page. Advance the @@ -817,6 +790,40 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) return nid; } +#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \ + for (nr_nodes = nodes_weight(*mask); \ + nr_nodes > 0 && \ + ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \ + nr_nodes--) + +#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ + for (nr_nodes = nodes_weight(*mask); \ + nr_nodes > 0 && \ + ((node = hstate_next_node_to_free(hs, mask)) || 1); \ + nr_nodes--) + +static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) +{ + struct page *page; + int nr_nodes, node; + int ret = 0; + + for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { + page = alloc_fresh_huge_page_node(h, node); + if (page) { + ret = 1; + break; + } + } + + if (ret) + count_vm_event(HTLB_BUDDY_PGALLOC); + else + count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); + + return ret; +} + /* * Free huge page from pool from next node to free. * Attempt to keep persistent huge pages more or less @@ -826,36 +833,31 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, bool acct_surplus) { - int start_nid; - int next_nid; + int nr_nodes, node; int ret = 0; - start_nid = hstate_next_node_to_free(h, nodes_allowed); - next_nid = start_nid; - - do { + for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { /* * If we're returning unused surplus pages, only examine * nodes with surplus pages. 
*/ - if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) && - !list_empty(&h->hugepage_freelists[next_nid])) { + if ((!acct_surplus || h->surplus_huge_pages_node[node]) && + !list_empty(&h->hugepage_freelists[node])) { struct page *page = - list_entry(h->hugepage_freelists[next_nid].next, + list_entry(h->hugepage_freelists[node].next, struct page, lru); list_del(&page->lru); h->free_huge_pages--; - h->free_huge_pages_node[next_nid]--; + h->free_huge_pages_node[node]--; if (acct_surplus) { h->surplus_huge_pages--; - h->surplus_huge_pages_node[next_nid]--; + h->surplus_huge_pages_node[node]--; } update_and_free_page(h, page); ret = 1; break; } - next_nid = hstate_next_node_to_free(h, nodes_allowed); - } while (next_nid != start_nid); + } return ret; } @@ -1192,14 +1194,12 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, int __weak alloc_bootmem_huge_page(struct hstate *h) { struct huge_bootmem_page *m; - int nr_nodes = nodes_weight(node_states[N_MEMORY]); + int nr_nodes, node; - while (nr_nodes) { + for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { void *addr; - addr = __alloc_bootmem_node_nopanic( - NODE_DATA(hstate_next_node_to_alloc(h, - &node_states[N_MEMORY])), + addr = __alloc_bootmem_node_nopanic(NODE_DATA(node), huge_page_size(h), huge_page_size(h), 0); if (addr) { @@ -1211,7 +1211,6 @@ int __weak alloc_bootmem_huge_page(struct hstate *h) m = addr; goto found; } - nr_nodes--; } return 0; @@ -1350,48 +1349,28 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count, static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, int delta) { - int start_nid, next_nid; - int ret = 0; + int nr_nodes, node; VM_BUG_ON(delta != -1 && delta != 1); - if (delta < 0) - start_nid = hstate_next_node_to_alloc(h, nodes_allowed); - else - start_nid = hstate_next_node_to_free(h, nodes_allowed); - next_nid = start_nid; - - do { - int nid = next_nid; - if (delta < 0) { - /* - * To shrink on this node, there must be a surplus page - */ - if (!h->surplus_huge_pages_node[nid]) { - next_nid = hstate_next_node_to_alloc(h, - nodes_allowed); - continue; - } + if (delta < 0) { + for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { + if (h->surplus_huge_pages_node[node]) + goto found; } - if (delta > 0) { - /* - * Surplus cannot exceed the total number of pages - */ - if (h->surplus_huge_pages_node[nid] >= - h->nr_huge_pages_node[nid]) { - next_nid = hstate_next_node_to_free(h, - nodes_allowed); - continue; - } + } else { + for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { + if (h->surplus_huge_pages_node[node] < + h->nr_huge_pages_node[node]) + goto found; } + } + return 0; - h->surplus_huge_pages += delta; - h->surplus_huge_pages_node[nid] += delta; - ret = 1; - break; - } while (next_nid != start_nid); - - return ret; +found: + h->surplus_huge_pages += delta; + h->surplus_huge_pages_node[node] += delta; + return 1; } #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) From c0d934ba278935fa751057091fe4a7c02d814f68 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:21:02 -0700 Subject: [PATCH 063/303] mm, hugetlb: remove redundant list_empty check in gather_surplus_pages() If list is empty, list_for_each_entry_safe() doesn't do anything. So, this check is redundant. Remove it. 
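The property relied on here is easy to confirm in userspace with a minimal open-coded version of the safe-iteration loop (modeled on include/linux/list.h, heavily simplified):

    #include <stdio.h>

    struct list_head { struct list_head *next, *prev; };

    #define LIST_HEAD_INIT(name) { &(name), &(name) }

    int main(void)
    {
            struct list_head empty = LIST_HEAD_INIT(empty);
            struct list_head *pos, *n;
            int iterations = 0;

            /* list_for_each_safe on an empty list: the body never runs */
            for (pos = empty.next, n = pos->next; pos != &empty;
                 pos = n, n = pos->next)
                    iterations++;

            printf("iterations on empty list: %d\n", iterations); /* 0 */
            return 0;
    }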
Signed-off-by: Joonsoo Kim Acked-by: Michal Hocko Reviewed-by: Wanpeng Li Reviewed-by: Aneesh Kumar K.V Acked-by: Hillf Danton Cc: Naoya Horiguchi Cc: Wanpeng Li Cc: Rik van Riel Cc: Mel Gorman Cc: "Aneesh Kumar K.V" Cc: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Davidlohr Bueso Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 08b7595fe3c1..a13be48b818b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1037,11 +1037,8 @@ free: spin_unlock(&hugetlb_lock); /* Free unnecessary surplus pages to the buddy allocator */ - if (!list_empty(&surplus_list)) { - list_for_each_entry_safe(page, tmp, &surplus_list, lru) { - put_page(page); - } - } + list_for_each_entry_safe(page, tmp, &surplus_list, lru) + put_page(page); spin_lock(&hugetlb_lock); return ret; From 37a2140dc2145a6f154172286944a1861e978dfd Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:21:04 -0700 Subject: [PATCH 064/303] mm, hugetlb: do not use a page in page cache for cow optimization Currently, we use a page in the page cache with a map count of 1 for the COW optimization. If we find this condition, we don't allocate a new page and copy the contents; instead, we map this page directly. This may introduce a problem where writing to a private mapping overwrites the hugetlb file directly. You can reproduce this situation with the following code (a self-contained version appears below). size = 20 * MB; flag = MAP_SHARED; p = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, fd, 0); if (p == MAP_FAILED) { fprintf(stderr, "mmap() failed: %s\n", strerror(errno)); return -1; } p[0] = 's'; fprintf(stdout, "BEFORE STEAL PRIVATE WRITE: %c\n", p[0]); munmap(p, size); flag = MAP_PRIVATE; p = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, fd, 0); if (p == MAP_FAILED) { fprintf(stderr, "mmap() failed: %s\n", strerror(errno)); } p[0] = 'c'; munmap(p, size); flag = MAP_SHARED; p = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, fd, 0); if (p == MAP_FAILED) { fprintf(stderr, "mmap() failed: %s\n", strerror(errno)); return -1; } fprintf(stdout, "AFTER STEAL PRIVATE WRITE: %c\n", p[0]); munmap(p, size); We can see "AFTER STEAL PRIVATE WRITE: c", not "AFTER STEAL PRIVATE WRITE: s". If we turn this optimization off for pages in the page cache, the problem disappears. So, I change the trigger condition of the optimization: if the page is not an anonymous page (PageAnon), we skip the optimization. This turns the optimization off for page cache pages.
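For convenience, here is a fleshed-out, self-contained version of the reproducer above (a sketch: the includes, main(), error handling, and the hugetlbfs file path are assumptions; it needs a hugetlbfs mount with enough free 2MB huge pages):

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/mman.h>

    #define MB (1024UL * 1024UL)

    int main(void)
    {
            /* assumed hugetlbfs mount point; adjust to the local setup */
            int fd = open("/mnt/huge/cow-test", O_CREAT | O_RDWR, 0600);
            size_t size = 20 * MB;
            char *p;

            if (fd < 0) { perror("open"); return 1; }

            p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
            if (p == MAP_FAILED) { perror("mmap shared"); return 1; }
            p[0] = 's';
            printf("BEFORE STEAL PRIVATE WRITE: %c\n", p[0]);
            munmap(p, size);

            p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
            if (p == MAP_FAILED) { perror("mmap private"); return 1; }
            p[0] = 'c';     /* a private write must not reach the file */
            munmap(p, size);

            p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
            if (p == MAP_FAILED) { perror("mmap shared"); return 1; }
            /* prints 's' on a fixed kernel, 'c' on a broken one */
            printf("AFTER STEAL PRIVATE WRITE: %c\n", p[0]);
            munmap(p, size);
            close(fd);
            return 0;
    }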
Signed-off-by: Joonsoo Kim Acked-by: Michal Hocko Reviewed-by: Wanpeng Li Reviewed-by: Naoya Horiguchi Cc: Aneesh Kumar K.V Acked-by: Hillf Danton Cc: Rik van Riel Cc: Mel Gorman Cc: "Aneesh Kumar K.V" Cc: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Davidlohr Bueso Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a13be48b818b..da027a3307af 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2528,7 +2528,6 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, { struct hstate *h = hstate_vma(vma); struct page *old_page, *new_page; - int avoidcopy; int outside_reserve = 0; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ @@ -2538,10 +2537,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, retry_avoidcopy: /* If no-one else is actually using this page, avoid the copy * and just make the page writable */ - avoidcopy = (page_mapcount(old_page) == 1); - if (avoidcopy) { - if (PageAnon(old_page)) - page_move_anon_rmap(old_page, vma, address); + if (page_mapcount(old_page) == 1 && PageAnon(old_page)) { + page_move_anon_rmap(old_page, vma, address); set_huge_ptep_writable(vma, address, ptep); return 0; } From 72231b03ccf126ca04fba8359998ef7dfd195577 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:21:06 -0700 Subject: [PATCH 065/303] mm, hugetlb: add VM_NORESERVE check in vma_has_reserves() If we map the region with MAP_NORESERVE and MAP_SHARED, we skip the reserve accounting, and eventually we are not guaranteed to get a huge page at fault time. With the following example code, you can easily reproduce this situation (a self-contained version appears below). Assume 2MB, nr_hugepages = 100 fd = hugetlbfs_unlinked_fd(); if (fd < 0) return 1; size = 200 * MB; flag = MAP_SHARED; p = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, fd, 0); if (p == MAP_FAILED) { fprintf(stderr, "mmap() failed: %s\n", strerror(errno)); return -1; } size = 2 * MB; flag = MAP_ANONYMOUS | MAP_SHARED | MAP_HUGETLB | MAP_NORESERVE; p = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, -1, 0); if (p == MAP_FAILED) { fprintf(stderr, "mmap() failed: %s\n", strerror(errno)); } p[0] = '0'; sleep(10); While sleep(10) is executing, run 'cat /proc/meminfo' from another process. HugePages_Free: 99 HugePages_Rsvd: 100 The number of free pages should be greater than or equal to the number of reserved pages, but it isn't. This shows that a non-reserved shared mapping steals a reserved page. Non-reserved shared mappings should not eat into reserve space. If we consider VM_NORESERVE in vma_has_reserves() and return 0, meaning we don't have reserved pages, then we check that we have enough free pages in dequeue_huge_page_vma(). This prevents stealing a reserved page. With this change, the above test generates a SIGBUS, which is correct, because all free pages are reserved and a non-reserved shared mapping can't get a free page.
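A self-contained version of this reproducer, for reference (a sketch: hugetlbfs_unlinked_fd() comes from the libhugetlbfs test harness, so a plain open() on an assumed hugetlbfs mount stands in here; MAP_HUGETLB and MAP_NORESERVE need Linux headers):

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/mman.h>

    #define MB (1024UL * 1024UL)

    int main(void)
    {
            /* stand-in for hugetlbfs_unlinked_fd(): assumed mount point */
            int fd = open("/mnt/huge/resv-test", O_CREAT | O_RDWR, 0600);
            size_t size;
            char *p;

            if (fd < 0) { perror("open"); return 1; }

            size = 200 * MB;        /* reserves all 100 x 2MB huge pages */
            p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
            if (p == MAP_FAILED) { perror("mmap file"); return 1; }

            size = 2 * MB;
            p = mmap(NULL, size, PROT_READ | PROT_WRITE,
                     MAP_ANONYMOUS | MAP_SHARED | MAP_HUGETLB | MAP_NORESERVE,
                     -1, 0);
            if (p == MAP_FAILED) { perror("mmap noreserve"); return 1; }

            /* on a fixed kernel this write raises SIGBUS; on a broken
             * one it silently steals a reserved huge page */
            p[0] = '0';
            sleep(10);      /* inspect /proc/meminfo from another shell */
            return 0;
    }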
Signed-off-by: Joonsoo Kim Reviewed-by: Wanpeng Li Reviewed-by: Aneesh Kumar K.V Acked-by: Hillf Danton Acked-by: Michal Hocko Cc: Naoya Horiguchi Cc: Rik van Riel Cc: Mel Gorman Cc: "Aneesh Kumar K.V" Cc: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Davidlohr Bueso Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index da027a3307af..cb134e6a9fee 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -464,6 +464,8 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma) /* Returns true if the VMA has associated reserve pages */ static int vma_has_reserves(struct vm_area_struct *vma) { + if (vma->vm_flags & VM_NORESERVE) + return 0; if (vma->vm_flags & VM_MAYSHARE) return 1; if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) From a63884e921cb33a6beb260fa88bcbf1712d98a9a Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:21:07 -0700 Subject: [PATCH 066/303] mm, hugetlb: remove decrement_hugepage_resv_vma() Now the checking condition of decrement_hugepage_resv_vma() and vma_has_reserves() is the same, so we can clean up this function using vma_has_reserves(). Additionally, decrement_hugepage_resv_vma() has only one call site, so we can remove the function and embed it into dequeue_huge_page_vma() directly. This patch implements that. Signed-off-by: Joonsoo Kim Reviewed-by: Wanpeng Li Reviewed-by: Aneesh Kumar K.V Acked-by: Hillf Danton Acked-by: Michal Hocko Cc: Naoya Horiguchi Cc: Rik van Riel Cc: Mel Gorman Cc: "Aneesh Kumar K.V" Cc: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Davidlohr Bueso Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cb134e6a9fee..dacf0d2256d9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -434,25 +434,6 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) return (get_vma_private_data(vma) & flag) != 0; } -/* Decrement the reserved pages in the hugepage pool by one */ -static void decrement_hugepage_resv_vma(struct hstate *h, - struct vm_area_struct *vma) -{ - if (vma->vm_flags & VM_NORESERVE) - return; - - if (vma->vm_flags & VM_MAYSHARE) { - /* Shared mappings always use reserves */ - h->resv_huge_pages--; - } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { - /* - * Only the process that called mmap() has reserves for - * private mappings. - */ - h->resv_huge_pages--; - } -} - /* Reset counters to 0 and clear all HPAGE_RESV_* flags */ void reset_vma_resv_huge_pages(struct vm_area_struct *vma) { @@ -466,10 +447,18 @@ static int vma_has_reserves(struct vm_area_struct *vma) { if (vma->vm_flags & VM_NORESERVE) return 0; + + /* Shared mappings always use reserves */ if (vma->vm_flags & VM_MAYSHARE) return 1; + + /* + * Only the process that called mmap() has reserves for + * private mappings.
+ */ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) return 1; + return 0; } @@ -564,8 +553,8 @@ retry_cpuset: if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) { page = dequeue_huge_page_node(h, zone_to_nid(zone)); if (page) { - if (!avoid_reserve) - decrement_hugepage_resv_vma(h, vma); + if (!avoid_reserve && vma_has_reserves(vma)) + h->resv_huge_pages--; break; } } From af0ed73e699bb0453603b1d1a4727377641b2096 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:21:18 -0700 Subject: [PATCH 067/303] mm, hugetlb: decrement reserve count if VM_NORESERVE alloc page cache If a vma with VM_NORESERVE allocates a new page for the page cache, we should check whether this area is reserved or not. If this address is already reserved by another process (the chg == 0 case), we should decrement the reserve count, because this allocated page will go into the page cache and currently there is no way to know, when releasing the inode, whether the page came from the reserved pool or not. This may introduce an over-counting problem in the reserve count. With the following example code, you can easily reproduce this situation. Assume 2MB, nr_hugepages = 100 size = 20 * MB; flag = MAP_SHARED; p = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, fd, 0); if (p == MAP_FAILED) { fprintf(stderr, "mmap() failed: %s\n", strerror(errno)); return -1; } flag = MAP_SHARED | MAP_NORESERVE; q = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, fd, 0); if (q == MAP_FAILED) { fprintf(stderr, "mmap() failed: %s\n", strerror(errno)); } q[0] = 'c'; After the program finishes, run 'cat /proc/meminfo'. You will see the result below. HugePages_Free: 100 HugePages_Rsvd: 1 To fix this, we should check our mapping type and the tracked region. If our mapping is VM_NORESERVE and VM_MAYSHARE and chg is 0, this implies that the currently allocated page will go into a page cache region that was already reserved when the mapping was created. In this case, we should decrease the reserve count. By implementing the above, this patch solves the problem. [akpm@linux-foundation.org: fix spelling in comment] Signed-off-by: Joonsoo Kim Reviewed-by: Wanpeng Li Reviewed-by: Aneesh Kumar K.V Acked-by: Hillf Danton Acked-by: Michal Hocko Cc: Naoya Horiguchi Cc: Rik van Riel Cc: Mel Gorman Cc: "Aneesh Kumar K.V" Cc: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Davidlohr Bueso Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dacf0d2256d9..5b084c7b34c6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -443,10 +443,23 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma) } /* Returns true if the VMA has associated reserve pages */ -static int vma_has_reserves(struct vm_area_struct *vma) +static int vma_has_reserves(struct vm_area_struct *vma, long chg) { - if (vma->vm_flags & VM_NORESERVE) - return 0; + if (vma->vm_flags & VM_NORESERVE) { + /* + * This address is already reserved by other process(chg == 0), + * so, we should decrement reserved count. Without decrementing, + * reserve count remains after releasing inode, because this + * allocated page will go into page cache and is regarded as + * coming from reserved pool in releasing step. Currently, we + * don't have any other solution to deal with this situation + * properly, so add work-around here.
+ */ if (vma->vm_flags & VM_MAYSHARE && chg == 0) + return 1; + else + return 0; + } /* Shared mappings always use reserves */ if (vma->vm_flags & VM_MAYSHARE) @@ -520,7 +533,8 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid) static struct page *dequeue_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, - unsigned long address, int avoid_reserve) + unsigned long address, int avoid_reserve, + long chg) { struct page *page = NULL; struct mempolicy *mpol; @@ -535,7 +549,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, * have no page reserves. This check ensures that reservations are * not "stolen". The child may still get SIGKILLed */ - if (!vma_has_reserves(vma) && + if (!vma_has_reserves(vma, chg) && h->free_huge_pages - h->resv_huge_pages == 0) goto err; @@ -553,8 +567,12 @@ retry_cpuset: if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) { page = dequeue_huge_page_node(h, zone_to_nid(zone)); if (page) { - if (!avoid_reserve && vma_has_reserves(vma)) - h->resv_huge_pages--; + if (avoid_reserve) + break; + if (!vma_has_reserves(vma, chg)) + break; + + h->resv_huge_pages--; break; } } @@ -1155,7 +1173,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, return ERR_PTR(-ENOSPC); } spin_lock(&hugetlb_lock); - page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); + page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg); if (!page) { spin_unlock(&hugetlb_lock); page = alloc_buddy_huge_page(h, NUMA_NO_NODE); From 1da6f0e1b316d0215989fe4d7c657edead1fdea7 Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Wed, 11 Sep 2013 14:21:25 -0700 Subject: [PATCH 068/303] mm/mempolicy: return NULL if node is NUMA_NO_NODE in get_task_policy If node == NUMA_NO_NODE, pol is NULL, so we should return NULL instead of doing the "if (!pol->mode)" check. [akpm@linux-foundation.org: reorganise code] Signed-off-by: Jianguo Wu Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Hugh Dickins Cc: Hanjun Guo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 6b1d426731ae..27022ca890f8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -123,16 +123,19 @@ static struct mempolicy preferred_node_policy[MAX_NUMNODES]; static struct mempolicy *get_task_policy(struct task_struct *p) { struct mempolicy *pol = p->mempolicy; - int node; if (!pol) { - node = numa_node_id(); - if (node != NUMA_NO_NODE) - pol = &preferred_node_policy[node]; + int node = numa_node_id(); - /* preferred_node_policy is not initialised early in boot */ - if (!pol->mode) - pol = NULL; + if (node != NUMA_NO_NODE) { + pol = &preferred_node_policy[node]; + /* + * preferred_node_policy is not initialised early in + * boot + */ + if (!pol->mode) + pol = NULL; + } } return pol; From e66f09725771ac5b1b868d6b19eba84e30ffad88 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:21:26 -0700 Subject: [PATCH 069/303] mm, page_alloc: add unlikely macro to help compiler optimization We rarely allocate a page with ALLOC_NO_WATERMARKS, and it is used in the slow path. To help compiler optimization, add the unlikely macro to the ALLOC_NO_WATERMARKS check. This patch doesn't have any effect now, because gcc already optimizes this properly. But we cannot assume that gcc always does the right thing, and nobody re-evaluates whether gcc still does the proper optimization after their changes; for example, it is not optimized properly on v3.10.
So adding a compiler hint here is reasonable. Signed-off-by: Joonsoo Kim Acked-by: Johannes Weiner Cc: Minchan Kim Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 544d19d681a2..42c59300bacd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1909,7 +1909,7 @@ zonelist_scan: !cpuset_zone_allowed_softwall(zone, gfp_mask)) continue; BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); - if (alloc_flags & ALLOC_NO_WATERMARKS) + if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS)) goto try_this_zone; /* * Distribute pages in proportion to the individual
From bc4b4448dba660afc8df3790564320302d9709a1 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:21:28 -0700 Subject: [PATCH 070/303] mm: move pgtable related functions to right place Page table related functions mostly live in pgtable-generic.c, so move the remaining ones from memory.c to pgtable-generic.c. Signed-off-by: Joonsoo Kim Cc: Johannes Weiner Cc: Minchan Kim Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 24 ------------------------ mm/pgtable-generic.c | 24 ++++++++++++++++++++++++ 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index b3c6bf9a398e..c1c6d59b2b03 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -372,30 +372,6 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table) #endif /* CONFIG_HAVE_RCU_TABLE_FREE */ -/* - * If a p?d_bad entry is found while walking page tables, report - * the error, before resetting entry to p?d_none. Usually (but - * very seldom) called out from the p?d_none_or_clear_bad macros. - */ - -void pgd_clear_bad(pgd_t *pgd) -{ - pgd_ERROR(*pgd); - pgd_clear(pgd); -} - -void pud_clear_bad(pud_t *pud) -{ - pud_ERROR(*pud); - pud_clear(pud); -} - -void pmd_clear_bad(pmd_t *pmd) -{ - pmd_ERROR(*pmd); - pmd_clear(pmd); -} - /* * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index e1a6e4fab016..3929a40bd6c0 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -10,6 +10,30 @@ #include #include +/* + * If a p?d_bad entry is found while walking page tables, report + * the error, before resetting entry to p?d_none. Usually (but + * very seldom) called out from the p?d_none_or_clear_bad macros. + */ + +void pgd_clear_bad(pgd_t *pgd) +{ + pgd_ERROR(*pgd); + pgd_clear(pgd); +} + +void pud_clear_bad(pud_t *pud) +{ + pud_ERROR(*pud); + pud_clear(pud); +} + +void pmd_clear_bad(pmd_t *pmd) +{ + pmd_ERROR(*pmd); + pmd_clear(pmd); +} + #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS /* * Only sets the access flags (dirty, accessed), as well as write
From d2cf5ad6312ca9913464fac40fb47ba47ad945c4 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:21:29 -0700 Subject: [PATCH 071/303] swap: clean-up #ifdef in page_mapping() PageSwapCache() is always false when !CONFIG_SWAP, so the compiler properly discards the related code. Therefore, we don't need the explicit #ifdef.
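The pattern generalizes: when a predicate compiles to a constant zero, the guarded branch is provably dead and the compiler removes it, so no #ifdef is needed at the call site. A minimal userspace sketch of the idea (stand-in names, not the kernel's definitions):

	#include <stdio.h>

	/* Stand-in for PageSwapCache(): a constant 0 when the feature is
	 * compiled out, just like the !CONFIG_SWAP stub in the kernel. */
	#ifdef FEATURE_SWAP
	static int page_is_swapcache(unsigned long page) { return page & 1; }
	#else
	static int page_is_swapcache(unsigned long page) { (void)page; return 0; }
	#endif

	int main(void)
	{
		unsigned long page = 2;

		/* Without FEATURE_SWAP this branch is dead code and is
		 * discarded at compile time; no #ifdef around the caller. */
		if (page_is_swapcache(page))
			printf("swap cache path\n");
		else
			printf("regular mapping path\n");
		return 0;
	}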
Signed-off-by: Joonsoo Kim Acked-by: Johannes Weiner Cc: Minchan Kim Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 + mm/util.c | 5 +---- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 24db9142e93b..c03c139219c9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -447,6 +447,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) #else /* CONFIG_SWAP */ +#define swap_address_space(entry) (NULL) #define get_nr_swap_pages() 0L #define total_swap_pages 0L #define total_swapcache_pages() 0UL diff --git a/mm/util.c b/mm/util.c index 7441c41d00f6..eaf63fc2c92f 100644 --- a/mm/util.c +++ b/mm/util.c @@ -388,15 +388,12 @@ struct address_space *page_mapping(struct page *page) struct address_space *mapping = page->mapping; VM_BUG_ON(PageSlab(page)); -#ifdef CONFIG_SWAP if (unlikely(PageSwapCache(page))) { swp_entry_t entry; entry.val = page_private(page); mapping = swap_address_space(entry); - } else -#endif - if ((unsigned long)mapping & PAGE_MAPPING_ANON) + } else if ((unsigned long)mapping & PAGE_MAPPING_ANON) mapping = NULL; return mapping; } From 2bb921e526656556e68f99f5f15a4a1bf2691844 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 11 Sep 2013 14:21:30 -0700 Subject: [PATCH 072/303] vmstat: create separate function to fold per cpu diffs into local counters The main idea behind this patchset is to reduce the vmstat update overhead by avoiding interrupt enable/disable and the use of per cpu atomics. This patch (of 3): It is better to have a separate folding function because refresh_cpu_vm_stats() also does other things like expire pages in the page allocator caches. If we have a separate function then refresh_cpu_vm_stats() is only called from the local cpu which allows additional optimizations. The folding function is only called when a cpu is being downed and therefore no other processor will be accessing the counters. Also simplifies synchronization. 
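The split can be pictured with a minimal sketch, using plain arrays to stand in for the per-cpu counters (all names here are illustrative, not the kernel's):

	#define NR_ITEMS 4
	#define NR_CPUS  2

	static long vm_stat[NR_ITEMS];                /* global counters */
	static int  vm_stat_diff[NR_CPUS][NR_ITEMS];  /* per-cpu deltas  */

	/* Fold a downed cpu's deltas into the global counters. The cpu is
	 * offline, so nothing can race with us: no atomics, no irq games. */
	static void cpu_vm_stats_fold(int cpu)
	{
		for (int i = 0; i < NR_ITEMS; i++) {
			vm_stat[i] += vm_stat_diff[cpu][i];
			vm_stat_diff[cpu][i] = 0;
		}
	}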
[akpm@linux-foundation.org: fix UP build] Signed-off-by: Christoph Lameter Cc: KOSAKI Motohiro CC: Tejun Heo Cc: Joonsoo Kim Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 3 ++- mm/page_alloc.c | 2 +- mm/vmstat.c | 40 ++++++++++++++++++++++++++++++++++------ 3 files changed, 37 insertions(+), 8 deletions(-) diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index c586679b6fef..502767f4e4d4 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -198,7 +198,7 @@ extern void __inc_zone_state(struct zone *, enum zone_stat_item); extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); -void refresh_cpu_vm_stats(int); +void cpu_vm_stats_fold(int cpu); void refresh_zone_stat_thresholds(void); void drain_zonestat(struct zone *zone, struct per_cpu_pageset *); @@ -255,6 +255,7 @@ static inline void __dec_zone_page_state(struct page *page, static inline void refresh_cpu_vm_stats(int cpu) { } static inline void refresh_zone_stat_thresholds(void) { } +static inline void cpu_vm_stats_fold(int cpu) { } static inline void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) { } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 42c59300bacd..f885eb827159 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5435,7 +5435,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self, * This is only okay since the processor is dead and cannot * race with what we are doing. */ - refresh_cpu_vm_stats(cpu); + cpu_vm_stats_fold(cpu); } return NOTIFY_OK; } diff --git a/mm/vmstat.c b/mm/vmstat.c index 8a8da1f9b044..aaee66330e01 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -415,11 +415,7 @@ EXPORT_SYMBOL(dec_zone_page_state); #endif /* - * Update the zone counters for one cpu. - * - * The cpu specified must be either the current cpu or a processor that - * is not online. If it is the current cpu then the execution thread must - * be pinned to the current cpu. + * Update the zone counters for the current cpu. * * Note that refresh_cpu_vm_stats strives to only access * node local memory. The per cpu pagesets on remote zones are placed @@ -432,7 +428,7 @@ EXPORT_SYMBOL(dec_zone_page_state); * with the global counters. These could cause remote node cache line * bouncing and will have to be only done when necessary. */ -void refresh_cpu_vm_stats(int cpu) +static void refresh_cpu_vm_stats(int cpu) { struct zone *zone; int i; @@ -493,6 +489,38 @@ void refresh_cpu_vm_stats(int cpu) atomic_long_add(global_diff[i], &vm_stat[i]); } +/* + * Fold the data for an offline cpu into the global array. + * There cannot be any access by the offline cpu and therefore + * synchronization is simplified. + */ +void cpu_vm_stats_fold(int cpu) +{ + struct zone *zone; + int i; + int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; + + for_each_populated_zone(zone) { + struct per_cpu_pageset *p; + + p = per_cpu_ptr(zone->pageset, cpu); + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + if (p->vm_stat_diff[i]) { + int v; + + v = p->vm_stat_diff[i]; + p->vm_stat_diff[i] = 0; + atomic_long_add(v, &zone->vm_stat[i]); + global_diff[i] += v; + } + } + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + if (global_diff[i]) + atomic_long_add(global_diff[i], &vm_stat[i]); +} + /* * this is only called if !populated_zone(zone), which implies no other users of * pset->vm_stat_diff[] exsist. 
From 4edb0748b23887140578d68f5f4e6e2de337a481 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 11 Sep 2013 14:21:31 -0700 Subject: [PATCH 073/303] vmstat: create fold_diff Both functions that update global counters use the same mechanism. Create a function that contains the common code. Signed-off-by: Christoph Lameter Cc: KOSAKI Motohiro CC: Tejun Heo Cc: Joonsoo Kim Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index aaee66330e01..158ca6494bc6 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -414,6 +414,15 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item) EXPORT_SYMBOL(dec_zone_page_state); #endif +static inline void fold_diff(int *diff) +{ + int i; + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + if (diff[i]) + atomic_long_add(diff[i], &vm_stat[i]); +} + /* * Update the zone counters for the current cpu. * @@ -483,10 +492,7 @@ static void refresh_cpu_vm_stats(int cpu) drain_zone_pages(zone, &p->pcp); #endif } - - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - if (global_diff[i]) - atomic_long_add(global_diff[i], &vm_stat[i]); + fold_diff(global_diff); } /* @@ -516,9 +522,7 @@ void cpu_vm_stats_fold(int cpu) } } - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - if (global_diff[i]) - atomic_long_add(global_diff[i], &vm_stat[i]); + fold_diff(global_diff); } /* From fbc2edb05354480a88aa39db8a6acb5782fa1a1b Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 11 Sep 2013 14:21:32 -0700 Subject: [PATCH 074/303] vmstat: use this_cpu() to avoid irqon/off sequence in refresh_cpu_vm_stats Disabling interrupts repeatedly can be avoided in the inner loop if we use a this_cpu operation. Signed-off-by: Christoph Lameter Cc: KOSAKI Motohiro CC: Tejun Heo Cc: Joonsoo Kim Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 158ca6494bc6..d57a09143bf9 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -437,33 +437,29 @@ static inline void fold_diff(int *diff) * with the global counters. These could cause remote node cache line * bouncing and will have to be only done when necessary. */ -static void refresh_cpu_vm_stats(int cpu) +static void refresh_cpu_vm_stats(void) { struct zone *zone; int i; int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; for_each_populated_zone(zone) { - struct per_cpu_pageset *p; + struct per_cpu_pageset __percpu *p = zone->pageset; - p = per_cpu_ptr(zone->pageset, cpu); + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { + int v; - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - if (p->vm_stat_diff[i]) { - unsigned long flags; - int v; + v = this_cpu_xchg(p->vm_stat_diff[i], 0); + if (v) { - local_irq_save(flags); - v = p->vm_stat_diff[i]; - p->vm_stat_diff[i] = 0; - local_irq_restore(flags); atomic_long_add(v, &zone->vm_stat[i]); global_diff[i] += v; #ifdef CONFIG_NUMA /* 3 seconds idle till flush */ - p->expire = 3; + __this_cpu_write(p->expire, 3); #endif } + } cond_resched(); #ifdef CONFIG_NUMA /* @@ -473,23 +469,24 @@ static void refresh_cpu_vm_stats(int cpu) * Check if there are pages remaining in this pageset * if not then there is nothing to expire. */ - if (!p->expire || !p->pcp.count) + if (!__this_cpu_read(p->expire) || + !__this_cpu_read(p->pcp.count)) continue; /* * We never drain zones local to this processor. 
*/ if (zone_to_nid(zone) == numa_node_id()) { - p->expire = 0; + __this_cpu_write(p->expire, 0); continue; } - p->expire--; - if (p->expire) + + if (__this_cpu_dec_return(p->expire)) continue; - if (p->pcp.count) - drain_zone_pages(zone, &p->pcp); + if (__this_cpu_read(p->pcp.count)) + drain_zone_pages(zone, __this_cpu_ptr(&p->pcp)); #endif } fold_diff(global_diff); @@ -1216,7 +1213,7 @@ int sysctl_stat_interval __read_mostly = HZ; static void vmstat_update(struct work_struct *w) { - refresh_cpu_vm_stats(smp_processor_id()); + refresh_cpu_vm_stats(); schedule_delayed_work(&__get_cpu_var(vmstat_work), round_jiffies_relative(sysctl_stat_interval)); }
From 6b70f7dff8f7ce2f4692afc7d4ef9f73f8c82434 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:21:39 -0700 Subject: [PATCH 075/303] mm, vmalloc: remove useless variable in vmap_block The vbq field in vmap_block isn't used, so remove it. Signed-off-by: Joonsoo Kim Reviewed-by: Wanpeng Li Acked-by: Johannes Weiner Acked-by: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 13a54953a273..d23c43258727 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -752,7 +752,6 @@ struct vmap_block_queue { struct vmap_block { spinlock_t lock; struct vmap_area *va; - struct vmap_block_queue *vbq; unsigned long free, dirty; DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); struct list_head free_list; @@ -830,7 +829,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) radix_tree_preload_end(); vbq = &get_cpu_var(vmap_block_queue); - vb->vbq = vbq; spin_lock(&vbq->lock); list_add_rcu(&vb->free_list, &vbq->free); spin_unlock(&vbq->lock);
From b136be5e0b6e8e3e4dcb6722b51bb35199b06810 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:21:40 -0700 Subject: [PATCH 076/303] mm, vmalloc: use well-defined find_last_bit() func Our intention here is to find the last set bit within the region to flush. There is a well-defined function, find_last_bit(), for exactly this purpose, and its performance may be slightly better than the current implementation. So switch to it.
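The shape of the new computation can be verified in isolation. A userspace sketch with toy single-word bitmap helpers (the real code uses the kernel's find_first_bit()/find_last_bit() over VMAP_BBMAP_BITS):

	#include <stdio.h>

	/* Toy stand-ins for the kernel bitmap helpers, single word only. */
	static int first_bit(unsigned long map, int nbits)
	{
		for (int i = 0; i < nbits; i++)
			if (map & (1UL << i))
				return i;
		return nbits;
	}

	static int last_bit(unsigned long map, int nbits)
	{
		for (int i = nbits - 1; i >= 0; i--)
			if (map & (1UL << i))
				return i;
		return nbits;
	}

	int main(void)
	{
		unsigned long dirty = 0x0f30;	/* bits 4-5 and 8-11 set */
		int nbits = 16;

		int i = first_bit(dirty, nbits);
		if (i < nbits) {
			int j = last_bit(dirty, nbits) + 1; /* exclusive end */
			/* one [start, end) flush range, no scan loop */
			printf("flush range [%d, %d)\n", i, j); /* [4, 12) */
		}
		return 0;
	}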
Signed-off-by: Joonsoo Kim Reviewed-by: Wanpeng Li Acked-by: Johannes Weiner Acked-by: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d23c43258727..93d3182c3300 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1016,15 +1016,16 @@ void vm_unmap_aliases(void) rcu_read_lock(); list_for_each_entry_rcu(vb, &vbq->free, free_list) { - int i; + int i, j; spin_lock(&vb->lock); i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS); - while (i < VMAP_BBMAP_BITS) { + if (i < VMAP_BBMAP_BITS) { unsigned long s, e; - int j; - j = find_next_zero_bit(vb->dirty_map, - VMAP_BBMAP_BITS, i); + + j = find_last_bit(vb->dirty_map, + VMAP_BBMAP_BITS); + j = j + 1; /* need exclusive index */ s = vb->va->va_start + (i << PAGE_SHIFT); e = vb->va->va_start + (j << PAGE_SHIFT); @@ -1034,10 +1035,6 @@ void vm_unmap_aliases(void) start = s; if (e > end) end = e; - - i = j; - i = find_next_bit(vb->dirty_map, - VMAP_BBMAP_BITS, i); } spin_unlock(&vb->lock); } From 37b000b640741132eddaa9fbeca1f988139ad7e2 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Wed, 11 Sep 2013 14:21:41 -0700 Subject: [PATCH 077/303] mm/hotplug: remove unnecessary BUG_ON in __offline_pages() I think we can remove "BUG_ON(start_pfn >= end_pfn)" in __offline_pages(), because in memory_block_action() "nr_pages = PAGES_PER_SECTION * sections_per_block" is always greater than 0. memory_block_action() offline_pages() __offline_pages() BUG_ON(start_pfn >= end_pfn) In v2.6.32, If info->length==0, this way may hit this BUG_ON(). acpi_memory_disable_device() remove_memory(info->start_addr, info->length) offline_pages() A later Fujitsu patch renamed this function and the BUG_ON() is unnecessary. Signed-off-by: Xishi Qiu Reviewed-by: Dave Hansen Cc: Toshi Kani Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ca1dd3aa5eee..8e333f953f08 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1472,7 +1472,6 @@ static int __ref __offline_pages(unsigned long start_pfn, struct zone *zone; struct memory_notify arg; - BUG_ON(start_pfn >= end_pfn); /* at least, alignment against pageblock is necessary */ if (!IS_ALIGNED(start_pfn, pageblock_nr_pages)) return -EINVAL; From eee87e1726af8c746f0e15ae6c57a97675f5e960 Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Wed, 11 Sep 2013 14:21:42 -0700 Subject: [PATCH 078/303] mm/zbud: fix some trivial typos in comments Signed-off-by: Jianguo Wu Cc: Seth Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zbud.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/zbud.c b/mm/zbud.c index ad1e781284fd..9451361e6aa7 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -16,7 +16,7 @@ * * zbud works by storing compressed pages, or "zpages", together in pairs in a * single memory page called a "zbud page". The first buddy is "left - * justifed" at the beginning of the zbud page, and the last buddy is "right + * justified" at the beginning of the zbud page, and the last buddy is "right * justified" at the end of the zbud page. 
The benefit is that if either * buddy is freed, the freed buddy space, coalesced with whatever slack space * that existed between the buddies, results in the largest possible free region @@ -243,7 +243,7 @@ void zbud_destroy_pool(struct zbud_pool *pool) * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used * as zbud pool pages. * - * Return: 0 if success and handle is set, otherwise -EINVAL is the size or + * Return: 0 if success and handle is set, otherwise -EINVAL if the size or * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate * a new page. */ From 674470d97958a0ec72f72caf7f6451da40159cc7 Mon Sep 17 00:00:00 2001 From: Joonyoung Shim Date: Wed, 11 Sep 2013 14:21:43 -0700 Subject: [PATCH 079/303] lib/genalloc.c: fix overflow of ending address of memory chunk In struct gen_pool_chunk, end_addr means the end address of memory chunk (inclusive), but in the implementation it is treated as address + size of memory chunk (exclusive), so it points to the address plus one instead of correct ending address. The ending address of memory chunk plus one will cause overflow on the memory chunk including the last address of memory map, e.g. when starting address is 0xFFF00000 and size is 0x100000 on 32bit machine, ending address will be 0x100000000. Use correct ending address like starting address + size - 1. [akpm@linux-foundation.org: add comment to struct gen_pool_chunk:end_addr] Signed-off-by: Joonyoung Shim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/genalloc.h | 4 ++-- lib/genalloc.c | 19 ++++++++++++------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index 661d374aeb2d..f8d41cb1cbe0 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -66,8 +66,8 @@ struct gen_pool_chunk { struct list_head next_chunk; /* next chunk in pool */ atomic_t avail; phys_addr_t phys_addr; /* physical starting address of memory chunk */ - unsigned long start_addr; /* starting address of memory chunk */ - unsigned long end_addr; /* ending address of memory chunk */ + unsigned long start_addr; /* start address of memory chunk */ + unsigned long end_addr; /* end address of memory chunk (inclusive) */ unsigned long bits[0]; /* bitmap for allocating memory chunk */ }; diff --git a/lib/genalloc.c b/lib/genalloc.c index b35cfa9bc3d4..2a39bf62d8c1 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -37,6 +37,11 @@ #include #include +static inline size_t chunk_size(const struct gen_pool_chunk *chunk) +{ + return chunk->end_addr - chunk->start_addr + 1; +} + static int set_bits_ll(unsigned long *addr, unsigned long mask_to_set) { unsigned long val, nval; @@ -188,7 +193,7 @@ int gen_pool_add_virt(struct gen_pool *pool, unsigned long virt, phys_addr_t phy chunk->phys_addr = phys; chunk->start_addr = virt; - chunk->end_addr = virt + size; + chunk->end_addr = virt + size - 1; atomic_set(&chunk->avail, size); spin_lock(&pool->lock); @@ -213,7 +218,7 @@ phys_addr_t gen_pool_virt_to_phys(struct gen_pool *pool, unsigned long addr) rcu_read_lock(); list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) { - if (addr >= chunk->start_addr && addr < chunk->end_addr) { + if (addr >= chunk->start_addr && addr <= chunk->end_addr) { paddr = chunk->phys_addr + (addr - chunk->start_addr); break; } @@ -242,7 +247,7 @@ void gen_pool_destroy(struct gen_pool *pool) chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk); list_del(&chunk->next_chunk); - end_bit = (chunk->end_addr - 
chunk->start_addr) >> order; + end_bit = chunk_size(chunk) >> order; bit = find_next_bit(chunk->bits, end_bit, 0); BUG_ON(bit < end_bit); @@ -283,7 +288,7 @@ unsigned long gen_pool_alloc(struct gen_pool *pool, size_t size) if (size > atomic_read(&chunk->avail)) continue; - end_bit = (chunk->end_addr - chunk->start_addr) >> order; + end_bit = chunk_size(chunk) >> order; retry: start_bit = pool->algo(chunk->bits, end_bit, start_bit, nbits, pool->data); @@ -330,8 +335,8 @@ void gen_pool_free(struct gen_pool *pool, unsigned long addr, size_t size) nbits = (size + (1UL << order) - 1) >> order; rcu_read_lock(); list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) { - if (addr >= chunk->start_addr && addr < chunk->end_addr) { - BUG_ON(addr + size > chunk->end_addr); + if (addr >= chunk->start_addr && addr <= chunk->end_addr) { + BUG_ON(addr + size - 1 > chunk->end_addr); start_bit = (addr - chunk->start_addr) >> order; remain = bitmap_clear_ll(chunk->bits, start_bit, nbits); BUG_ON(remain); @@ -400,7 +405,7 @@ size_t gen_pool_size(struct gen_pool *pool) rcu_read_lock(); list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) - size += chunk->end_addr - chunk->start_addr; + size += chunk_size(chunk); rcu_read_unlock(); return size; } From c33bc315fd921b1179a1d3df5756e0da6fb73944 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Wed, 11 Sep 2013 14:21:44 -0700 Subject: [PATCH 080/303] mm: use zone_end_pfn() instead of zone_start_pfn+spanned_pages Use "zone_end_pfn()" instead of "zone->zone_start_pfn + zone->spanned_pages". Simplify the code, no functional change. [akpm@linux-foundation.org: fix build] Signed-off-by: Xishi Qiu Cc: Cody P Schafer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 12 ++++++------ mm/memory_hotplug.c | 7 ++++--- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 349587bb03e1..358a146fd4da 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -352,7 +352,7 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) struct mem_extent *ext, *cur, *aux; zone_start = zone->zone_start_pfn; - zone_end = zone->zone_start_pfn + zone->spanned_pages; + zone_end = zone_end_pfn(zone); list_for_each_entry(ext, list, hook) if (zone_start <= ext->end) @@ -884,7 +884,7 @@ static unsigned int count_highmem_pages(void) continue; mark_free_pages(zone); - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (saveable_highmem_page(zone, pfn)) n++; @@ -948,7 +948,7 @@ static unsigned int count_data_pages(void) continue; mark_free_pages(zone); - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (saveable_page(zone, pfn)) n++; @@ -1041,7 +1041,7 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) unsigned long max_zone_pfn; mark_free_pages(zone); - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (page_is_saveable(zone, pfn)) memory_bm_set_bit(orig_bm, pfn); @@ -1093,7 +1093,7 @@ void swsusp_free(void) unsigned long pfn, max_zone_pfn; for_each_populated_zone(zone) { - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < 
max_zone_pfn; pfn++) if (pfn_valid(pfn)) { struct page *page = pfn_to_page(pfn); @@ -1755,7 +1755,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm) /* Clear page flags */ for_each_populated_zone(zone) { - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (pfn_valid(pfn)) swsusp_unset_page_free(pfn_to_page(pfn)); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 8e333f953f08..9eadad626d64 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -229,7 +229,7 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn, zone_span_writelock(zone); - old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; + old_zone_end_pfn = zone_end_pfn(zone); if (!zone->spanned_pages || start_pfn < zone->zone_start_pfn) zone->zone_start_pfn = start_pfn; @@ -514,8 +514,9 @@ static int find_biggest_section_pfn(int nid, struct zone *zone, static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, unsigned long end_pfn) { - unsigned long zone_start_pfn = zone->zone_start_pfn; - unsigned long zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; + unsigned long zone_start_pfn = zone->zone_start_pfn; + unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */ + unsigned long zone_end_pfn = z; unsigned long pfn; struct mem_section *ms; int nid = zone_to_nid(zone); From 8080fc038e91265e1002df7cae805fc17bb772fc Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Wed, 11 Sep 2013 14:21:45 -0700 Subject: [PATCH 081/303] mm: use zone_is_empty() instead of if(zone->spanned_pages) Use "zone_is_empty()" instead of "if (zone->spanned_pages)". Simplify the code, no functional change. Signed-off-by: Xishi Qiu Cc: Cody P Schafer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 6 +++--- mm/page_alloc.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9eadad626d64..4f5df61d6016 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -230,7 +230,7 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn, zone_span_writelock(zone); old_zone_end_pfn = zone_end_pfn(zone); - if (!zone->spanned_pages || start_pfn < zone->zone_start_pfn) + if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) zone->zone_start_pfn = start_pfn; zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - @@ -305,7 +305,7 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, goto out_fail; /* use start_pfn for z1's start_pfn if z1 is empty */ - if (z1->spanned_pages) + if (!zone_is_empty(z1)) z1_start_pfn = z1->zone_start_pfn; else z1_start_pfn = start_pfn; @@ -347,7 +347,7 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, goto out_fail; /* use end_pfn for z2's end_pfn if z2 is empty */ - if (z2->spanned_pages) + if (!zone_is_empty(z2)) z2_end_pfn = zone_end_pfn(z2); else z2_end_pfn = end_pfn; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f885eb827159..7c3f8d7e2d8e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1306,7 +1306,7 @@ void mark_free_pages(struct zone *zone) int order, t; struct list_head *curr; - if (!zone->spanned_pages) + if (zone_is_empty(zone)) return; spin_lock_irqsave(&zone->lock, flags); From 139c2d75b4e81d449d97f1f8188b84529eb56708 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Wed, 11 Sep 2013 14:21:46 -0700 Subject: [PATCH 082/303] mm: use zone_is_initialized() 
instead of if(zone->wait_table) Use "zone_is_initialized()" instead of "if (zone->wait_table)". Simplify the code, no functional change. Signed-off-by: Xishi Qiu Cc: Cody P Schafer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 4f5df61d6016..46b489cacdd8 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -194,7 +194,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat) zone = &pgdat->node_zones[0]; for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) { - if (zone->wait_table) { + if (zone_is_initialized(zone)) { nr_pages = zone->wait_table_hash_nr_entries * sizeof(wait_queue_head_t); nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT;
From 2cad401801978b16ac6e43f10b8d60039670fcbc Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Wed, 11 Sep 2013 14:21:47 -0700 Subject: [PATCH 083/303] readahead: make context readahead more conservative This helps performance on moderately dense random reads on SSD. Transactions-per-second numbers provided by Taobao:

QPS	case
-------------------------------------------------------
7536	disable context readahead totally
w/ patch: 7129	slower size rampup and start RA on the 3rd read
6717	slower size rampup
w/o patch: 5581	unmodified context readahead

Before, readahead would be started whenever page N+1 was read and page N happened to have been read recently. After the patch, we only start readahead when *three* random reads happen to access pages N, N+1, N+2. The probability of this happening is extremely low for pure random reads, unless they are very dense, in which case some readahead is actually deserved. Also start with a smaller readahead window. The impact on interleaved sequential reads should be small, because for a long-running stream the small readahead window ramp-up phase is negligible. The context readahead mainly benefits clustered random reads on HDD, whose seek cost is pretty high. However, as SSDs are increasingly used for random read workloads, it's better for the context readahead to concentrate on interleaved sequential reads. Another SSD random read test from Miao:

# file size: 2GB
# read IO amount: 625MB
sysbench --test=fileio \
	--max-requests=10000 \
	--num-threads=1 \
	--file-num=1 \
	--file-block-size=64K \
	--file-test-mode=rndrd \
	--file-fsync-freq=0 \
	--file-fsync-end=off run

shows the performance of btrfs growing from 69MB/s to 121MB/s, and ext4 from 104MB/s to 121MB/s.
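A compact sketch of the tightened trigger; it mirrors the patched try_context_readahead() but simplifies away count_history_pages() and the offset handling, so treat the details as illustrative:

	/* history:  pages found in cache immediately before the current one
	 * req_size: size of the current read request, in pages */
	static int context_readahead_starts(int history, int req_size,
					    int max, int *ra_size, int *ra_async)
	{
		/* not enough history: still looks like pure random reads */
		if (history <= req_size)
			return 0;

		int size = history * 2;			/* modest ramp-up */
		*ra_size  = size + req_size < max ? size + req_size : max;
		*ra_async = 1;				/* small async window */
		return 1;
	}

For 1-page reads (req_size == 1): reading N+1 after N gives history 1, so no readahead; the third consecutive page N+2 gives history 2, and readahead finally starts.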
Signed-off-by: Wu Fengguang Tested-by: Tao Ma Tested-by: Miao Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/readahead.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index 829a77c62834..e4ed04149785 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -371,10 +371,10 @@ static int try_context_readahead(struct address_space *mapping, size = count_history_pages(mapping, ra, offset, max); /* - * no history pages: + * not enough history pages: * it could be a random read */ - if (!size) + if (size <= req_size) return 0; /* @@ -385,8 +385,8 @@ static int try_context_readahead(struct address_space *mapping, size *= 2; ra->start = offset; - ra->size = get_init_ra_size(size + req_size, max); - ra->async_size = ra->size; + ra->size = min(size + req_size, max); + ra->async_size = 1; return 1; } From 15610c86fa83ff778eb80d3cfaa71d6acceb628a Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:21:48 -0700 Subject: [PATCH 084/303] hugepage: mention libhugetlbfs in doc Explicitly mention/recommend using the libhugetlbfs test cases when changing related kernel code. Developers that are unaware of the project can easily miss this and introduce potential regressions that may or may not be caught by community review. Also do some cleanups that make the document visually easier to view at a first glance. Signed-off-by: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/hugetlbpage.txt | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt index 4ac359b7aa17..bdd4bb97fff7 100644 --- a/Documentation/vm/hugetlbpage.txt +++ b/Documentation/vm/hugetlbpage.txt @@ -165,6 +165,7 @@ which function as described above for the default huge page-sized case. Interaction of Task Memory Policy with Huge Page Allocation/Freeing +=================================================================== Whether huge pages are allocated and freed via the /proc interface or the /sysfs interface using the nr_hugepages_mempolicy attribute, the NUMA @@ -229,6 +230,7 @@ resulting effect on persistent huge page allocation is as follows: of huge pages over all on-lines nodes with memory. Per Node Hugepages Attributes +============================= A subset of the contents of the root huge page control directory in sysfs, described above, will be replicated under each the system device of each @@ -258,6 +260,7 @@ applied, from which node the huge page allocation will be attempted. Using Huge Pages +================ If the user applications are going to request huge pages using mmap system call, then it is required that system administrator mount a file system of @@ -296,20 +299,16 @@ calls, though the mount of filesystem will be required for using mmap calls without MAP_HUGETLB. For an example of how to use mmap with MAP_HUGETLB see map_hugetlb.c. 
-******************************************************************* +Examples +======== -/* - * map_hugetlb: see tools/testing/selftests/vm/map_hugetlb.c - */ +1) map_hugetlb: see tools/testing/selftests/vm/map_hugetlb.c -******************************************************************* +2) hugepage-shm: see tools/testing/selftests/vm/hugepage-shm.c -/* - * hugepage-shm: see tools/testing/selftests/vm/hugepage-shm.c - */ +3) hugepage-mmap: see tools/testing/selftests/vm/hugepage-mmap.c -******************************************************************* - -/* - * hugepage-mmap: see tools/testing/selftests/vm/hugepage-mmap.c - */ +4) The libhugetlbfs (http://libhugetlbfs.sourceforge.net) library provides a + wide range of userspace tools to help with huge page usability, environment + setup, and control. Furthermore it provides useful test cases that should be + used when modifying code to ensure no regressions are introduced. From 27356f54c8c32609ff45b4ed333bb64fb2eef374 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Wed, 11 Sep 2013 14:21:49 -0700 Subject: [PATCH 085/303] mm/hotplug: verify hotplug memory range add_memory() and remove_memory() can only handle a memory range aligned with section. There are problems when an unaligned range is added and then deleted as follows: - add_memory() with an unaligned range succeeds, but __add_pages() called from add_memory() adds a whole section of pages even though a given memory range is less than the section size. - remove_memory() to the added unaligned range hits BUG_ON() in __remove_pages(). This patch changes add_memory() and remove_memory() to check if a given memory range is aligned with section at the beginning. As the result, add_memory() fails with -EINVAL when a given range is unaligned, and does not add such memory range. This prevents remove_memory() to be called with an unaligned range as well. Note that remove_memory() has to use BUG_ON() since this function cannot fail. 
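The alignment test itself is simple arithmetic and can be sanity-checked in userspace. A sketch assuming x86_64-style constants (4KB pages, 128MB sections); the kernel expresses the same test with PAGE_SECTION_MASK and PAGES_PER_SECTION:

	#include <stdio.h>
	#include <stdint.h>

	#define PAGE_SHIFT        12
	#define SECTION_SIZE_BITS 27	/* 128MB sections, as on x86_64 */
	#define PAGES_PER_SECTION (1UL << (SECTION_SIZE_BITS - PAGE_SHIFT))

	static int range_is_section_aligned(uint64_t start, uint64_t size)
	{
		uint64_t start_pfn = start >> PAGE_SHIFT;
		uint64_t nr_pages  = size  >> PAGE_SHIFT;

		return nr_pages && !(start_pfn % PAGES_PER_SECTION) &&
		       !(nr_pages % PAGES_PER_SECTION);
	}

	int main(void)
	{
		/* 128MB at a 128MB boundary passes; a 64MB range is rejected */
		printf("%d\n", range_is_section_aligned(0x100000000ULL, 128ULL << 20));
		printf("%d\n", range_is_section_aligned(0x100000000ULL, 64ULL << 20));
		return 0;
	}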
[akpm@linux-foundation.org: avoid printk warnings] Signed-off-by: Toshi Kani Acked-by: KOSAKI Motohiro Reviewed-by: Tang Chen Reviewed-by: Wanpeng Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 46b489cacdd8..247d66675a91 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1070,6 +1070,23 @@ out: return ret; } +static int check_hotplug_memory_range(u64 start, u64 size) +{ + u64 start_pfn = start >> PAGE_SHIFT; + u64 nr_pages = size >> PAGE_SHIFT; + + /* Memory range must be aligned with section */ + if ((start_pfn & ~PAGE_SECTION_MASK) || + (nr_pages % PAGES_PER_SECTION) || (!nr_pages)) { + pr_err("Section-unaligned hotplug range: start 0x%llx, size 0x%llx\n", + (unsigned long long)start, + (unsigned long long)size); + return -EINVAL; + } + + return 0; +} + /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ int __ref add_memory(int nid, u64 start, u64 size) { @@ -1079,6 +1096,10 @@ int __ref add_memory(int nid, u64 start, u64 size) struct resource *res; int ret; + ret = check_hotplug_memory_range(start, size); + if (ret) + return ret; + lock_memory_hotplug(); res = register_memory_resource(start, size); @@ -1786,6 +1807,8 @@ void __ref remove_memory(int nid, u64 start, u64 size) { int ret; + BUG_ON(check_hotplug_memory_range(start, size)); + lock_memory_hotplug(); /* From 0f1cfe9d0d06fe44c2b310401d2db101968e8c58 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Wed, 11 Sep 2013 14:21:50 -0700 Subject: [PATCH 086/303] mm/hotplug: remove stop_machine() from try_offline_node() lock_device_hotplug() serializes hotplug & online/offline operations. The lock is held in common sysfs online/offline interfaces and ACPI hotplug code paths. And here are the code paths: - CPU & Mem online/offline via sysfs online store_online()->lock_device_hotplug() - Mem online via sysfs state: store_mem_state()->lock_device_hotplug() - ACPI CPU & Mem hot-add: acpi_scan_bus_device_check()->lock_device_hotplug() - ACPI CPU & Mem hot-delete: acpi_scan_hot_remove()->lock_device_hotplug() try_offline_node() off-lines a node if all memory sections and cpus are removed on the node. It is called from acpi_processor_remove() and acpi_memory_remove_memory()->remove_memory() paths, both of which are in the ACPI hotplug code. try_offline_node() calls stop_machine() to stop all cpus while checking all cpu status with the assumption that the caller is not protected from CPU hotplug or CPU online/offline operations. However, the caller is always serialized with lock_device_hotplug(). Also, the code needs to be properly serialized with a lock, not by stopping all cpus at a random place with stop_machine(). This patch removes the use of stop_machine() in try_offline_node() and adds comments to try_offline_node() and remove_memory() that lock_device_hotplug() is required. Signed-off-by: Toshi Kani Acked-by: Rafael J. 
Wysocki Cc: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Cc: Tang Chen Cc: Yasuaki Ishimatsu Cc: Wanpeng Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 247d66675a91..d595606728f9 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1695,9 +1695,8 @@ static int is_memblock_offlined_cb(struct memory_block *mem, void *arg) return ret; } -static int check_cpu_on_node(void *data) +static int check_cpu_on_node(pg_data_t *pgdat) { - struct pglist_data *pgdat = data; int cpu; for_each_present_cpu(cpu) { @@ -1712,10 +1711,9 @@ static int check_cpu_on_node(void *data) return 0; } -static void unmap_cpu_on_node(void *data) +static void unmap_cpu_on_node(pg_data_t *pgdat) { #ifdef CONFIG_ACPI_NUMA - struct pglist_data *pgdat = data; int cpu; for_each_possible_cpu(cpu) @@ -1724,10 +1722,11 @@ static void unmap_cpu_on_node(void *data) #endif } -static int check_and_unmap_cpu_on_node(void *data) +static int check_and_unmap_cpu_on_node(pg_data_t *pgdat) { - int ret = check_cpu_on_node(data); + int ret; + ret = check_cpu_on_node(pgdat); if (ret) return ret; @@ -1736,11 +1735,18 @@ static int check_and_unmap_cpu_on_node(void *data) * the cpu_to_node() now. */ - unmap_cpu_on_node(data); + unmap_cpu_on_node(pgdat); return 0; } -/* offline the node if all memory sections of this node are removed */ +/** + * try_offline_node + * + * Offline a node if all memory sections and cpus of the node are removed. + * + * NOTE: The caller must call lock_device_hotplug() to serialize hotplug + * and online/offline operations before this call. + */ void try_offline_node(int nid) { pg_data_t *pgdat = NODE_DATA(nid); @@ -1766,7 +1772,7 @@ void try_offline_node(int nid) return; } - if (stop_machine(check_and_unmap_cpu_on_node, pgdat, NULL)) + if (check_and_unmap_cpu_on_node(pgdat)) return; /* @@ -1803,6 +1809,13 @@ void try_offline_node(int nid) } EXPORT_SYMBOL(try_offline_node); +/** + * remove_memory + * + * NOTE: The caller must call lock_device_hotplug() to serialize hotplug + * and online/offline operations before this call, as required by + * try_offline_node(). + */ void __ref remove_memory(int nid, u64 start, u64 size) { int ret; From 4ef91848043679b272a1a5b8e2879acf696ba9e2 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:21:51 -0700 Subject: [PATCH 087/303] mm, hugetlb: protect reserved pages when soft offlining a hugepage Don't use the reserve pool when soft offlining a hugepage. Check we have free pages outside the reserve pool before we dequeue the huge page. Otherwise, we can steal other's reserve page. 
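As a worked example with illustrative numbers: if free_huge_pages is 2 and resv_huge_pages is 2, every free hugepage is already promised to an existing reservation, so the guard below evaluates false, the dequeue is skipped, and allocation falls back to alloc_buddy_huge_page(). The guard in isolation:

	/* Only dip into the free list when some free hugepages are not
	 * backing a reservation. */
	static int can_dequeue_unreserved(long free_huge_pages,
					  long resv_huge_pages)
	{
		return free_huge_pages - resv_huge_pages > 0;
	}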
Signed-off-by: Joonsoo Kim Reviewed-by: Aneesh Kumar Cc: Naoya Horiguchi Reviewed-by: Davidlohr Bueso Cc: David Gibson Cc: Wanpeng Li Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5b084c7b34c6..583db1948145 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -955,10 +955,11 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) */ struct page *alloc_huge_page_node(struct hstate *h, int nid) { - struct page *page; + struct page *page = NULL; spin_lock(&hugetlb_lock); - page = dequeue_huge_page_node(h, nid); + if (h->free_huge_pages - h->resv_huge_pages > 0) + page = dequeue_huge_page_node(h, nid); spin_unlock(&hugetlb_lock); if (!page)
From f522c3ac00a49128115f99a5fcb95a447601c1c3 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:21:53 -0700 Subject: [PATCH 088/303] mm, hugetlb: change variable name reservations to resv 'reservations' is a long name for a variable, and we already use 'resv_map' to refer to 'struct resv_map' elsewhere. To reduce confusion and improve readability, rename it. Signed-off-by: Joonsoo Kim Reviewed-by: Aneesh Kumar Cc: Naoya Horiguchi Reviewed-by: Davidlohr Bueso Cc: David Gibson Cc: Wanpeng Li Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 583db1948145..204550ae29c8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1115,9 +1115,9 @@ static long vma_needs_reservation(struct hstate *h, } else { long err; pgoff_t idx = vma_hugecache_offset(h, vma, addr); - struct resv_map *reservations = vma_resv_map(vma); + struct resv_map *resv = vma_resv_map(vma); - err = region_chg(&reservations->regions, idx, idx + 1); + err = region_chg(&resv->regions, idx, idx + 1); if (err < 0) return err; return 0; @@ -1135,10 +1135,10 @@ static void vma_commit_reservation(struct hstate *h, } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { pgoff_t idx = vma_hugecache_offset(h, vma, addr); - struct resv_map *reservations = vma_resv_map(vma); + struct resv_map *resv = vma_resv_map(vma); /* Mark this page used in the map. */ - region_add(&reservations->regions, idx, idx + 1); + region_add(&resv->regions, idx, idx + 1); } } @@ -2188,7 +2188,7 @@ out: static void hugetlb_vm_op_open(struct vm_area_struct *vma) { - struct resv_map *reservations = vma_resv_map(vma); + struct resv_map *resv = vma_resv_map(vma); /* * This new VMA should share its siblings reservation map if present. @@ -2198,34 +2198,34 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) * after this open call completes. It is therefore safe to take a * new reference here without additional locking.
*/ - if (reservations) - kref_get(&reservations->refs); + if (resv) + kref_get(&resv->refs); } static void resv_map_put(struct vm_area_struct *vma) { - struct resv_map *reservations = vma_resv_map(vma); + struct resv_map *resv = vma_resv_map(vma); - if (!reservations) + if (!resv) return; - kref_put(&reservations->refs, resv_map_release); + kref_put(&resv->refs, resv_map_release); } static void hugetlb_vm_op_close(struct vm_area_struct *vma) { struct hstate *h = hstate_vma(vma); - struct resv_map *reservations = vma_resv_map(vma); + struct resv_map *resv = vma_resv_map(vma); struct hugepage_subpool *spool = subpool_vma(vma); unsigned long reserve; unsigned long start; unsigned long end; - if (reservations) { + if (resv) { start = vma_hugecache_offset(h, vma, vma->vm_start); end = vma_hugecache_offset(h, vma, vma->vm_end); reserve = (end - start) - - region_count(&reservations->regions, start, end); + region_count(&resv->regions, start, end); resv_map_put(vma);
From 8bb3f12e7d4f7b043a7c5aa3831e72041e80dc4a Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:21:54 -0700 Subject: [PATCH 089/303] mm, hugetlb: fix subpool accounting handling If we allocate a hugepage with avoid_reserve, we don't dequeue a reserved one, so we should still charge the subpool counter in the avoid_reserve case. This patch implements that. Signed-off-by: Joonsoo Kim Cc: Aneesh Kumar Cc: Naoya Horiguchi Cc: Davidlohr Bueso Cc: David Gibson Cc: Wanpeng Li Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 204550ae29c8..dec5772c8c5c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1164,13 +1164,14 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, chg = vma_needs_reservation(h, vma, addr); if (chg < 0) return ERR_PTR(-ENOMEM); - if (chg) - if (hugepage_subpool_get_pages(spool, chg)) + if (chg || avoid_reserve) + if (hugepage_subpool_get_pages(spool, 1)) return ERR_PTR(-ENOSPC); ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); if (ret) { - hugepage_subpool_put_pages(spool, chg); + if (chg || avoid_reserve) + hugepage_subpool_put_pages(spool, 1); return ERR_PTR(-ENOSPC); } spin_lock(&hugetlb_lock); @@ -1182,7 +1183,8 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); - hugepage_subpool_put_pages(spool, chg); + if (chg || avoid_reserve) + hugepage_subpool_put_pages(spool, 1); return ERR_PTR(-ENOSPC); } spin_lock(&hugetlb_lock);
From 5944d0116c773319a48ea6812d1891aa6d0bbbbf Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:21:55 -0700 Subject: [PATCH 090/303] mm, hugetlb: remove useless check about mapping type is_vma_resv_set(vma, HPAGE_RESV_OWNER) implies that this mapping is private, so we don't need to check whether the mapping is shared or not. This patch is just a clean-up. Signed-off-by: Joonsoo Kim Cc: Aneesh Kumar Cc: Naoya Horiguchi Reviewed-by: Davidlohr Bueso Cc: David Gibson Cc: Wanpeng Li Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dec5772c8c5c..f6347ec4fd0a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2564,8 +2564,7 @@ retry_avoidcopy: * at the time of fork() could consume its reserves on COW instead * of the full address range.
*/ - if (!(vma->vm_flags & VM_MAYSHARE) && - is_vma_resv_set(vma, HPAGE_RESV_OWNER) && + if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && old_page != pagecache_page) outside_reserve = 1;
From 8312034f3604bc0339c40545c538116f4ddad152 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:21:57 -0700 Subject: [PATCH 091/303] mm, hugetlb: grab a page_table_lock after page_cache_release We don't need to hold the page_table_lock while we release a page, so defer re-taking the page_table_lock until after the pages are released. Signed-off-by: Joonsoo Kim Reviewed-by: Naoya Horiguchi Cc: Aneesh Kumar Reviewed-by: Davidlohr Bueso Cc: David Gibson Cc: Wanpeng Li Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f6347ec4fd0a..5bf6468a8862 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2647,10 +2647,11 @@ retry_avoidcopy: } spin_unlock(&mm->page_table_lock); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - /* Caller expects lock to be held */ - spin_lock(&mm->page_table_lock); page_cache_release(new_page); page_cache_release(old_page); + + /* Caller expects lock to be held */ + spin_lock(&mm->page_table_lock); return 0; }
From 07443a85ad90c7b62fbe11dcd3d6a1de1e10516f Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:21:58 -0700 Subject: [PATCH 092/303] mm, hugetlb: return a reserved page to a reserved pool if failed If we fail with a reserved page, just calling put_page() is not sufficient, because put_page() invokes free_huge_page() as its last step, which doesn't know whether the page came from the reserved pool, so it does nothing with the reserve count. This leaves the reserve count lower than it should be, because the count was already decremented in dequeue_huge_page_vma(). This patch fixes the situation.
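The bookkeeping can be pictured with a minimal sketch; the names are stand-ins (in the patch the tag is the page's Private flag and the counter is h->resv_huge_pages):

	struct hpage {
		int from_reserve;	/* stands in for PagePrivate */
	};

	static long resv_huge_pages;

	/* Dequeue from the reserve: tag the page and drop the counter. */
	static void dequeue_reserved(struct hpage *page)
	{
		page->from_reserve = 1;
		resv_huge_pages--;
	}

	/* Final free: if the tag is still set, the fault never completed
	 * (mapping failed), so give the reservation back. */
	static void free_huge_page(struct hpage *page)
	{
		if (page->from_reserve) {
			page->from_reserve = 0;
			resv_huge_pages++;
		}
	}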
Signed-off-by: Joonsoo Kim Cc: Aneesh Kumar Cc: Naoya Horiguchi Cc: Davidlohr Bueso Cc: David Gibson Cc: Wanpeng Li Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5bf6468a8862..06315560bd23 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -572,6 +572,7 @@ retry_cpuset: if (!vma_has_reserves(vma, chg)) break; + SetPagePrivate(page); h->resv_huge_pages--; break; } @@ -629,15 +630,20 @@ static void free_huge_page(struct page *page) int nid = page_to_nid(page); struct hugepage_subpool *spool = (struct hugepage_subpool *)page_private(page); + bool restore_reserve; set_page_private(page, 0); page->mapping = NULL; BUG_ON(page_count(page)); BUG_ON(page_mapcount(page)); + restore_reserve = PagePrivate(page); spin_lock(&hugetlb_lock); hugetlb_cgroup_uncharge_page(hstate_index(h), pages_per_huge_page(h), page); + if (restore_reserve) + h->resv_huge_pages++; + if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { /* remove the page from active list */ list_del(&page->lru); @@ -2636,6 +2642,8 @@ retry_avoidcopy: spin_lock(&mm->page_table_lock); ptep = huge_pte_offset(mm, address & huge_page_mask(h)); if (likely(pte_same(huge_ptep_get(ptep), pte))) { + ClearPagePrivate(new_page); + /* Break COW */ huge_ptep_clear_flush(vma, address, ptep); set_huge_pte_at(mm, address, ptep, @@ -2747,6 +2755,7 @@ retry: goto retry; goto out; } + ClearPagePrivate(page); spin_lock(&inode->i_lock); inode->i_blocks += blocks_per_huge_page(h); @@ -2793,8 +2802,10 @@ retry: if (!huge_pte_none(huge_ptep_get(ptep))) goto backout; - if (anon_rmap) + if (anon_rmap) { + ClearPagePrivate(page); hugepage_add_new_anon_rmap(page, vma, address); + } else page_dup_rmap(page); new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
From 31caf665e666b51fe36efd1e54031ed29e86c0b4 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Sep 2013 14:21:59 -0700 Subject: [PATCH 093/303] mm: migrate: make core migration code aware of hugepage Currently hugepage migration is available only for soft offlining, but it's also useful for some other users of page migration (clearly so, because hugepage users should be able to enjoy the benefits of mempolicy and memory hotplug.) So this patchset tries to extend such users to support hugepage migration. The target of this patchset is to enable hugepage migration for NUMA related system calls (migrate_pages(2), move_pages(2), and mbind(2)), and memory hotplug. This patchset does not add hugepage migration for memory compaction, because users of memory compaction mainly expect to construct thp by arranging raw pages, and there's little or no need to compact hugepages. CMA, another user of page migration, could benefit from hugepage migration, but support is not enabled for now (simply for lack of testing and CMA expertise.) Migration of non-pmd-based hugepages (for example, 1GB hugepages on x86_64, or hugepages on architectures like ia64) is not enabled for now (again, for lack of testing.) As for how these are achieved, I extended the API (migrate_pages()) to handle hugepages (patches 1 and 2) and adjusted the code of each caller to check for and collect movable hugepages (patches 3-7). The remaining two patches are miscellaneous fixes to avoid unexpected behavior: patch 8 makes sure that we only migrate pmd-based hugepages, and patch 9 chooses an appropriate zone for hugepage allocation.
My testing is mainly functional, simply kicking hugepage migration via each entry point and confirming that the migration is done correctly. Test code is available here: git://github.com/Naoya-Horiguchi/test_hugepage_migration_extension.git And I always run the libhugetlbfs tests when changing hugetlbfs code. With this patchset, no regression was found in the tests. This patch (of 9): Before enabling each user of page migration to support hugepage, this patch enables the list of pages for migration to link not only LRU pages, but also hugepages. As a result, putback_movable_pages() and migrate_pages() can handle both LRU pages and hugepages. Signed-off-by: Naoya Horiguchi Acked-by: Andi Kleen Reviewed-by: Wanpeng Li Acked-by: Hillf Danton Cc: Mel Gorman Cc: Hugh Dickins Cc: KOSAKI Motohiro Cc: Michal Hocko Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 4 ++++ mm/hugetlb.c | 23 ++++++++++++++++++++++- mm/migrate.c | 10 +++++++++- 3 files changed, 35 insertions(+), 2 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index c2b1801a160b..bc8d8370cd0d 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -66,6 +66,8 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to, vm_flags_t vm_flags); void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); int dequeue_hwpoisoned_huge_page(struct page *page); +bool isolate_huge_page(struct page *page, struct list_head *list); +void putback_active_hugepage(struct page *page); void copy_huge_page(struct page *dst, struct page *src); #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE @@ -134,6 +136,8 @@ static inline int dequeue_hwpoisoned_huge_page(struct page *page) { return 0; } +#define isolate_huge_page(p, l) false +#define putback_active_hugepage(p) do {} while (0) static inline void copy_huge_page(struct page *dst, struct page *src) { } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 06315560bd23..e51723866fb1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -48,7 +48,8 @@ static unsigned long __initdata default_hstate_max_huge_pages; static unsigned long __initdata default_hstate_size; /* - * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages + * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, + * free_huge_pages, and surplus_huge_pages.
*/ DEFINE_SPINLOCK(hugetlb_lock); @@ -3422,3 +3423,23 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage) return ret; } #endif + +bool isolate_huge_page(struct page *page, struct list_head *list) +{ + VM_BUG_ON(!PageHead(page)); + if (!get_page_unless_zero(page)) + return false; + spin_lock(&hugetlb_lock); + list_move_tail(&page->lru, list); + spin_unlock(&hugetlb_lock); + return true; +} + +void putback_active_hugepage(struct page *page) +{ + VM_BUG_ON(!PageHead(page)); + spin_lock(&hugetlb_lock); + list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); + spin_unlock(&hugetlb_lock); + put_page(page); +} diff --git a/mm/migrate.c b/mm/migrate.c index 6f0c24438bba..b44a067fee10 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -100,6 +100,10 @@ void putback_movable_pages(struct list_head *l) struct page *page2; list_for_each_entry_safe(page, page2, l, lru) { + if (unlikely(PageHuge(page))) { + putback_active_hugepage(page); + continue; + } list_del(&page->lru); dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); @@ -1025,7 +1029,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, list_for_each_entry_safe(page, page2, from, lru) { cond_resched(); - rc = unmap_and_move(get_new_page, private, + if (PageHuge(page)) + rc = unmap_and_move_huge_page(get_new_page, + private, page, pass > 2, mode); + else + rc = unmap_and_move(get_new_page, private, page, pass > 2, mode); switch(rc) { From b8ec1cee5a4375c1244b85709138a2eac2d89cb6 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Sep 2013 14:22:01 -0700 Subject: [PATCH 094/303] mm: soft-offline: use migrate_pages() instead of migrate_huge_page() Currently migrate_huge_page() takes a pointer to a hugepage to be migrated as an argument, instead of taking a pointer to the list of hugepages to be migrated. This behavior was introduced in commit 189ebff28 ("hugetlb: simplify migrate_huge_page()"), and was OK because until now hugepage migration is enabled only for soft-offlining which migrates only one hugepage in a single call. But the situation will change in the later patches in this series which enable other users of page migration to support hugepage migration. They can kick migration for both of normal pages and hugepages in a single call, so we need to go back to original implementation which uses linked lists to collect the hugepages to be migrated. With this patch, soft_offline_huge_page() switches to use migrate_pages(), and migrate_huge_page() is not used any more. So let's remove it. 
Signed-off-by: Naoya Horiguchi Acked-by: Andi Kleen Reviewed-by: Wanpeng Li Acked-by: Hillf Danton Cc: Mel Gorman Cc: Hugh Dickins Cc: KOSAKI Motohiro Cc: Michal Hocko Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 5 ----- mm/memory-failure.c | 15 ++++++++++++--- mm/migrate.c | 28 ++-------------------------- 3 files changed, 14 insertions(+), 34 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index a405d3dc0f61..6fe521420631 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -41,8 +41,6 @@ extern int migrate_page(struct address_space *, struct page *, struct page *, enum migrate_mode); extern int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, enum migrate_mode mode, int reason); -extern int migrate_huge_page(struct page *, new_page_t x, - unsigned long private, enum migrate_mode mode); extern int fail_migrate_page(struct address_space *, struct page *, struct page *); @@ -62,9 +60,6 @@ static inline void putback_movable_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, enum migrate_mode mode, int reason) { return -ENOSYS; } -static inline int migrate_huge_page(struct page *page, new_page_t x, - unsigned long private, enum migrate_mode mode) - { return -ENOSYS; } static inline int migrate_prep(void) { return -ENOSYS; } static inline int migrate_prep_local(void) { return -ENOSYS; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index d84c5e5331bb..e05ed31c0f61 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1470,6 +1470,7 @@ static int soft_offline_huge_page(struct page *page, int flags) int ret; unsigned long pfn = page_to_pfn(page); struct page *hpage = compound_head(page); + LIST_HEAD(pagelist); /* * This double-check of PageHWPoison is to avoid the race with @@ -1485,12 +1486,20 @@ static int soft_offline_huge_page(struct page *page, int flags) unlock_page(hpage); /* Keep page count to indicate a given hugepage is isolated. */ - ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, - MIGRATE_SYNC); - put_page(hpage); + list_move(&hpage->lru, &pagelist); + ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, + MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { pr_info("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); + /* + * We know that soft_offline_huge_page() tries to migrate + * only one hugepage pointed to by hpage, so we need not + * run through the pagelist here. 
+ */ + putback_active_hugepage(hpage); + if (ret > 0) + ret = -EIO; } else { set_page_hwpoison_huge_page(hpage); dequeue_hwpoisoned_huge_page(hpage); diff --git a/mm/migrate.c b/mm/migrate.c index b44a067fee10..3ec47d3394c8 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -979,6 +979,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, unlock_page(hpage); out: + if (rc != -EAGAIN) + putback_active_hugepage(hpage); put_page(new_hpage); if (result) { if (rc) @@ -1066,32 +1068,6 @@ out: return rc; } -int migrate_huge_page(struct page *hpage, new_page_t get_new_page, - unsigned long private, enum migrate_mode mode) -{ - int pass, rc; - - for (pass = 0; pass < 10; pass++) { - rc = unmap_and_move_huge_page(get_new_page, private, - hpage, pass > 2, mode); - switch (rc) { - case -ENOMEM: - goto out; - case -EAGAIN: - /* try again */ - cond_resched(); - break; - case MIGRATEPAGE_SUCCESS: - goto out; - default: - rc = -EIO; - goto out; - } - } -out: - return rc; -} - #ifdef CONFIG_NUMA /* * Move a list of individual pages From e2d8cf405525d83e6ca42969be460f94b0339798 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Sep 2013 14:22:03 -0700 Subject: [PATCH 095/303] migrate: add hugepage migration code to migrate_pages() Extend check_range() to handle vma with VM_HUGETLB set. We will be able to migrate hugepage with migrate_pages(2) after applying the enablement patch which comes later in this series. Note that for larger hugepages (covered by pud entries, 1GB for x86_64 for example), we simply skip it now. Note that using pmd_huge/pud_huge assumes that hugepages are pointed to by pmd/pud. This is not true in some architectures implementing hugepage with other mechanisms like ia64, but it's OK because pmd_huge/pud_huge simply return 0 in such arch and page walker simply ignores such hugepages. Signed-off-by: Naoya Horiguchi Acked-by: Andi Kleen Reviewed-by: Wanpeng Li Acked-by: Hillf Danton Cc: Mel Gorman Cc: Hugh Dickins Cc: KOSAKI Motohiro Cc: Michal Hocko Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 44 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 27022ca890f8..4626be621e74 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -515,6 +515,30 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, return addr != end; } +static void check_hugetlb_pmd_range(struct vm_area_struct *vma, pmd_t *pmd, + const nodemask_t *nodes, unsigned long flags, + void *private) +{ +#ifdef CONFIG_HUGETLB_PAGE + int nid; + struct page *page; + + spin_lock(&vma->vm_mm->page_table_lock); + page = pte_page(huge_ptep_get((pte_t *)pmd)); + nid = page_to_nid(page); + if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) + goto unlock; + /* With MPOL_MF_MOVE, we migrate only unshared hugepage. 
*/ + if (flags & (MPOL_MF_MOVE_ALL) || + (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) + isolate_huge_page(page, private); +unlock: + spin_unlock(&vma->vm_mm->page_table_lock); +#else + BUG(); +#endif +} + static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, @@ -526,6 +550,13 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + if (!pmd_present(*pmd)) + continue; + if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) { + check_hugetlb_pmd_range(vma, pmd, nodes, + flags, private); + continue; + } split_huge_page_pmd(vma, addr, pmd); if (pmd_none_or_trans_huge_or_clear_bad(pmd)) continue; @@ -547,6 +578,8 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); + if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) + continue; if (pud_none_or_clear_bad(pud)) continue; if (check_pmd_range(vma, pud, addr, next, nodes, @@ -638,9 +671,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, return ERR_PTR(-EFAULT); } - if (is_vm_hugetlb_page(vma)) - goto next; - if (flags & MPOL_MF_LAZY) { change_prot_numa(vma, start, endvma); goto next; } @@ -993,7 +1023,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, static struct page *new_node_page(struct page *page, unsigned long node, int **x) { - return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); + if (PageHuge(page)) + return alloc_huge_page_node(page_hstate(compound_head(page)), + node); + else + return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); } /* @@ -1023,7 +1057,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, err = migrate_pages(&pagelist, new_node_page, dest, MIGRATE_SYNC, MR_SYSCALL); if (err) - putback_lru_pages(&pagelist); + putback_movable_pages(&pagelist); } return err; From e632a938d914d271bec26e570d36c755a1e35e4c Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Sep 2013 14:22:04 -0700 Subject: [PATCH 096/303] mm: migrate: add hugepage migration code to move_pages() Extend move_pages() to handle vmas with VM_HUGETLB set. We will be able to migrate hugepages with move_pages(2) after applying the enablement patch which comes later in this series. We avoid taking a refcount on the tail pages of a hugepage because, unlike THP, a hugepage is not split, so we need not care about races with splitting. And migration of larger (1GB for x86_64) hugepages is not enabled.
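For illustration only (this sketch is not part of the patch; the helper name, node number, and the assumption that addr points into an existing SHM_HUGETLB or hugetlbfs mapping are all hypothetical), the userspace side that this series works towards looks roughly like:

	#include <numaif.h>	/* move_pages(2); link with -lnuma */
	#include <stdio.h>

	/* Hypothetical helper: move the hugepage backing addr to node 1. */
	static int move_one_hugepage(void *addr)
	{
		void *pages[1] = { addr };	/* hugepage-aligned address */
		int nodes[1] = { 1 };		/* assumed destination node */
		int status[1];

		if (move_pages(0 /* self */, 1, pages, nodes, status,
			       MPOL_MF_MOVE) < 0) {
			perror("move_pages");
			return -1;
		}
		printf("page is now on node %d\n", status[0]);
		return 0;
	}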
Signed-off-by: Naoya Horiguchi Acked-by: Andi Kleen Reviewed-by: Wanpeng Li Cc: Hillf Danton Cc: Mel Gorman Cc: Hugh Dickins Cc: KOSAKI Motohiro Cc: Michal Hocko Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 17 +++++++++++++++-- mm/migrate.c | 13 +++++++++++-- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index c1c6d59b2b03..2b73dbde2274 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1481,7 +1481,8 @@ struct page *follow_page_mask(struct vm_area_struct *vma, if (pud_none(*pud)) goto no_page_table; if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { - BUG_ON(flags & FOLL_GET); + if (flags & FOLL_GET) + goto out; page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); goto out; } @@ -1492,8 +1493,20 @@ struct page *follow_page_mask(struct vm_area_struct *vma, if (pmd_none(*pmd)) goto no_page_table; if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { - BUG_ON(flags & FOLL_GET); page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); + if (flags & FOLL_GET) { + /* + * Refcount on tail pages are not well-defined and + * shouldn't be taken. The caller should handle a NULL + * return when trying to follow tail pages. + */ + if (PageHead(page)) + get_page(page); + else { + page = NULL; + goto out; + } + } goto out; } if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) diff --git a/mm/migrate.c b/mm/migrate.c index 3ec47d3394c8..d3137375fa80 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1092,7 +1092,11 @@ static struct page *new_page_node(struct page *p, unsigned long private, *result = &pm->status; - return alloc_pages_exact_node(pm->node, + if (PageHuge(p)) + return alloc_huge_page_node(page_hstate(compound_head(p)), + pm->node); + else + return alloc_pages_exact_node(pm->node, GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); } @@ -1152,6 +1156,11 @@ static int do_move_page_to_node_array(struct mm_struct *mm, !migrate_all) goto put_and_set; + if (PageHuge(page)) { + isolate_huge_page(page, &pagelist); + goto put_and_set; + } + err = isolate_lru_page(page); if (!err) { list_add_tail(&page->lru, &pagelist); @@ -1174,7 +1183,7 @@ set_status: err = migrate_pages(&pagelist, new_page_node, (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); if (err) - putback_lru_pages(&pagelist); + putback_movable_pages(&pagelist); } up_read(&mm->mmap_sem); From 74060e4d78795c7c43805133cb717d82533d4e0d Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Sep 2013 14:22:06 -0700 Subject: [PATCH 097/303] mm: mbind: add hugepage migration code to mbind() Extend do_mbind() to handle vma with VM_HUGETLB set. We will be able to migrate hugepage with mbind(2) after applying the enablement patch which comes later in this series. 
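As a hedged userspace sketch (not part of the patch; the helper name and node choice are assumptions), this is the kind of mbind(2) call that becomes meaningful for hugetlbfs mappings once the enablement patch later in the series is applied:

	#include <numaif.h>

	/* Hypothetical sketch: rebind a hugetlbfs mapping of length len at
	 * addr to node 0, migrating pages that were already faulted in. */
	static int bind_hugepages_to_node0(void *addr, unsigned long len)
	{
		unsigned long nodemask = 1UL << 0;	/* node 0 only */

		return mbind(addr, len, MPOL_BIND, &nodemask,
			     8 * sizeof(nodemask), MPOL_MF_MOVE);
	}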
Signed-off-by: Naoya Horiguchi Acked-by: Andi Kleen Reviewed-by: Wanpeng Li Acked-by: Hillf Danton Cc: Mel Gorman Cc: Hugh Dickins Cc: KOSAKI Motohiro Cc: Michal Hocko Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 3 +++ mm/hugetlb.c | 14 ++++++++++++++ mm/mempolicy.c | 4 +++- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index bc8d8370cd0d..d1db00790a84 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -265,6 +265,8 @@ struct huge_bootmem_page { }; struct page *alloc_huge_page_node(struct hstate *h, int nid); +struct page *alloc_huge_page_noerr(struct vm_area_struct *vma, + unsigned long addr, int avoid_reserve); /* arch callback */ int __init alloc_bootmem_huge_page(struct hstate *h); @@ -378,6 +380,7 @@ static inline pgoff_t basepage_index(struct page *page) #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; #define alloc_huge_page_node(h, nid) NULL +#define alloc_huge_page_noerr(v, a, r) NULL #define alloc_bootmem_huge_page(h) NULL #define hstate_file(f) NULL #define hstate_sizelog(s) NULL diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e51723866fb1..d37b3b95c439 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1207,6 +1207,20 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, return page; } +/* + * alloc_huge_page()'s wrapper which simply returns the page if allocation + * succeeds, otherwise NULL. This function is called from new_vma_page(), + * where no ERR_VALUE is expected to be returned. + */ +struct page *alloc_huge_page_noerr(struct vm_area_struct *vma, + unsigned long addr, int avoid_reserve) +{ + struct page *page = alloc_huge_page(vma, addr, avoid_reserve); + if (IS_ERR(page)) + page = NULL; + return page; +} + int __weak alloc_bootmem_huge_page(struct hstate *h) { struct huge_bootmem_page *m; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4626be621e74..c7c359213ae1 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1192,6 +1192,8 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int * vma = vma->vm_next; } + if (PageHuge(page)) + return alloc_huge_page_noerr(vma, address, 1); /* * if !vma, alloc_page_vma() will use task or system default policy */ @@ -1302,7 +1304,7 @@ static long do_mbind(unsigned long start, unsigned long len, (unsigned long)vma, MIGRATE_SYNC, MR_MEMPOLICY_MBIND); if (nr_failed) - putback_lru_pages(&pagelist); + putback_movable_pages(&pagelist); } if (nr_failed && (flags & MPOL_MF_STRICT)) From 71ea2efb1e936a127690a0a540b3a6162f95e48a Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Sep 2013 14:22:08 -0700 Subject: [PATCH 098/303] mm: migrate: remove VM_HUGETLB from vma flag check in vma_migratable() Enable hugepage migration from migrate_pages(2), move_pages(2), and mbind(2). 
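For illustration (a minimal sketch, not part of the patch; the helper name and node numbers are assumptions), with VM_HUGETLB vmas now passing vma_migratable(), migrate_pages(2) can move a whole process, hugepages included:

	#include <numaif.h>

	/* Hypothetical sketch: migrate all of task pid's pages from
	 * node 0 to node 1. */
	static long migrate_task_to_node1(int pid)
	{
		unsigned long old_nodes = 1UL << 0;
		unsigned long new_nodes = 1UL << 1;

		return migrate_pages(pid, 8 * sizeof(unsigned long),
				     &old_nodes, &new_nodes);
	}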
Signed-off-by: Naoya Horiguchi Acked-by: Hillf Danton Acked-by: Andi Kleen Reviewed-by: Wanpeng Li Cc: Mel Gorman Cc: Hugh Dickins Cc: KOSAKI Motohiro Cc: Michal Hocko Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index b2f897789838..da6716b9e3fe 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -173,7 +173,7 @@ extern int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol); /* Check if a vma is migratable */ static inline int vma_migratable(struct vm_area_struct *vma) { - if (vma->vm_flags & (VM_IO | VM_HUGETLB | VM_PFNMAP)) + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) return 0; /* * Migration allocates pages in the highest zone. If we cannot From c8721bbbdd36382de51cd6b7a56322e0acca2414 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Sep 2013 14:22:09 -0700 Subject: [PATCH 099/303] mm: memory-hotplug: enable memory hotplug to handle hugepage Until now we couldn't offline memory blocks which contain hugepages because a hugepage was considered an unmovable page. But now, with this patch series, a hugepage has become movable, so by using hugepage migration we can offline such memory blocks. What's different from other users of hugepage migration is that we need to decompose all the hugepages inside the target memory block into free buddy pages after hugepage migration, because otherwise free hugepages remaining in the memory block get in the way of memory offlining. For this reason we introduce the new functions dissolve_free_huge_page() and dissolve_free_huge_pages(). Other than that, what this patch does is straightforward: it adds hugepage migration code, that is, hugepage handling to the functions which scan over pfns and collect hugepages to be migrated, and a hugepage allocation function to alloc_migrate_target(). As for larger hugepages (1GB for x86_64), it's not easy to do hotremove over them because they are larger than a memory block. So for now we simply leave them to fail as-is.
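To illustrate the user-visible effect (a sketch only; the block number is made up and the helper is hypothetical, but the sysfs state file is the standard memory-hotplug interface), offlining a memory block that holds hugepages can now succeed: in-use hugepages are migrated away and free ones dissolved.

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	/* Hypothetical sketch: request offlining of memory block `block`. */
	static int offline_memory_block(int block)
	{
		char path[64];
		int fd, ret;

		snprintf(path, sizeof(path),
			 "/sys/devices/system/memory/memory%d/state", block);
		fd = open(path, O_WRONLY);
		if (fd < 0)
			return -1;
		ret = write(fd, "offline", 7) == 7 ? 0 : -1;
		close(fd);
		return ret;
	}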
[yongjun_wei@trendmicro.com.cn: remove duplicated include] Signed-off-by: Naoya Horiguchi Acked-by: Andi Kleen Cc: Hillf Danton Cc: Wanpeng Li Cc: Mel Gorman Cc: Hugh Dickins Cc: KOSAKI Motohiro Cc: Michal Hocko Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Signed-off-by: Wei Yongjun Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 6 ++++ mm/hugetlb.c | 71 +++++++++++++++++++++++++++++++++++++++-- mm/memory_hotplug.c | 42 ++++++++++++++++++++---- mm/page_alloc.c | 11 +++++++ mm/page_isolation.c | 14 ++++++++ 5 files changed, 135 insertions(+), 9 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d1db00790a84..2e02c4ed1035 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -68,6 +68,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); int dequeue_hwpoisoned_huge_page(struct page *page); bool isolate_huge_page(struct page *page, struct list_head *list); void putback_active_hugepage(struct page *page); +bool is_hugepage_active(struct page *page); void copy_huge_page(struct page *dst, struct page *src); #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE @@ -138,6 +139,7 @@ static inline int dequeue_hwpoisoned_huge_page(struct page *page) #define isolate_huge_page(p, l) false #define putback_active_hugepage(p) do {} while (0) +#define is_hugepage_active(x) false static inline void copy_huge_page(struct page *dst, struct page *src) { } @@ -377,6 +379,9 @@ static inline pgoff_t basepage_index(struct page *page) return __basepage_index(page); } +extern void dissolve_free_huge_pages(unsigned long start_pfn, + unsigned long end_pfn); + #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; #define alloc_huge_page_node(h, nid) NULL @@ -403,6 +408,7 @@ static inline pgoff_t basepage_index(struct page *page) { return page->index; } +#define dissolve_free_huge_pages(s, e) do {} while (0) #endif /* CONFIG_HUGETLB_PAGE */ #endif /* _LINUX_HUGETLB_H */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d37b3b95c439..fb4293b93fd0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -522,9 +523,15 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid) { struct page *page; - if (list_empty(&h->hugepage_freelists[nid])) + list_for_each_entry(page, &h->hugepage_freelists[nid], lru) + if (!is_migrate_isolate_page(page)) + break; + /* + * if 'non-isolated free hugepage' not found on the list, + * the allocation fails. + */ + if (&h->hugepage_freelists[nid] == &page->lru) return NULL; - page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); list_move(&page->lru, &h->hugepage_activelist); set_page_refcounted(page); h->free_huge_pages--; @@ -878,6 +885,44 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, return ret; } +/* + * Dissolve a given free hugepage into free buddy pages. This function does + * nothing for in-use (including surplus) hugepages. + */ +static void dissolve_free_huge_page(struct page *page) +{ + spin_lock(&hugetlb_lock); + if (PageHuge(page) && !page_count(page)) { + struct hstate *h = page_hstate(page); + int nid = page_to_nid(page); + list_del(&page->lru); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + update_and_free_page(h, page); + } + spin_unlock(&hugetlb_lock); +} + +/* + * Dissolve free hugepages in a given pfn range. Used by memory hotplug to + * make specified memory blocks removable from the system. 
+ * Note that start_pfn should aligned with (minimum) hugepage size. + */ +void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned int order = 8 * sizeof(void *); + unsigned long pfn; + struct hstate *h; + + /* Set scan step to minimum hugepage size */ + for_each_hstate(h) + if (order > huge_page_order(h)) + order = huge_page_order(h); + VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << order)); + for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) + dissolve_free_huge_page(pfn_to_page(pfn)); +} + static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) { struct page *page; @@ -3457,3 +3502,25 @@ void putback_active_hugepage(struct page *page) spin_unlock(&hugetlb_lock); put_page(page); } + +bool is_hugepage_active(struct page *page) +{ + VM_BUG_ON(!PageHuge(page)); + /* + * This function can be called for a tail page because the caller, + * scan_movable_pages, scans through a given pfn-range which typically + * covers one memory block. In systems using gigantic hugepage (1GB + * for x86_64,) a hugepage is larger than a memory block, and we don't + * support migrating such large hugepages for now, so return false + * when called for tail pages. + */ + if (PageTail(page)) + return false; + /* + * Refcount of a hwpoisoned hugepages is 1, but they are not active, + * so we should return false for them. + */ + if (unlikely(PageHWPoison(page))) + return false; + return page_count(page) > 0; +} diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index d595606728f9..0eb1a1df649d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -30,6 +30,7 @@ #include #include #include +#include #include @@ -1230,10 +1231,12 @@ static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) } /* - * Scanning pfn is much easier than scanning lru list. - * Scan pfn from start to end and Find LRU page. + * Scan pfn range [start,end) to find movable/migratable pages (LRU pages + * and hugepages). We scan pfn because it's much easier than scanning over + * linked list. This function returns the pfn of the first found movable + * page if it's found, otherwise 0. 
*/ -static unsigned long scan_lru_pages(unsigned long start, unsigned long end) +static unsigned long scan_movable_pages(unsigned long start, unsigned long end) { unsigned long pfn; struct page *page; @@ -1242,6 +1245,13 @@ page = pfn_to_page(pfn); if (PageLRU(page)) return pfn; + if (PageHuge(page)) { + if (is_hugepage_active(page)) + return pfn; + else + pfn = round_up(pfn + 1, + 1 << compound_order(page)) - 1; + } } } return 0; @@ -1262,6 +1272,19 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (!pfn_valid(pfn)) continue; page = pfn_to_page(pfn); + + if (PageHuge(page)) { + struct page *head = compound_head(page); + pfn = page_to_pfn(head) + (1 << compound_order(head)) - 1; + if (compound_order(head) > PFN_SECTION_SHIFT) { + ret = -EBUSY; + break; + } + if (isolate_huge_page(page, &source)) + move_pages -= 1 << compound_order(head); + continue; + } + if (!get_page_unless_zero(page)) continue; /* @@ -1294,7 +1317,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) } if (!list_empty(&source)) { if (not_managed) { - putback_lru_pages(&source); + putback_movable_pages(&source); goto out; } @@ -1305,7 +1328,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) ret = migrate_pages(&source, alloc_migrate_target, 0, MIGRATE_SYNC, MR_MEMORY_HOTPLUG); if (ret) - putback_lru_pages(&source); + putback_movable_pages(&source); } out: return ret; @@ -1548,8 +1571,8 @@ repeat: drain_all_pages(); } - pfn = scan_lru_pages(start_pfn, end_pfn); - if (pfn) { /* We have page on LRU */ + pfn = scan_movable_pages(start_pfn, end_pfn); + if (pfn) { /* We have movable pages */ ret = do_migrate_range(pfn, end_pfn); if (!ret) { drain = 1; @@ -1568,6 +1591,11 @@ repeat: yield(); /* drain pcp pages, this is synchronous. */ drain_all_pages(); + /* + * dissolve free hugepages in the memory block before doing offlining + * actually in order to make hugetlbfs's object counting consistent. + */ + dissolve_free_huge_pages(start_pfn, end_pfn); /* check again */ offlined_pages = check_pages_isolated(start_pfn, end_pfn); if (offlined_pages < 0) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7c3f8d7e2d8e..f7cc08dad26a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6008,6 +6008,17 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, continue; page = pfn_to_page(check); + + /* + * Hugepages are not in LRU lists, but they're movable. + * We need not scan over tail pages bacause we don't + * handle each tail page individually in migration. + */ + if (PageHuge(page)) { + iter = round_up(iter + 1, 1 << compound_order(page)) - 1; + continue; + } + diff --git a/mm/page_isolation.c b/mm/page_isolation.c --- a/mm/page_isolation.c +++ b/mm/page_isolation.c #include #include #include +#include #include "internal.h" int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages) @@ -252,6 +253,19 @@ struct page *alloc_migrate_target(struct page *page, unsigned long private, { gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; + /* + * TODO: allocate a destination hugepage from a nearest neighbor node, + * accordance with memory policy of the user process if possible. For + * now as a simple work-around, we use the next node for destination.
+ */ + if (PageHuge(page)) { + nodemask_t src = nodemask_of_node(page_to_nid(page)); + nodemask_t dst; + nodes_complement(dst, src); + return alloc_huge_page_node(page_hstate(compound_head(page)), + next_node(page_to_nid(page), dst)); + } + if (PageHighMem(page)) gfp_mask |= __GFP_HIGHMEM; From 83467efbdb7948146581a56cbd683a22a0684bbb Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Sep 2013 14:22:11 -0700 Subject: [PATCH 100/303] mm: migrate: check movability of hugepage in unmap_and_move_huge_page() Currently hugepage migration works well only for pmd-based hugepages (mainly due to lack of testing,) so we had better not enable migration of other levels of hugepages until we are ready for it. Some users of hugepage migration (mbind, move_pages, and migrate_pages) do page table walk and check pud/pmd_huge() there, so they are safe. But the other users (softoffline and memory hotremove) don't do this, so without this patch they can try to migrate unexpected types of hugepages. To prevent this, we introduce hugepage_migration_support() as an architecture dependent check of whether hugepage are implemented on a pmd basis or not. And on some architecture multiple sizes of hugepages are available, so hugepage_migration_support() also checks hugepage size. Signed-off-by: Naoya Horiguchi Cc: Andi Kleen Cc: Hillf Danton Cc: Wanpeng Li Cc: Mel Gorman Cc: Hugh Dickins Cc: KOSAKI Motohiro Cc: Michal Hocko Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/hugetlbpage.c | 5 +++++ arch/arm64/mm/hugetlbpage.c | 5 +++++ arch/ia64/mm/hugetlbpage.c | 5 +++++ arch/metag/mm/hugetlbpage.c | 5 +++++ arch/mips/mm/hugetlbpage.c | 5 +++++ arch/powerpc/mm/hugetlbpage.c | 10 ++++++++++ arch/s390/mm/hugetlbpage.c | 5 +++++ arch/sh/mm/hugetlbpage.c | 5 +++++ arch/sparc/mm/hugetlbpage.c | 5 +++++ arch/tile/mm/hugetlbpage.c | 5 +++++ arch/x86/mm/hugetlbpage.c | 8 ++++++++ include/linux/hugetlb.h | 12 ++++++++++++ mm/migrate.c | 10 ++++++++++ 13 files changed, 85 insertions(+) diff --git a/arch/arm/mm/hugetlbpage.c b/arch/arm/mm/hugetlbpage.c index 66781bf34077..54ee6163c181 100644 --- a/arch/arm/mm/hugetlbpage.c +++ b/arch/arm/mm/hugetlbpage.c @@ -56,3 +56,8 @@ int pmd_huge(pmd_t pmd) { return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT); } + +int pmd_huge_support(void) +{ + return 1; +} diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 2fc8258bab2d..5e9aec358306 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -54,6 +54,11 @@ int pud_huge(pud_t pud) return !(pud_val(pud) & PUD_TABLE_BIT); } +int pmd_huge_support(void) +{ + return 1; +} + static __init int setup_hugepagesz(char *opt) { unsigned long ps = memparse(opt, &opt); diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index 76069c18ee42..68232db98baa 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -114,6 +114,11 @@ int pud_huge(pud_t pud) return 0; } +int pmd_huge_support(void) +{ + return 0; +} + struct page * follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) { diff --git a/arch/metag/mm/hugetlbpage.c b/arch/metag/mm/hugetlbpage.c index 3c52fa6d0f8e..042431509b56 100644 --- a/arch/metag/mm/hugetlbpage.c +++ b/arch/metag/mm/hugetlbpage.c @@ -110,6 +110,11 @@ int pud_huge(pud_t pud) return 0; } +int pmd_huge_support(void) +{ + return 1; +} + struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) { diff --git 
a/arch/mips/mm/hugetlbpage.c b/arch/mips/mm/hugetlbpage.c index a7fee0dfb7a9..01fda4419ed0 100644 --- a/arch/mips/mm/hugetlbpage.c +++ b/arch/mips/mm/hugetlbpage.c @@ -85,6 +85,11 @@ int pud_huge(pud_t pud) return (pud_val(pud) & _PAGE_HUGE) != 0; } +int pmd_huge_support(void) +{ + return 1; +} + struct page * follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 834ca8eb38f2..d67db4bd672d 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -86,6 +86,11 @@ int pgd_huge(pgd_t pgd) */ return ((pgd_val(pgd) & 0x3) != 0x0); } + +int pmd_huge_support(void) +{ + return 1; +} #else int pmd_huge(pmd_t pmd) { @@ -101,6 +106,11 @@ int pgd_huge(pgd_t pgd) { return 0; } + +int pmd_huge_support(void) +{ + return 0; +} #endif pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index 248445f92604..d261c62e40a6 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -223,6 +223,11 @@ int pud_huge(pud_t pud) return 0; } +int pmd_huge_support(void) +{ + return 1; +} + struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmdp, int write) { diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index d7762349ea48..0d676a41081e 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -83,6 +83,11 @@ int pud_huge(pud_t pud) return 0; } +int pmd_huge_support(void) +{ + return 0; +} + struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) { diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index d2b59441ebdd..96399646570a 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -234,6 +234,11 @@ int pud_huge(pud_t pud) return 0; } +int pmd_huge_support(void) +{ + return 0; +} + struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) { diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c index e514899e1100..0cb3bbaa580c 100644 --- a/arch/tile/mm/hugetlbpage.c +++ b/arch/tile/mm/hugetlbpage.c @@ -166,6 +166,11 @@ int pud_huge(pud_t pud) return !!(pud_val(pud) & _PAGE_HUGE_PAGE); } +int pmd_huge_support(void) +{ + return 1; +} + struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) { diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 7e73e8c69096..9d980d88b747 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -59,6 +59,10 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, return NULL; } +int pmd_huge_support(void) +{ + return 0; +} #else struct page * @@ -77,6 +81,10 @@ int pud_huge(pud_t pud) return !!(pud_val(pud) & _PAGE_PSE); } +int pmd_huge_support(void) +{ + return 1; +} #endif /* x86_64 also uses this file */ diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 2e02c4ed1035..0393270466c3 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -381,6 +381,16 @@ static inline pgoff_t basepage_index(struct page *page) extern void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn); +int pmd_huge_support(void); +/* + * Currently hugepage migration is enabled only for pmd-based hugepage. + * This function will be updated when hugepage migration is more widely + * supported. 
+ */ +static inline int hugepage_migration_support(struct hstate *h) +{ + return pmd_huge_support() && (huge_page_shift(h) == PMD_SHIFT); +} #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; @@ -409,6 +419,8 @@ static inline pgoff_t basepage_index(struct page *page) return page->index; } #define dissolve_free_huge_pages(s, e) do {} while (0) +#define pmd_huge_support() 0 +#define hugepage_migration_support(h) 0 #endif /* CONFIG_HUGETLB_PAGE */ #endif /* _LINUX_HUGETLB_H */ diff --git a/mm/migrate.c b/mm/migrate.c index d3137375fa80..61f14a1923fd 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -949,6 +949,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, struct page *new_hpage = get_new_page(hpage, private, &result); struct anon_vma *anon_vma = NULL; + /* + * Movability of hugepages depends on architectures and hugepage size. + * This check is necessary because some callers of hugepage migration + * like soft offline and memory hotremove don't walk through page + * tables or check whether the hugepage is pmd-based or not before + * kicking migration. + */ + if (!hugepage_migration_support(page_hstate(hpage))) + return -ENOSYS; + if (!new_hpage) return -ENOMEM; From 86cdb465cf3a9d81058b517af05074157fa9dcdd Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Sep 2013 14:22:13 -0700 Subject: [PATCH 101/303] mm: prepare to remove /proc/sys/vm/hugepages_treat_as_movable Now hugepage migration is enabled, although restricted on pmd-based hugepages for now (due to lack of testing.) So we should allocate migratable hugepages from ZONE_MOVABLE if possible. This patch makes GFP flags in hugepage allocation dependent on migration support, not only the value of hugepages_treat_as_movable. It provides no change on the behavior for architectures which do not support hugepage migration, Signed-off-by: Naoya Horiguchi Acked-by: Andi Kleen Reviewed-by: Wanpeng Li Cc: Hillf Danton Cc: Mel Gorman Cc: Hugh Dickins Cc: KOSAKI Motohiro Cc: Michal Hocko Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 28 ++++++++++++++++++---------- kernel/sysctl.c | 2 +- mm/hugetlb.c | 32 ++++++++++++++------------------ 3 files changed, 33 insertions(+), 29 deletions(-) diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 36ecc26c7433..79a797eb3e87 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -200,17 +200,25 @@ fragmentation index is <= extfrag_threshold. The default value is 500. hugepages_treat_as_movable -This parameter is only useful when kernelcore= is specified at boot time to -create ZONE_MOVABLE for pages that may be reclaimed or migrated. Huge pages -are not movable so are not normally allocated from ZONE_MOVABLE. A non-zero -value written to hugepages_treat_as_movable allows huge pages to be allocated -from ZONE_MOVABLE. +This parameter controls whether we can allocate hugepages from ZONE_MOVABLE +or not. If set to non-zero, hugepages can be allocated from ZONE_MOVABLE. +ZONE_MOVABLE is created when kernel boot parameter kernelcore= is specified, +so this parameter has no effect if used without kernelcore=. -Once enabled, the ZONE_MOVABLE is treated as an area of memory the huge -pages pool can easily grow or shrink within. Assuming that applications are -not running that mlock() a lot of memory, it is likely the huge pages pool -can grow to the size of ZONE_MOVABLE by repeatedly entering the desired value -into nr_hugepages and triggering page reclaim. 
+Hugepage migration is now available in some situations which depend on the +architecture and/or the hugepage size. If a hugepage supports migration, +allocation from ZONE_MOVABLE is always enabled for the hugepage regardless +of the value of this parameter. +IOW, this parameter affects only non-migratable hugepages. + +Assuming that hugepages are not migratable in your system, one usecase of +this parameter is that users can make hugepage pool more extensible by +enabling the allocation from ZONE_MOVABLE. This is because on ZONE_MOVABLE +page reclaim/migration/compaction work more and you can get contiguous +memory more likely. Note that using ZONE_MOVABLE for non-migratable +hugepages can do harm to other features like memory hotremove (because +memory hotremove expects that memory blocks on ZONE_MOVABLE are always +removable,) so it's a trade-off responsible for the users. ============================================================== diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 07f6fc468e17..dc69093a8ec4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1225,7 +1225,7 @@ static struct ctl_table vm_table[] = { .data = &hugepages_treat_as_movable, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = hugetlb_treat_movable_handler, + .proc_handler = proc_dointvec, }, { .procname = "nr_overcommit_hugepages", diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fb4293b93fd0..b49579c7f2a5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -34,7 +34,6 @@ #include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; -static gfp_t htlb_alloc_mask = GFP_HIGHUSER; unsigned long hugepages_treat_as_movable; int hugetlb_max_hstate __read_mostly; @@ -539,6 +538,15 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid) return page; } +/* Movability of hugepages depends on migration support. 
*/ +static inline gfp_t htlb_alloc_mask(struct hstate *h) +{ + if (hugepages_treat_as_movable || hugepage_migration_support(h)) + return GFP_HIGHUSER_MOVABLE; + else + return GFP_HIGHUSER; +} + static struct page *dequeue_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, int avoid_reserve, @@ -568,11 +576,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, retry_cpuset: cpuset_mems_cookie = get_mems_allowed(); zonelist = huge_zonelist(vma, address, - htlb_alloc_mask, &mpol, &nodemask); + htlb_alloc_mask(h), &mpol, &nodemask); for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { - if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) { + if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask(h))) { page = dequeue_huge_page_node(h, zone_to_nid(zone)); if (page) { if (avoid_reserve) @@ -738,7 +746,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) return NULL; page = alloc_pages_exact_node(nid, - htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| + htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); if (page) { @@ -965,12 +973,12 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) spin_unlock(&hugetlb_lock); if (nid == NUMA_NO_NODE) - page = alloc_pages(htlb_alloc_mask|__GFP_COMP| + page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); else page = alloc_pages_exact_node(nid, - htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| + htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); if (page && arch_prepare_hugepage(page)) { @@ -2117,18 +2125,6 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, } #endif /* CONFIG_NUMA */ -int hugetlb_treat_movable_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *length, loff_t *ppos) -{ - proc_dointvec(table, write, buffer, length, ppos); - if (hugepages_treat_as_movable) - htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; - else - htlb_alloc_mask = GFP_HIGHUSER; - return 0; -} - int hugetlb_overcommit_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) From 98094945785464c657d598291d714d11694c8cd9 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Sep 2013 14:22:14 -0700 Subject: [PATCH 102/303] mm/mempolicy: rename check_*range to queue_pages_*range The function check_range() (and its family) is not well-named, because it does not only check something; it also moves pages from list to list to do page migration for them. So queue_pages_*range is a more suitable name. Signed-off-by: Naoya Horiguchi Cc: Andi Kleen Cc: Wanpeng Li Cc: Hillf Danton Cc: Mel Gorman Cc: Hugh Dickins Cc: KOSAKI Motohiro Cc: Michal Hocko Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c7c359213ae1..9d778637b088 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -476,8 +476,11 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { static void migrate_page_add(struct page *page, struct list_head *pagelist, unsigned long flags); -/* Scan through pages checking if pages follow certain conditions.
*/ -static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, +/* + * Scan through pages checking if pages follow certain conditions, + * and move them to the pagelist if they do. + */ +static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, void *private) @@ -515,8 +518,8 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, return addr != end; } -static void check_hugetlb_pmd_range(struct vm_area_struct *vma, pmd_t *pmd, - const nodemask_t *nodes, unsigned long flags, +static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma, + pmd_t *pmd, const nodemask_t *nodes, unsigned long flags, void *private) { #ifdef CONFIG_HUGETLB_PAGE @@ -539,7 +542,7 @@ unlock: #endif } -static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, +static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, void *private) @@ -553,21 +556,21 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, if (!pmd_present(*pmd)) continue; if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) { - check_hugetlb_pmd_range(vma, pmd, nodes, + queue_pages_hugetlb_pmd_range(vma, pmd, nodes, flags, private); continue; } split_huge_page_pmd(vma, addr, pmd); if (pmd_none_or_trans_huge_or_clear_bad(pmd)) continue; - if (check_pte_range(vma, pmd, addr, next, nodes, + if (queue_pages_pte_range(vma, pmd, addr, next, nodes, flags, private)) return -EIO; } while (pmd++, addr = next, addr != end); return 0; } -static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, +static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, void *private) @@ -582,14 +585,14 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, continue; if (pud_none_or_clear_bad(pud)) continue; - if (check_pmd_range(vma, pud, addr, next, nodes, + if (queue_pages_pmd_range(vma, pud, addr, next, nodes, flags, private)) return -EIO; } while (pud++, addr = next, addr != end); return 0; } -static inline int check_pgd_range(struct vm_area_struct *vma, +static inline int queue_pages_pgd_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, void *private) @@ -602,7 +605,7 @@ static inline int check_pgd_range(struct vm_area_struct *vma, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - if (check_pud_range(vma, pgd, addr, next, nodes, + if (queue_pages_pud_range(vma, pgd, addr, next, nodes, flags, private)) return -EIO; } while (pgd++, addr = next, addr != end); @@ -640,12 +643,14 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma, #endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ /* - * Check if all pages in a range are on a set of nodes. - * If pagelist != NULL then isolate pages from the LRU and - * put them on the pagelist. + * Walk through page tables and collect pages to be migrated. + * + * If pages found in a given range are on a set of nodes (determined by + * @nodes and @flags,) it's isolated and queued to the pagelist which is + * passed via @private.) 
*/ static struct vm_area_struct * -check_range(struct mm_struct *mm, unsigned long start, unsigned long end, +queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, const nodemask_t *nodes, unsigned long flags, void *private) { int err; @@ -680,7 +685,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && vma_migratable(vma))) { - err = check_pgd_range(vma, start, endvma, nodes, + err = queue_pages_pgd_range(vma, start, endvma, nodes, flags, private); if (err) { first = ERR_PTR(err); @@ -1050,7 +1055,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. */ VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); - check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, + queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) { @@ -1288,7 +1293,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (err) goto mpol_out; - vma = check_range(mm, start, end, nmask, + vma = queue_pages_range(mm, start, end, nmask, flags | MPOL_MF_INVERT, &pagelist); err = PTR_ERR(vma); /* maybe ... */ From 0bf598d863e3c741d47e3178d645f04c9d6c186c Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Sep 2013 14:22:16 -0700 Subject: [PATCH 103/303] mbind: add BUG_ON(!vma) in new_vma_page() new_vma_page() is called only by page migration called from do_mbind(), where pages to be migrated are queued into a pagelist by queue_pages_range(). queue_pages_range() confirms that a queued page belongs to some vma, so the !vma case is not supposed to happen. This patch adds a BUG_ON() to catch this unexpected case. Signed-off-by: Naoya Horiguchi Reported-by: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9d778637b088..04729647f359 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1196,12 +1196,14 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int * break; vma = vma->vm_next; } + /* + * queue_pages_range() confirms that @page belongs to some vma, + * so vma shouldn't be NULL. + */ + BUG_ON(!vma); if (PageHuge(page)) return alloc_huge_page_noerr(vma, address, 1); - /* - * if !vma, alloc_page_vma() will use task or system default policy - */ return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); } #else From e76b63f80d938a1319eb5fb0ae7ea69bddfbae38 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 11 Sep 2013 14:22:17 -0700 Subject: [PATCH 104/303] memblock, numa: binary search node id The current early_pfn_to_nid() on arches that support memblock goes over memblock.memory one entry at a time, so it takes too many tries near the end of the array. We can use the existing memblock_search() to find the node id for a given pfn instead; that can save some time on bigger systems whose memblock.memory arrays have many entries. Here are the timing differences for several machines. In each case, less time was spent in __early_pfn_to_nid() with the patch.

                          3.11-rc5    with patch    difference (%)
                          --------    ----------    --------------
  UV1: 256 nodes  9TB:      411.66        402.47    -9.19 (2.23%)
  UV2: 255 nodes 16TB:     1141.02       1138.12    -2.90 (0.25%)
  UV2:  64 nodes  2TB:      128.15        126.53    -1.62 (1.26%)
  UV2:  32 nodes  2TB:      121.87        121.07    -0.80 (0.66%)

Time in seconds.
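A minimal sketch of the idea (simplified types, not the kernel's actual structures): because memblock.memory is kept sorted and non-overlapping, the region covering an address can be found by bisection rather than a linear walk.

	struct region { unsigned long base, size; int nid; };

	static int search_regions(const struct region *r, int cnt,
				  unsigned long addr)
	{
		int lo = 0, hi = cnt;

		while (lo < hi) {
			int mid = (lo + hi) / 2;

			if (addr < r[mid].base)
				hi = mid;		/* go left */
			else if (addr >= r[mid].base + r[mid].size)
				lo = mid + 1;		/* go right */
			else
				return mid;		/* addr inside r[mid] */
		}
		return -1;				/* memory hole */
	}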
Signed-off-by: Yinghai Lu Cc: Tejun Heo Acked-by: Russ Anderson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 2 ++ mm/memblock.c | 18 ++++++++++++++++++ mm/page_alloc.c | 19 +++++++++---------- 3 files changed, 29 insertions(+), 10 deletions(-) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index f388203db7e8..31e95acddb4d 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -60,6 +60,8 @@ int memblock_reserve(phys_addr_t base, phys_addr_t size); void memblock_trim_memory(phys_addr_t align); #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, + unsigned long *end_pfn); void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, unsigned long *out_end_pfn, int *out_nid); diff --git a/mm/memblock.c b/mm/memblock.c index a847bfe6f3ba..0ac412a0a7ee 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -914,6 +914,24 @@ int __init_memblock memblock_is_memory(phys_addr_t addr) return memblock_search(&memblock.memory, addr) != -1; } +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +int __init_memblock memblock_search_pfn_nid(unsigned long pfn, + unsigned long *start_pfn, unsigned long *end_pfn) +{ + struct memblock_type *type = &memblock.memory; + int mid = memblock_search(type, (phys_addr_t)pfn << PAGE_SHIFT); + + if (mid == -1) + return -1; + + *start_pfn = type->regions[mid].base >> PAGE_SHIFT; + *end_pfn = (type->regions[mid].base + type->regions[mid].size) + >> PAGE_SHIFT; + + return type->regions[mid].nid; +} +#endif + /** * memblock_is_region_memory - check if a region is a subset of memory * @base: base of region to check diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f7cc08dad26a..22653e34a047 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4306,7 +4306,7 @@ int __meminit init_currently_empty_zone(struct zone *zone, int __meminit __early_pfn_to_nid(unsigned long pfn) { unsigned long start_pfn, end_pfn; - int i, nid; + int nid; /* * NOTE: The following SMP-unsafe globals are only used early in boot * when the kernel is running single-threaded. @@ -4317,15 +4317,14 @@ int __meminit __early_pfn_to_nid(unsigned long pfn) if (last_start_pfn <= pfn && pfn < last_end_pfn) return last_nid; - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) - if (start_pfn <= pfn && pfn < end_pfn) { - last_start_pfn = start_pfn; - last_end_pfn = end_pfn; - last_nid = nid; - return nid; - } - /* This is a memory hole */ - return -1; + nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); + if (nid != -1) { + last_start_pfn = start_pfn; + last_end_pfn = end_pfn; + last_nid = nid; + } + + return nid; } #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ From 90c7a79cc45becc6cdb8c026d55ace19e299a02d Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Wed, 11 Sep 2013 14:22:18 -0700 Subject: [PATCH 105/303] kmemcg: don't allocate extra memory for root memcg_cache_params The memcg_cache_params structure contains the common part and the union, which represents two different types of data: one for root caches and another for child caches. The size of the child data is fixed. The size of the memcg_caches array is calculated at runtime. Currently the size of memcg_cache_params for root caches is calculated incorrectly, because it includes the size of parameters for child caches.
ssize_t size = memcg_caches_array_size(num_groups); size *= sizeof(void *); size += sizeof(struct memcg_cache_params); v2: Fix a typo in calculations Signed-off-by: Andrey Vagin Cc: Glauber Costa Cc: Johannes Weiner Cc: Michal Hocko Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3b83957b6439..5ca1dcf77ce9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3121,7 +3121,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) ssize_t size = memcg_caches_array_size(num_groups); size *= sizeof(void *); - size += sizeof(struct memcg_cache_params); + size += offsetof(struct memcg_cache_params, memcg_caches); s->memcg_params = kzalloc(size, GFP_KERNEL); if (!s->memcg_params) { @@ -3164,13 +3164,16 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, struct kmem_cache *root_cache) { - size_t size = sizeof(struct memcg_cache_params); + size_t size; if (!memcg_kmem_enabled()) return 0; - if (!memcg) + if (!memcg) { + size = offsetof(struct memcg_cache_params, memcg_caches); size += memcg_limited_groups_array_size * sizeof(void *); + } else + size = sizeof(struct memcg_cache_params); s->memcg_params = kzalloc(size, GFP_KERNEL); if (!s->memcg_params) From 3a7200af3d9227767869f451ed747aff07d8df48 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 11 Sep 2013 14:22:19 -0700 Subject: [PATCH 106/303] mm: compaction: do not compact pgdat for order-0 If kswapd was reclaiming for a high order and resets it to 0 due to fragmentation, it will still call compact_pgdat. For the most part this will fail a compaction_suitable() test and not compact, but it is unnecessarily sloppy. It could be fixed in the caller, but fix it in the API instead. [dhillf@gmail.com: pointed out that it was a potential problem] Signed-off-by: Mel Gorman Cc: Hillf Danton Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/compaction.c b/mm/compaction.c index 05ccb4cc0bdb..c43789388cd8 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1131,6 +1131,9 @@ void compact_pgdat(pg_data_t *pgdat, int order) .sync = false, }; + if (!order) + return; + __compact_pgdat(pgdat, &cc); } From 7cb2ef56e6a8b7b368b2e883a0a47d02fed66911 Mon Sep 17 00:00:00 2001 From: Khalid Aziz Date: Wed, 11 Sep 2013 14:22:20 -0700 Subject: [PATCH 107/303] mm: fix aio performance regression for database caused by THP I am working with a tool that simulates an Oracle database I/O workload. This tool (orion to be specific - ) allocates hugetlbfs pages using shmget() with the SHM_HUGETLB flag. It then does aio into these pages from flash disks using various common block sizes used by databases. I am looking at performance with two of the most common block sizes - 1M and 64K. aio performance with these two block sizes plunged after Transparent HugePages was introduced in the kernel. Here are the performance numbers:

             pre-THP      2.6.39       3.11-rc5
  1M read    8384 MB/s    5629 MB/s    6501 MB/s
  64K read   7867 MB/s    4576 MB/s    4251 MB/s

I have narrowed the performance impact down to the overheads introduced by THP in the __get_page_tail() and put_compound_page() routines. perf top shows >40% of cycles being spent in these two routines.
Every time direct I/O to hugetlbfs pages starts, the kernel calls get_page() to grab a reference to the pages and calls put_page() when I/O completes to drop the reference. THP introduced a significant amount of locking overhead to get_page() and put_page() when dealing with compound pages, because hugepages can be split underneath get_page() and put_page(). It added this overhead irrespective of whether it is dealing with hugetlbfs pages or transparent hugepages. This resulted in a 20%-45% drop in aio performance when using hugetlbfs pages. Since hugetlbfs pages cannot be split, there is no reason to go through all the locking overhead for these pages from what I can see. I added code to __get_page_tail() and put_compound_page() to bypass all the locking code when working with hugetlbfs pages. This improved performance significantly. Performance numbers with this patch:

             pre-THP      3.11-rc5     3.11-rc5 + Patch
  1M read    8384 MB/s    6501 MB/s    8371 MB/s
  64K read   7867 MB/s    4251 MB/s    6510 MB/s

Performance with 64K read is still lower than what it was before THP, but still a 53% improvement. It does mean there is more work to be done, but I will take a 53% improvement for now. Please take a look at the following patch and let me know if it looks reasonable. [akpm@linux-foundation.org: tweak comments] Signed-off-by: Khalid Aziz Cc: Pravin B Shelar Cc: Christoph Lameter Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Mel Gorman Cc: Rik van Riel Cc: Minchan Kim Cc: Andi Kleen Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 77 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 52 insertions(+), 25 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index 62b78a6e224f..c899502d3e36 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "internal.h" @@ -81,6 +82,19 @@ static void __put_compound_page(struct page *page) static void put_compound_page(struct page *page) { + /* + * hugetlbfs pages cannot be split from under us. If this is a + * hugetlbfs page, check refcount on head page and release the page if + * the refcount becomes zero. + */ + if (PageHuge(page)) { + page = compound_head(page); + if (put_page_testzero(page)) + __put_compound_page(page); + + return; + } + if (unlikely(PageTail(page))) { /* __split_huge_page_refcount can run under us */ struct page *page_head = compound_trans_head(page); @@ -184,38 +198,51 @@ bool __get_page_tail(struct page *page) * proper PT lock that already serializes against * split_huge_page(). */ - unsigned long flags; bool got = false; - struct page *page_head = compound_trans_head(page); + struct page *page_head; - if (likely(page != page_head && get_page_unless_zero(page_head))) { + /* + * If this is a hugetlbfs page it cannot be split under us. Simply + * increment refcount for the head page. + */ + if (PageHuge(page)) { + page_head = compound_head(page); + atomic_inc(&page_head->_count); + got = true; + } else { + unsigned long flags; - /* Ref to put_compound_page() comment. */ - if (PageSlab(page_head)) { + page_head = compound_trans_head(page); + if (likely(page != page_head && + get_page_unless_zero(page_head))) { + + /* Ref to put_compound_page() comment. */ + if (PageSlab(page_head)) { + if (likely(PageTail(page))) { + __get_page_tail_foll(page, false); + return true; + } else { + put_page(page_head); + return false; + } + } + + /* + * page_head wasn't a dangling pointer but it + * may not be a head page anymore by the time + * we obtain the lock.
That is ok as long as it + * can't be freed from under us. + */ + flags = compound_lock_irqsave(page_head); + /* here __split_huge_page_refcount won't run anymore */ if (likely(PageTail(page))) { __get_page_tail_foll(page, false); - return true; - } else { - put_page(page_head); - return false; + got = true; } + compound_unlock_irqrestore(page_head, flags); + if (unlikely(!got)) + put_page(page_head); } - - /* - * page_head wasn't a dangling pointer but it - * may not be a head page anymore by the time - * we obtain the lock. That is ok as long as it - * can't be freed from under us. - */ - flags = compound_lock_irqsave(page_head); - /* here __split_huge_page_refcount won't run anymore */ - if (likely(PageTail(page))) { - __get_page_tail_foll(page, false); - got = true; - } - compound_unlock_irqrestore(page_head, flags); - if (unlikely(!got)) - put_page(page_head); } return got; } From 47df3ddedd22c3f8e68aff831edb7921937674a2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 11 Sep 2013 14:22:22 -0700 Subject: [PATCH 108/303] writeback: fix occasional slow sync(1) In case when system contains no dirty pages, wakeup_flusher_threads() will submit WB_SYNC_NONE writeback for 0 pages so wb_writeback() exits immediately without doing anything, even though there are dirty inodes in the system. Thus sync(1) will write all the dirty inodes from a WB_SYNC_ALL writeback pass which is slow. Fix the problem by using get_nr_dirty_pages() in wakeup_flusher_threads() instead of calculating number of dirty pages manually. That function also takes number of dirty inodes into account. Signed-off-by: Jan Kara Reported-by: Paul Taysom Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 68851ff2fd41..87d778118027 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1049,10 +1049,8 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) { struct backing_dev_info *bdi; - if (!nr_pages) { - nr_pages = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS); - } + if (!nr_pages) + nr_pages = get_nr_dirty_pages(); rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { From 3b11f0aaae830f0f569cb8fb7fd26f4133ebdabd Mon Sep 17 00:00:00 2001 From: SeungHun Lee Date: Wed, 11 Sep 2013 14:22:23 -0700 Subject: [PATCH 109/303] mm: page_alloc: fix comment get_page_from_freelist cpuset_zone_allowed is changed to cpuset_zone_allowed_softwall and the comment is moved to __cpuset_node_allowed_softwall. So fix this comment. Signed-off-by: SeungHun Lee Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 22653e34a047..7b1b706a1ffa 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1896,7 +1896,7 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, zonelist_scan: /* * Scan zonelist, looking for a zone with enough free. - * See also cpuset_zone_allowed() comment in kernel/cpuset.c. + * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. 
*/ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) { From d9104d1ca9662498339c0de975b4666c30485f4e Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 11 Sep 2013 14:22:24 -0700 Subject: [PATCH 110/303] mm: track vma changes with VM_SOFTDIRTY bit Pavel reported that if a vma area gets unmapped and then mapped (or expanded) in-place, the soft dirty tracker won't be able to recognize this situation, since it works on the pte level and ptes get zapped on unmap, losing the soft dirty bit of course. So to resolve this situation we need to track actions on the vma level, which is where the VM_SOFTDIRTY flag comes in. When a new vma area is created (or an old one is expanded) we set this bit, and keep it there until the application asks for the soft dirty bit to be cleared. Thus when a user space application tracks memory changes it can now detect whether a vma area has been renewed. Reported-by: Pavel Emelyanov Signed-off-by: Cyrill Gorcunov Cc: Andy Lutomirski Cc: Matt Mackall Cc: Xiao Guangrong Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Cc: Stephen Rothwell Cc: Peter Zijlstra Cc: "Aneesh Kumar K.V" Cc: Rob Landley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/soft-dirty.txt | 7 +++++ fs/exec.c | 2 +- fs/proc/task_mmu.c | 46 ++++++++++++++++++++++++++------- include/linux/mm.h | 6 +++++ mm/mmap.c | 12 ++++++++- 5 files changed, 61 insertions(+), 12 deletions(-) diff --git a/Documentation/vm/soft-dirty.txt b/Documentation/vm/soft-dirty.txt index 9a12a5956bc0..55684d11a1e8 100644 --- a/Documentation/vm/soft-dirty.txt +++ b/Documentation/vm/soft-dirty.txt @@ -28,6 +28,13 @@ This is so, since the pages are still mapped to physical memory, and thus all the kernel does is finds this fact out and puts both writable and soft-dirty bits on the PTE. + While in most cases tracking memory changes by #PF-s is more than enough +there is still a scenario when we can lose soft dirty bits -- a task +unmaps a previously mapped memory region and then maps a new one at exactly +the same place. When unmap is called, the kernel internally clears PTE values +including soft dirty bits. To notify user space application about such +memory region renewal the kernel always marks new memory regions (and +expanded regions) as soft dirty. This feature is actively used by the checkpoint-restore project.
You can find more details about it on http://criu.org diff --git a/fs/exec.c b/fs/exec.c index fd774c7cb483..2d1e52a58fe9 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -266,7 +266,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; - vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; + vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); INIT_LIST_HEAD(&vma->anon_vma_chain); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 107d026f5d6e..09228639b83d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -740,6 +740,9 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, ptent = pte_file_clear_soft_dirty(ptent); } + if (vma->vm_flags & VM_SOFTDIRTY) + vma->vm_flags &= ~VM_SOFTDIRTY; + set_pte_at(vma->vm_mm, addr, pte, ptent); #endif } @@ -949,13 +952,15 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, if (is_migration_entry(entry)) page = migration_entry_to_page(entry); } else { - *pme = make_pme(PM_NOT_PRESENT(pm->v2)); + if (vma->vm_flags & VM_SOFTDIRTY) + flags2 |= __PM_SOFT_DIRTY; + *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); return; } if (page && !PageAnon(page)) flags |= PM_FILE; - if (pte_soft_dirty(pte)) + if ((vma->vm_flags & VM_SOFTDIRTY) || pte_soft_dirty(pte)) flags2 |= __PM_SOFT_DIRTY; *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); @@ -974,7 +979,7 @@ static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *p *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT); else - *pme = make_pme(PM_NOT_PRESENT(pm->v2)); + *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2)); } #else static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, @@ -997,7 +1002,11 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (vma && pmd_trans_huge_lock(pmd, vma) == 1) { int pmd_flags2; - pmd_flags2 = (pmd_soft_dirty(*pmd) ? 
__PM_SOFT_DIRTY : 0); + if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd)) + pmd_flags2 = __PM_SOFT_DIRTY; + else + pmd_flags2 = 0; + for (; addr != end; addr += PAGE_SIZE) { unsigned long offset; @@ -1015,12 +1024,17 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (pmd_trans_unstable(pmd)) return 0; for (; addr != end; addr += PAGE_SIZE) { + int flags2; /* check to see if we've left 'vma' behind * and need a new, higher one */ if (vma && (addr >= vma->vm_end)) { vma = find_vma(walk->mm, addr); - pme = make_pme(PM_NOT_PRESENT(pm->v2)); + if (vma && (vma->vm_flags & VM_SOFTDIRTY)) + flags2 = __PM_SOFT_DIRTY; + else + flags2 = 0; + pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); } /* check that 'vma' actually covers this address, @@ -1044,13 +1058,15 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, #ifdef CONFIG_HUGETLB_PAGE static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, - pte_t pte, int offset) + pte_t pte, int offset, int flags2) { if (pte_present(pte)) - *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) - | PM_STATUS2(pm->v2, 0) | PM_PRESENT); + *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) | + PM_STATUS2(pm->v2, flags2) | + PM_PRESENT); else - *pme = make_pme(PM_NOT_PRESENT(pm->v2)); + *pme = make_pme(PM_NOT_PRESENT(pm->v2) | + PM_STATUS2(pm->v2, flags2)); } /* This function walks within one hugetlb entry in the single call */ @@ -1059,12 +1075,22 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, struct mm_walk *walk) { struct pagemapread *pm = walk->private; + struct vm_area_struct *vma; int err = 0; + int flags2; pagemap_entry_t pme; + vma = find_vma(walk->mm, addr); + WARN_ON_ONCE(!vma); + + if (vma && (vma->vm_flags & VM_SOFTDIRTY)) + flags2 = __PM_SOFT_DIRTY; + else + flags2 = 0; + for (; addr != end; addr += PAGE_SIZE) { int offset = (addr & ~hmask) >> PAGE_SHIFT; - huge_pte_to_pagemap_entry(&pme, pm, *pte, offset); + huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2); err = add_to_pagemap(addr, &pme, pm); if (err) return err; diff --git a/include/linux/mm.h b/include/linux/mm.h index d2d59b4149d0..dce24569f8fc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -115,6 +115,12 @@ extern unsigned int kobjsize(const void *objp); #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ +#ifdef CONFIG_MEM_SOFT_DIRTY +# define VM_SOFTDIRTY 0x08000000 /* Not soft dirty clean area */ +#else +# define VM_SOFTDIRTY 0 +#endif + #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ #define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */ #define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ diff --git a/mm/mmap.c b/mm/mmap.c index 13926a5a6901..51958d192a48 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1609,6 +1609,15 @@ out: if (file) uprobe_mmap(vma); + /* + * New (or expanded) vma always get soft dirty status. + * Otherwise user-space soft-dirty page tracker won't + * be able to distinguish situation when vma area unmapped, + * then new mapped in-place (which must be aimed as + * a completely new data area). 
+ */ + vma->vm_flags |= VM_SOFTDIRTY; + return addr; unmap_and_free_vma: @@ -2652,6 +2661,7 @@ out: mm->total_vm += len >> PAGE_SHIFT; if (flags & VM_LOCKED) mm->locked_vm += (len >> PAGE_SHIFT); + vma->vm_flags |= VM_SOFTDIRTY; return addr; } @@ -2916,7 +2926,7 @@ int install_special_mapping(struct mm_struct *mm, vma->vm_start = addr; vma->vm_end = addr + len; - vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND; + vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); vma->vm_ops = &special_mapping_vmops; From 0ec3b74c7f5599c8a4d2b33d430a5470af26ebf6 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 11 Sep 2013 14:22:26 -0700 Subject: [PATCH 111/303] mm: putback_lru_page: remove unnecessary call to page_lru_base_type() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The goal of this patch series is to improve performance of munlock() of large mlocked memory areas on systems without THP. This is motivated by reports of very long crash-recovery times for processes with such areas, where munlock() can take several seconds. See http://lwn.net/Articles/548108/ The work was driven by a simple benchmark (to be included in mmtests) that mmap()s e.g. 56GB with MAP_LOCKED | MAP_POPULATE and measures the time of munlock(). Profiling was performed by attaching operf --pid to the process, sending a signal to trigger the munlock() part, and then notifying back the monitoring wrapper to stop operf, so that only munlock() appears in the profile. The profiles have shown that CPU time is spent mostly on atomic operations and repeated locking of single pages. This series aims to reduce both, starting from simpler to more complex changes. Patch 1 performs a simple cleanup in putback_lru_page() so that the page lru base type is not determined when it is not actually needed. Patch 2 removes an unnecessary call to lru_add_drain(), which drains the per-cpu pagevec after each munlocked page is put there. Patch 3 changes munlock_vma_range() to use an on-stack pagevec for isolating multiple non-THP pages under a single lru_lock instead of locking and processing each page separately. Patch 4 changes the NR_MLOCK accounting to be called only once per pvec introduced by the previous patch. Patch 5 uses the introduced pagevec to also batch the work of putback_lru_page() when possible, bypassing the per-cpu pvec and its associated overhead. Patch 6 removes a redundant get_page/put_page pair, which saves costly atomic operations. Patch 7 avoids calling follow_page_mask() on each individual page, and obtains multiple page references under a single page table lock where possible. Measurements were made using 3.11-rc3 as a baseline. The first set of measurements shows the possibly ideal conditions where batching should help the most. All memory is allocated from a single NUMA node and THP is disabled.
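A minimal sketch of such a timed munlock() measurement might look like this (an illustration only, not the actual mmtests benchmark; the size is reduced from the 56GB used for the reported numbers):

	#include <stdio.h>
	#include <sys/mman.h>
	#include <time.h>

	int main(void)
	{
		size_t size = 1UL << 30;	/* 1GB; the reported runs used 56GB */
		struct timespec t0, t1;
		void *mem;

		/* MAP_LOCKED | MAP_POPULATE mlocks and faults in the whole area */
		mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED | MAP_POPULATE,
			   -1, 0);
		if (mem == MAP_FAILED) {
			perror("mmap");
			return 1;
		}

		clock_gettime(CLOCK_MONOTONIC, &t0);
		munlock(mem, size);	/* the operation being timed */
		clock_gettime(CLOCK_MONOTONIC, &t1);

		printf("munlock: %.3f s\n", (t1.tv_sec - t0.tv_sec) +
		       (t1.tv_nsec - t0.tv_nsec) / 1e9);
		munmap(mem, size);
		return 0;
	}

In the tables that follow, column 0 is the unpatched 3.11-rc3 baseline and columns 1-7 correspond to patches 1-7 of the series.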
timedmunlock   3.11-rc3  3.11-rc3  3.11-rc3  3.11-rc3  3.11-rc3  3.11-rc3  3.11-rc3  3.11-rc3
                      0         1         2         3         4         5         6         7
Elapsed min      3.38 ( 0.00%)  3.39 ( -0.13%)  3.00 ( 11.33%)  2.70 ( 20.20%)  2.67 ( 21.11%)  2.37 ( 29.88%)  2.20 ( 34.91%)  1.91 ( 43.59%)
Elapsed mean     3.39 ( 0.00%)  3.40 ( -0.23%)  3.01 ( 11.33%)  2.70 ( 20.26%)  2.67 ( 21.21%)  2.38 ( 29.88%)  2.21 ( 34.93%)  1.92 ( 43.46%)
Elapsed stddev   0.01 ( 0.00%)  0.01 (-43.09%)  0.01 ( 15.42%)  0.01 ( 23.42%)  0.00 ( 89.78%)  0.01 ( -7.15%)  0.00 ( 76.69%)  0.02 (-91.77%)
Elapsed max      3.41 ( 0.00%)  3.43 ( -0.52%)  3.03 ( 11.29%)  2.72 ( 20.16%)  2.67 ( 21.63%)  2.40 ( 29.50%)  2.21 ( 35.21%)  1.96 ( 42.39%)
Elapsed range    0.03 ( 0.00%)  0.04 (-51.16%)  0.02 ( 6.27%)  0.02 ( 14.67%)  0.00 ( 88.90%)  0.03 (-19.18%)  0.01 ( 73.70%)  0.06 (-113.35%)

The second set of measurements simulates the worst possible conditions for batching by using numactl --interleave, so that there is in fact only one page per pagevec. Even in this case the series seems to improve performance thanks to reduced atomic operations and the removal of lru_add_drain().

timedmunlock   3.11-rc3  3.11-rc3  3.11-rc3  3.11-rc3  3.11-rc3  3.11-rc3  3.11-rc3  3.11-rc3
                      0         1         2         3         4         5         6         7
Elapsed min      4.00 ( 0.00%)  4.04 ( -0.93%)  3.87 ( 3.37%)  3.72 ( 6.94%)  3.81 ( 4.72%)  3.69 ( 7.82%)  3.64 ( 8.92%)  3.41 ( 14.81%)
Elapsed mean     4.17 ( 0.00%)  4.15 ( 0.51%)  4.03 ( 3.49%)  3.89 ( 6.84%)  3.86 ( 7.48%)  3.89 ( 6.69%)  3.70 ( 11.27%)  3.48 ( 16.59%)
Elapsed stddev   0.16 ( 0.00%)  0.08 ( 50.76%)  0.10 ( 41.58%)  0.16 ( 4.59%)  0.05 ( 72.38%)  0.19 (-12.91%)  0.05 ( 68.09%)  0.06 ( 66.03%)
Elapsed max      4.34 ( 0.00%)  4.32 ( 0.56%)  4.19 ( 3.62%)  4.12 ( 5.15%)  3.91 ( 9.88%)  4.12 ( 5.25%)  3.80 ( 12.58%)  3.56 ( 18.08%)
Elapsed range    0.34 ( 0.00%)  0.28 ( 17.91%)  0.32 ( 6.45%)  0.40 (-15.73%)  0.10 ( 70.06%)  0.43 (-24.84%)  0.15 ( 55.32%)  0.15 ( 56.16%)

For completeness, a third set of measurements shows the situation where THP is enabled and allocations are again done on a single NUMA node. Here munlock() is already very fast thanks to huge pages, and this series does not compromise that performance. It seems that the removal of the call to lru_add_drain() still helps a bit.

timedmunlock   3.11-rc3  3.11-rc3  3.11-rc3  3.11-rc3  3.11-rc3  3.11-rc3  3.11-rc3  3.11-rc3
                      0         1         2         3         4         5         6         7
Elapsed min      0.01 ( 0.00%)  0.01 ( -0.11%)  0.01 ( 6.59%)  0.01 ( 5.41%)  0.01 ( 5.45%)  0.01 ( 5.03%)  0.01 ( 6.08%)  0.01 ( 5.20%)
Elapsed mean     0.01 ( 0.00%)  0.01 ( -0.27%)  0.01 ( 6.39%)  0.01 ( 5.30%)  0.01 ( 5.32%)  0.01 ( 5.03%)  0.01 ( 5.97%)  0.01 ( 5.22%)
Elapsed stddev   0.00 ( 0.00%)  0.00 ( -9.59%)  0.00 ( 10.77%)  0.00 ( 3.24%)  0.00 ( 24.42%)  0.00 ( 31.86%)  0.00 ( -7.46%)  0.00 ( 6.11%)
Elapsed max      0.01 ( 0.00%)  0.01 ( -0.01%)  0.01 ( 6.83%)  0.01 ( 5.42%)  0.01 ( 5.79%)  0.01 ( 5.53%)  0.01 ( 6.08%)  0.01 ( 5.26%)
Elapsed range    0.00 ( 0.00%)  0.00 ( 7.30%)  0.00 ( 24.38%)  0.00 ( 6.10%)  0.00 ( 30.79%)  0.00 ( 42.52%)  0.00 ( 6.11%)  0.00 ( 10.07%)

This patch (of 7): In putback_lru_page(), since commit c53954a092 ("mm: remove lru parameter from __lru_cache_add and lru_cache_add_lru") it is no longer necessary to determine the lru list via page_lru_base_type(). This patch replaces it with a simple flag, is_unevictable, which says whether the page was put on the unevictable list. This is the only information that matters in subsequent tests.
Signed-off-by: Vlastimil Babka Reviewed-by: Jörn Engel Acked-by: Mel Gorman Cc: Michel Lespinasse Cc: Hugh Dickins Cc: Rik van Riel Cc: Johannes Weiner Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 758540d3ca83..44c072a7cba2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -545,7 +545,7 @@ int remove_mapping(struct address_space *mapping, struct page *page) */ void putback_lru_page(struct page *page) { - int lru; + bool is_unevictable; int was_unevictable = PageUnevictable(page); VM_BUG_ON(PageLRU(page)); @@ -560,14 +560,14 @@ redo: * unevictable page on [in]active list. * We know how to handle that. */ - lru = page_lru_base_type(page); + is_unevictable = false; lru_cache_add(page); } else { /* * Put unevictable pages directly on zone's unevictable * list. */ - lru = LRU_UNEVICTABLE; + is_unevictable = true; add_page_to_unevictable_list(page); /* * When racing with an mlock or AS_UNEVICTABLE clearing @@ -587,7 +587,7 @@ redo: * page is on unevictable list, it never be freed. To avoid that, * check after we added it to the list, again. */ - if (lru == LRU_UNEVICTABLE && page_evictable(page)) { + if (is_unevictable && page_evictable(page)) { if (!isolate_lru_page(page)) { put_page(page); goto redo; } /* */ } - if (was_unevictable && lru != LRU_UNEVICTABLE) + if (was_unevictable && !is_unevictable) count_vm_event(UNEVICTABLE_PGRESCUED); - else if (!was_unevictable && lru == LRU_UNEVICTABLE) + else if (!was_unevictable && is_unevictable) count_vm_event(UNEVICTABLE_PGCULLED); put_page(page); /* drop ref from isolate */ From 586a32ac1d33ce7a7548a27e4087e98842c3a06f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 11 Sep 2013 14:22:27 -0700 Subject: [PATCH 112/303] mm: munlock: remove unnecessary call to lru_add_drain() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In munlock_vma_range(), lru_add_drain() is currently called in a loop before each munlock_vma_page() call. This is suboptimal for performance when munlocking many pages. The benefits of the per-cpu pagevec for batching the LRU putback are removed, since the pagevec only holds at most one page from the previous loop's iteration. The lru_add_drain() call also does not serve any purpose for correctness - it does not even drain the pagevecs of all CPUs. The munlock code already expects and handles situations where a page cannot be isolated from the LRU (e.g. because it is on some per-cpu pagevec). The history of the (uncommented) call also suggests that it appears there as an oversight rather than intentionally. Before commit ff6a6da6 ("mm: accelerate munlock() treatment of THP pages") the call happened only once upon entering the function. That commit moved the call into the while loop. So while the other changes in the commit improved munlock performance for THP pages, it introduced the above-mentioned suboptimal per-cpu pagevec usage. Further back in history, before commit 408e82b7 ("mm: munlock use follow_page"), munlock_vma_pages_range() was just a wrapper around __mlock_vma_pages_range which performed both mlock and munlock depending on a flag. However, before ba470de4 ("mmap: handle mlocked pages during map, remap, unmap") the function handled only mlock, not munlock.
The lru_add_drain call thus comes from the implementation in commit b291f000 ("mlock: mlocked pages are unevictable") and was intended only for mlocking, not munlocking. The original intention of draining the LRU pagevec at mlock time was to ensure the pages were on the LRU before the lock operation so that they could be placed on the unevictable list immediately. There is very little motivation to do the same in the munlock path, particularly for every single page. This patch therefore removes the call completely. After removing the call, a 10% speedup was measured for munlock() of a 56GB large memory area with THP disabled. Signed-off-by: Vlastimil Babka Reviewed-by: Jörn Engel Acked-by: Mel Gorman Cc: Michel Lespinasse Cc: Hugh Dickins Cc: Rik van Riel Cc: Johannes Weiner Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/mlock.c b/mm/mlock.c index 79b7cf7d1bca..b85f1e827610 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -247,7 +247,6 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, &page_mask); if (page && !IS_ERR(page)) { lock_page(page); - lru_add_drain(); /* * Any THP page found by follow_page_mask() may have * gotten split before reaching munlock_vma_page(), From 7225522bb429a2f7dae6667e533e2d735b4882d0 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 11 Sep 2013 14:22:29 -0700 Subject: [PATCH 113/303] mm: munlock: batch non-THP page isolation and munlock+putback using pagevec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, munlock_vma_range() calls munlock_vma_page on each page in a loop, which results in repeated taking and releasing of the lru_lock spinlock for isolating pages one by one. This patch batches the munlock operations using an on-stack pagevec, so that isolation is done under a single lru_lock. For THP pages, the old behavior is preserved as they might be split while putting them into the pagevec. After this patch, a 9% speedup was measured for munlocking a 56GB large memory area with THP disabled. A new function __munlock_pagevec() is introduced that takes a pagevec and: 1) It clears PageMlocked and isolates all pages under the lru_lock. Zone page stats can also be updated using the variant which assumes disabled interrupts. 2) It finishes the munlock and lru putback on all pages under their lock_page. Note that previously, lock_page also covered the PageMlocked clearing and page isolation, but it is not needed for those operations. Signed-off-by: Vlastimil Babka Reviewed-by: Jörn Engel Acked-by: Mel Gorman Cc: Michel Lespinasse Cc: Hugh Dickins Cc: Rik van Riel Cc: Johannes Weiner Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 196 ++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 156 insertions(+), 40 deletions(-) diff --git a/mm/mlock.c b/mm/mlock.c index b85f1e827610..b3b4a78b7802 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -18,6 +19,8 @@ #include #include #include +#include +#include #include "internal.h" @@ -87,6 +90,47 @@ void mlock_vma_page(struct page *page) } } +/* + * Finish munlock after successful page isolation + * + * Page must be locked. This is a wrapper for try_to_munlock() + * and putback_lru_page() with munlock accounting.
+ */ +static void __munlock_isolated_page(struct page *page) +{ + int ret = SWAP_AGAIN; + + /* + * Optimization: if the page was mapped just once, that's our mapping + * and we don't need to check all the other vmas. + */ + if (page_mapcount(page) > 1) + ret = try_to_munlock(page); + + /* Did try_to_unlock() succeed or punt? */ + if (ret != SWAP_MLOCK) + count_vm_event(UNEVICTABLE_PGMUNLOCKED); + + putback_lru_page(page); +} + +/* + * Accounting for page isolation fail during munlock + * + * Performs accounting when page isolation fails in munlock. There is nothing + * else to do because it means some other task has already removed the page + * from the LRU. putback_lru_page() will take care of removing the page from + * the unevictable list, if necessary. vmscan [page_referenced()] will move + * the page back to the unevictable list if some other vma has it mlocked. + */ +static void __munlock_isolation_failed(struct page *page) +{ + if (PageUnevictable(page)) + count_vm_event(UNEVICTABLE_PGSTRANDED); + else + count_vm_event(UNEVICTABLE_PGMUNLOCKED); +} + /** * munlock_vma_page - munlock a vma page * @page - page to be unlocked @@ -112,37 +156,10 @@ unsigned int munlock_vma_page(struct page *page) unsigned int nr_pages = hpage_nr_pages(page); mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); page_mask = nr_pages - 1; - if (!isolate_lru_page(page)) { - int ret = SWAP_AGAIN; - - /* - * Optimization: if the page was mapped just once, - * that's our mapping and we don't need to check all the - * other vmas. - */ - if (page_mapcount(page) > 1) - ret = try_to_munlock(page); - /* - * did try_to_unlock() succeed or punt? - */ - if (ret != SWAP_MLOCK) - count_vm_event(UNEVICTABLE_PGMUNLOCKED); - - putback_lru_page(page); - } else { - /* - * Some other task has removed the page from the LRU. - * putback_lru_page() will take care of removing the - * page from the unevictable list, if necessary. - * vmscan [page_referenced()] will move the page back - * to the unevictable list if some other vma has it - * mlocked. - */ - if (PageUnevictable(page)) - count_vm_event(UNEVICTABLE_PGSTRANDED); - else - count_vm_event(UNEVICTABLE_PGMUNLOCKED); - } + if (!isolate_lru_page(page)) + __munlock_isolated_page(page); + else + __munlock_isolation_failed(page); } return page_mask; @@ -209,6 +226,73 @@ static int __mlock_posix_error_return(long retval) return retval; } +/* + * Munlock a batch of pages from the same zone + * + * The work is split to two main phases. First phase clears the Mlocked flag + * and attempts to isolate the pages, all under a single zone lru lock. + * The second phase finishes the munlock only for pages where isolation + * succeeded. + * + * Note that pvec is modified during the process. Before returning + * pagevec_reinit() is called on it. 
+ */ +static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) +{ + int i; + int nr = pagevec_count(pvec); + + /* Phase 1: page isolation */ + spin_lock_irq(&zone->lru_lock); + for (i = 0; i < nr; i++) { + struct page *page = pvec->pages[i]; + + if (TestClearPageMlocked(page)) { + struct lruvec *lruvec; + int lru; + + /* we have disabled interrupts */ + __mod_zone_page_state(zone, NR_MLOCK, -1); + + if (PageLRU(page)) { + lruvec = mem_cgroup_page_lruvec(page, zone); + lru = page_lru(page); + + get_page(page); + ClearPageLRU(page); + del_page_from_lru_list(page, lruvec, lru); + } else { + __munlock_isolation_failed(page); + goto skip_munlock; + } + + } else { +skip_munlock: + /* + * We won't be munlocking this page in the next phase + * but we still need to release the follow_page_mask() + * pin. + */ + pvec->pages[i] = NULL; + put_page(page); + } + } + spin_unlock_irq(&zone->lru_lock); + + /* Phase 2: page munlock and putback */ + for (i = 0; i < nr; i++) { + struct page *page = pvec->pages[i]; + + if (page) { + lock_page(page); + __munlock_isolated_page(page); + unlock_page(page); + put_page(page); /* pin from follow_page_mask() */ + } + } + pagevec_reinit(pvec); +} + /* * munlock_vma_pages_range() - munlock all pages in the vma range.' * @vma - vma containing range to be munlock()ed. @@ -230,11 +314,16 @@ static int __mlock_posix_error_return(long retval) void munlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { + struct pagevec pvec; + struct zone *zone = NULL; + + pagevec_init(&pvec, 0); vma->vm_flags &= ~VM_LOCKED; while (start < end) { struct page *page; unsigned int page_mask, page_increm; + struct zone *pagezone; /* * Although FOLL_DUMP is intended for get_dump_page(), @@ -246,20 +335,47 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP, &page_mask); if (page && !IS_ERR(page)) { - lock_page(page); - /* - * Any THP page found by follow_page_mask() may have - * gotten split before reaching munlock_vma_page(), - * so we need to recompute the page_mask here. - */ - page_mask = munlock_vma_page(page); - unlock_page(page); - put_page(page); + pagezone = page_zone(page); + /* The whole pagevec must be in the same zone */ + if (pagezone != zone) { + if (pagevec_count(&pvec)) + __munlock_pagevec(&pvec, zone); + zone = pagezone; + } + if (PageTransHuge(page)) { + /* + * THP pages are not handled by pagevec due + * to their possible split (see below). + */ + if (pagevec_count(&pvec)) + __munlock_pagevec(&pvec, zone); + lock_page(page); + /* + * Any THP page found by follow_page_mask() may + * have gotten split before reaching + * munlock_vma_page(), so we need to recompute + * the page_mask here. + */ + page_mask = munlock_vma_page(page); + unlock_page(page); + put_page(page); /* follow_page_mask() */ + } else { + /* + * Non-huge pages are handled in batches + * via pagevec. The pin from + * follow_page_mask() prevents them from + * collapsing by THP. 
+ */ + if (pagevec_add(&pvec, page) == 0) + __munlock_pagevec(&pvec, zone); + } } page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); start += page_increm * PAGE_SIZE; cond_resched(); } + if (pagevec_count(&pvec)) + __munlock_pagevec(&pvec, zone); } /* From 1ebb7cc6a58321a4b22c4c9097b4651b0ab859d0 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 11 Sep 2013 14:22:30 -0700 Subject: [PATCH 114/303] mm: munlock: batch NR_MLOCK zone state updates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Building on the previous patch, which introduced batched isolation in munlock_vma_range(), we can also batch the updates of the NR_MLOCK page stats. After the whole pagevec is processed for page isolation, the stats are updated only once with the number of successful isolations. There were, however, no measurable performance gains. Signed-off-by: Vlastimil Babka Reviewed-by: Jörn Engel Acked-by: Mel Gorman Cc: Michel Lespinasse Cc: Hugh Dickins Cc: Rik van Riel Cc: Johannes Weiner Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/mlock.c b/mm/mlock.c index b3b4a78b7802..b1a7c8007c89 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -241,6 +241,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) { int i; int nr = pagevec_count(pvec); + int delta_munlocked = -nr; /* Phase 1: page isolation */ spin_lock_irq(&zone->lru_lock); @@ -251,9 +252,6 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) struct lruvec *lruvec; int lru; - /* we have disabled interrupts */ - __mod_zone_page_state(zone, NR_MLOCK, -1); - if (PageLRU(page)) { lruvec = mem_cgroup_page_lruvec(page, zone); lru = page_lru(page); @@ -275,8 +273,10 @@ skip_munlock: */ pvec->pages[i] = NULL; put_page(page); + delta_munlocked++; } } + __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); spin_unlock_irq(&zone->lru_lock); /* Phase 2: page munlock and putback */ From 56afe477df3cbbcd656682d0355ef7d9eb8bdd81 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 11 Sep 2013 14:22:32 -0700 Subject: [PATCH 115/303] mm: munlock: bypass per-cpu pvec for putback_lru_page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After introducing batching by pagevecs into munlock_vma_range(), we can further improve performance by bypassing the copying into the per-cpu pagevec and the get_page/put_page pair associated with that. Instead we perform LRU putback directly from our pagevec. However, this is possible only for single-mapped pages that are evictable after munlock. Unevictable pages require rechecking after putting them on the unevictable list, so for those we fall back to putback_lru_page(), which handles that. After this patch, a 13% speedup was measured for munlocking a 56GB large memory area with THP disabled.
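The fast path relies on the standard on-stack pagevec idiom; in rough, kernel-style sketch form (an illustration of the pattern, not code from the patch):

	/*
	 * Sketch: gather pages on an on-stack pagevec and hand the whole
	 * batch to the LRU at once, instead of going through the per-cpu
	 * pagevec one page at a time.
	 */
	static void putback_batch(struct page **pages, int nr)
	{
		struct pagevec pvec;
		int i;

		pagevec_init(&pvec, 0);
		for (i = 0; i < nr; i++) {
			/* pagevec_add() returns the space left; 0 means full */
			if (pagevec_add(&pvec, pages[i]) == 0)
				__pagevec_lru_add(&pvec); /* drains and reinits pvec */
		}
		if (pagevec_count(&pvec))
			__pagevec_lru_add(&pvec);
	}

Note that __pagevec_lru_add() consumes the page references via release_pages(), which is why the fast path below does not call put_page() explicitly for the pages it puts back.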
[akpm@linux-foundation.org:clarify comment] Signed-off-by: Vlastimil Babka Reviewed-by: Jörn Engel Acked-by: Mel Gorman Cc: Michel Lespinasse Cc: Hugh Dickins Cc: Rik van Riel Cc: Johannes Weiner Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 69 insertions(+), 4 deletions(-) diff --git a/mm/mlock.c b/mm/mlock.c index b1a7c8007c89..abdc612b042d 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -226,6 +226,52 @@ static int __mlock_posix_error_return(long retval) return retval; } +/* + * Prepare page for fast batched LRU putback via putback_lru_evictable_pagevec() + * + * The fast path is available only for evictable pages with single mapping. + * Then we can bypass the per-cpu pvec and get better performance. + * when mapcount > 1 we need try_to_munlock() which can fail. + * when !page_evictable(), we need the full redo logic of putback_lru_page to + * avoid leaving evictable page in unevictable list. + * + * In case of success, @page is added to @pvec and @pgrescued is incremented + * in case that the page was previously unevictable. @page is also unlocked. + */ +static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec, + int *pgrescued) +{ + VM_BUG_ON(PageLRU(page)); + VM_BUG_ON(!PageLocked(page)); + + if (page_mapcount(page) <= 1 && page_evictable(page)) { + pagevec_add(pvec, page); + if (TestClearPageUnevictable(page)) + (*pgrescued)++; + unlock_page(page); + return true; + } + + return false; +} + +/* + * Putback multiple evictable pages to the LRU + * + * Batched putback of evictable pages that bypasses the per-cpu pvec. Some of + * the pages might have meanwhile become unevictable but that is OK. 
+ */ +static void __putback_lru_fast(struct pagevec *pvec, int pgrescued) +{ + count_vm_events(UNEVICTABLE_PGMUNLOCKED, pagevec_count(pvec)); + /* + *__pagevec_lru_add() calls release_pages() so we don't call + * put_page() explicitly + */ + __pagevec_lru_add(pvec); + count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); +} + /* * Munlock a batch of pages from the same zone * @@ -242,6 +288,8 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) int i; int nr = pagevec_count(pvec); int delta_munlocked = -nr; + struct pagevec pvec_putback; + int pgrescued = 0; /* Phase 1: page isolation */ spin_lock_irq(&zone->lru_lock); @@ -279,17 +327,34 @@ skip_munlock: __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); spin_unlock_irq(&zone->lru_lock); - /* Phase 2: page munlock and putback */ + /* Phase 2: page munlock */ + pagevec_init(&pvec_putback, 0); for (i = 0; i < nr; i++) { struct page *page = pvec->pages[i]; if (page) { lock_page(page); - __munlock_isolated_page(page); - unlock_page(page); - put_page(page); /* pin from follow_page_mask() */ + if (!__putback_lru_fast_prepare(page, &pvec_putback, + &pgrescued)) { + /* Slow path */ + __munlock_isolated_page(page); + unlock_page(page); + } } } + + /* Phase 3: page putback for pages that qualified for the fast path */ + if (pagevec_count(&pvec_putback)) + __putback_lru_fast(&pvec_putback, pgrescued); + + /* Phase 4: put_page to return pin from follow_page_mask() */ + for (i = 0; i < nr; i++) { + struct page *page = pvec->pages[i]; + + if (page) + put_page(page); + } + pagevec_reinit(pvec); } From 5b40998ae35cf64561868370e6c9f3d3e94b6bf7 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 11 Sep 2013 14:22:33 -0700 Subject: [PATCH 116/303] mm: munlock: remove redundant get_page/put_page pair on the fast path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The performance of the fast path in munlock_vma_range() can be further improved by avoiding atomic ops of a redundant get_page()/put_page() pair. When calling get_page() during page isolation, we already have the pin from follow_page_mask(). This pin will be then returned by __pagevec_lru_add(), after which we do not reference the pages anymore. After this patch, an 8% speedup was measured for munlocking a 56GB large memory area with THP disabled. Signed-off-by: Vlastimil Babka Reviewed-by: Jörn Engel Acked-by: Mel Gorman Cc: Michel Lespinasse Cc: Hugh Dickins Cc: Rik van Riel Cc: Johannes Weiner Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/mm/mlock.c b/mm/mlock.c index abdc612b042d..19a934dce5d6 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -303,8 +303,10 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) if (PageLRU(page)) { lruvec = mem_cgroup_page_lruvec(page, zone); lru = page_lru(page); - - get_page(page); + /* + * We already have pin from follow_page_mask() + * so we can spare the get_page() here. + */ ClearPageLRU(page); del_page_from_lru_list(page, lruvec, lru); } else { @@ -336,25 +338,25 @@ skip_munlock: lock_page(page); if (!__putback_lru_fast_prepare(page, &pvec_putback, &pgrescued)) { - /* Slow path */ + /* + * Slow path. 
We don't want to lose the last + * pin before unlock_page() + */ + get_page(page); /* for putback_lru_page() */ __munlock_isolated_page(page); unlock_page(page); + put_page(page); /* from follow_page_mask() */ } } } - /* Phase 3: page putback for pages that qualified for the fast path */ + /* + * Phase 3: page putback for pages that qualified for the fast path + * This will also call put_page() to return pin from follow_page_mask() + */ if (pagevec_count(&pvec_putback)) __putback_lru_fast(&pvec_putback, pgrescued); - /* Phase 4: put_page to return pin from follow_page_mask() */ - for (i = 0; i < nr; i++) { - struct page *page = pvec->pages[i]; - - if (page) - put_page(page); - } - pagevec_reinit(pvec); } From 7a8010cd36273ff5f6fea5201ef9232f30cebbd9 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 11 Sep 2013 14:22:35 -0700 Subject: [PATCH 117/303] mm: munlock: manual pte walk in fast path instead of follow_page_mask() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently munlock_vma_pages_range() calls follow_page_mask() to obtain each individual struct page. This entails repeated full page table translations and page table lock taken for each page separately. This patch avoids the costly follow_page_mask() where possible, by iterating over ptes within single pmd under single page table lock. The first pte is obtained by get_locked_pte() for non-THP page acquired by the initial follow_page_mask(). The rest of the on-stack pagevec for munlock is filled up using pte_walk as long as pte_present() and vm_normal_page() are sufficient to obtain the struct page. After this patch, a 14% speedup was measured for munlocking a 56GB large memory area with THP disabled. Signed-off-by: Vlastimil Babka Cc: Jörn Engel Cc: Mel Gorman Cc: Michel Lespinasse Cc: Hugh Dickins Cc: Rik van Riel Cc: Johannes Weiner Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 12 ++--- mm/mlock.c | 110 ++++++++++++++++++++++++++++++++------------- 2 files changed, 85 insertions(+), 37 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index dce24569f8fc..03f84b8d7359 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -643,12 +643,12 @@ static inline enum zone_type page_zonenum(const struct page *page) #endif /* - * The identification function is only used by the buddy allocator for - * determining if two pages could be buddies. We are not really - * identifying a zone since we could be using a the section number - * id if we have not node id available in page flags. - * We guarantee only that it will return the same value for two - * combinable pages in a zone. + * The identification function is mainly used by the buddy allocator for + * determining if two pages could be buddies. We are not really identifying + * the zone since we could be using the section number id if we do not have + * node id available in page flags. + * We only guarantee that it will return the same value for two combinable + * pages in a zone. */ static inline int page_zone_id(struct page *page) { diff --git a/mm/mlock.c b/mm/mlock.c index 19a934dce5d6..d63802663242 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -280,8 +280,7 @@ static void __putback_lru_fast(struct pagevec *pvec, int pgrescued) * The second phase finishes the munlock only for pages where isolation * succeeded. * - * Note that pvec is modified during the process. Before returning - * pagevec_reinit() is called on it. 
+ * Note that the pagevec may be modified during the process. */ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) { @@ -356,8 +355,60 @@ skip_munlock: */ if (pagevec_count(&pvec_putback)) __putback_lru_fast(&pvec_putback, pgrescued); +} - pagevec_reinit(pvec); +/* + * Fill up pagevec for __munlock_pagevec using pte walk + * + * The function expects that the struct page corresponding to @start address is + * a non-TPH page already pinned and in the @pvec, and that it belongs to @zone. + * + * The rest of @pvec is filled by subsequent pages within the same pmd and same + * zone, as long as the pte's are present and vm_normal_page() succeeds. These + * pages also get pinned. + * + * Returns the address of the next page that should be scanned. This equals + * @start + PAGE_SIZE when no page could be added by the pte walk. + */ +static unsigned long __munlock_pagevec_fill(struct pagevec *pvec, + struct vm_area_struct *vma, int zoneid, unsigned long start, + unsigned long end) +{ + pte_t *pte; + spinlock_t *ptl; + + /* + * Initialize pte walk starting at the already pinned page where we + * are sure that there is a pte. + */ + pte = get_locked_pte(vma->vm_mm, start, &ptl); + end = min(end, pmd_addr_end(start, end)); + + /* The page next to the pinned page is the first we will try to get */ + start += PAGE_SIZE; + while (start < end) { + struct page *page = NULL; + pte++; + if (pte_present(*pte)) + page = vm_normal_page(vma, start, *pte); + /* + * Break if page could not be obtained or the page's node+zone does not + * match + */ + if (!page || page_zone_id(page) != zoneid) + break; + + get_page(page); + /* + * Increase the address that will be returned *before* the + * eventual break due to pvec becoming full by adding the page + */ + start += PAGE_SIZE; + if (pagevec_add(pvec, page) == 0) + break; + } + pte_unmap_unlock(pte, ptl); + return start; } /* @@ -381,17 +432,16 @@ skip_munlock: void munlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - struct pagevec pvec; - struct zone *zone = NULL; - - pagevec_init(&pvec, 0); vma->vm_flags &= ~VM_LOCKED; while (start < end) { - struct page *page; + struct page *page = NULL; unsigned int page_mask, page_increm; - struct zone *pagezone; + struct pagevec pvec; + struct zone *zone; + int zoneid; + pagevec_init(&pvec, 0); /* * Although FOLL_DUMP is intended for get_dump_page(), * it just so happens that its special treatment of the @@ -400,22 +450,10 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, * has sneaked into the range, we won't oops here: great). */ page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP, - &page_mask); + &page_mask); + if (page && !IS_ERR(page)) { - pagezone = page_zone(page); - /* The whole pagevec must be in the same zone */ - if (pagezone != zone) { - if (pagevec_count(&pvec)) - __munlock_pagevec(&pvec, zone); - zone = pagezone; - } if (PageTransHuge(page)) { - /* - * THP pages are not handled by pagevec due - * to their possible split (see below). - */ - if (pagevec_count(&pvec)) - __munlock_pagevec(&pvec, zone); lock_page(page); /* * Any THP page found by follow_page_mask() may @@ -428,21 +466,31 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, put_page(page); /* follow_page_mask() */ } else { /* - * Non-huge pages are handled in batches - * via pagevec. The pin from - * follow_page_mask() prevents them from - * collapsing by THP. + * Non-huge pages are handled in batches via + * pagevec. 
The pin from follow_page_mask() + * prevents them from collapsing by THP. */ - if (pagevec_add(&pvec, page) == 0) - __munlock_pagevec(&pvec, zone); + pagevec_add(&pvec, page); + zone = page_zone(page); + zoneid = page_zone_id(page); + + /* + * Try to fill the rest of pagevec using fast + * pte walk. This will also update start to + * the next page to process. Then munlock the + * pagevec. + */ + start = __munlock_pagevec_fill(&pvec, vma, + zoneid, start, end); + __munlock_pagevec(&pvec, zone); + goto next; + } } page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); start += page_increm * PAGE_SIZE; +next: cond_resched(); } - if (pagevec_count(&pvec)) - __munlock_pagevec(&pvec, zone); } /* From 6e543d5780e36ff5ee56c44d7e2e30db3457a7ed Mon Sep 17 00:00:00 2001 From: Lisa Du Date: Wed, 11 Sep 2013 14:22:36 -0700 Subject: [PATCH 118/303] mm: vmscan: fix do_try_to_free_pages() livelock This patch is based on KOSAKI's work, with a little more description added; please refer to https://lkml.org/lkml/2012/6/14/74. I found that the system can enter a state where a zone has lots of free pages but only order-0 and order-1 ones, which means the zone is heavily fragmented; a high-order allocation can then stall for a long time (e.g. 60 seconds) in the direct reclaim path, especially in a no-swap, no-compaction environment. This problem happened on v3.4, but the issue still seems to live in the current tree. The reason is that do_try_to_free_pages() enters a livelock: kswapd will go to sleep if the zones have been fully scanned and are still not balanced, as kswapd thinks there is little point in trying all over again and wants to avoid an infinite loop. Instead it changes the order from high-order to order-0, because kswapd thinks order-0 is the most important. Look at commit 73ce02e9 for the details. If watermarks are ok, kswapd will go back to sleep and may leave zone->all_unreclaimable = 0. It assumes high-order users can still perform direct reclaim if they wish. Direct reclaim continues to reclaim for a high order which is not a COSTLY_ORDER, without invoking the oom-killer, until kswapd turns on zone->all_unreclaimable. This is to avoid a too-early oom-kill. So direct reclaim depends on kswapd to break this loop. In the worst case, direct reclaim may continue page reclaim forever while kswapd sleeps forever, until something like a watchdog detects the situation and finally kills the process. As described in: http://thread.gmane.org/gmane.linux.kernel.mm/103737 We can't turn on zone->all_unreclaimable from the direct reclaim path because the direct reclaim path doesn't take any lock, so that way is racy. Thus this patch removes the zone->all_unreclaimable field completely and recalculates the zone reclaimable state every time. Note: we can't take the approach of having direct reclaim look at zone->pages_scanned directly while kswapd continues to use zone->all_unreclaimable, because that is racy too; commit 929bea7c71 ("vmscan: all_unreclaimable() use zone->all_unreclaimable as a name") describes the details.
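In sketch form, the change of approach looks like this (simplified for illustration; give_up() is a hypothetical placeholder, and the real helpers are in the diff below):

	/*
	 * Before: a cached flag that in practice only kswapd sets. If
	 * kswapd stays asleep, nothing sets it and direct reclaim can
	 * loop forever waiting for it.
	 */
	if (zone->all_unreclaimable)
		give_up();

	/*
	 * After: every reclaimer recomputes the state from live
	 * counters. A zone counts as unreclaimable once it has been
	 * scanned six times over without pages being freed (freeing
	 * resets zone->pages_scanned).
	 */
	if (zone->pages_scanned >= zone_reclaimable_pages(zone) * 6)
		give_up();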
[akpm@linux-foundation.org: uninline zone_reclaimable_pages() and zone_reclaimable()] Cc: Aaditya Kumar Cc: Ying Han Cc: Nick Piggin Acked-by: Rik van Riel Cc: Mel Gorman Cc: KAMEZAWA Hiroyuki Cc: Christoph Lameter Cc: Bob Liu Cc: Neil Zhang Cc: Russell King - ARM Linux Reviewed-by: Michal Hocko Acked-by: Minchan Kim Acked-by: Johannes Weiner Signed-off-by: KOSAKI Motohiro Signed-off-by: Lisa Du Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 1 + include/linux/mmzone.h | 1 - include/linux/vmstat.h | 1 - mm/internal.h | 2 ++ mm/migrate.c | 2 +- mm/page-writeback.c | 3 ++ mm/page_alloc.c | 5 ++- mm/vmscan.c | 66 ++++++++++++++++++--------------------- mm/vmstat.c | 5 ++- 9 files changed, 44 insertions(+), 42 deletions(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 1397ccf81e91..cf55945c83fb 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -2,6 +2,7 @@ #define LINUX_MM_INLINE_H #include +#include /** * page_is_file_cache - should the page be on a file LRU or anon LRU? diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ac1ea796ec0f..bd791e452ad7 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -353,7 +353,6 @@ struct zone { * free areas of different sizes */ spinlock_t lock; - int all_unreclaimable; /* All pages pinned */ #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* Set to true when the PG_migrate_skip bits should be cleared */ bool compact_blockskip_flush; diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 502767f4e4d4..e4b948080d20 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -143,7 +143,6 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone, } extern unsigned long global_reclaimable_pages(void); -extern unsigned long zone_reclaimable_pages(struct zone *zone); #ifdef CONFIG_NUMA /* diff --git a/mm/internal.h b/mm/internal.h index 4390ac6c106e..684f7aa9692a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -85,6 +85,8 @@ extern unsigned long highest_memmap_pfn; */ extern int isolate_lru_page(struct page *page); extern void putback_lru_page(struct page *page); +extern unsigned long zone_reclaimable_pages(struct zone *zone); +extern bool zone_reclaimable(struct zone *zone); /* * in mm/rmap.c: diff --git a/mm/migrate.c b/mm/migrate.c index 61f14a1923fd..b7ded7eafe3a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1471,7 +1471,7 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat, if (!populated_zone(zone)) continue; - if (zone->all_unreclaimable) + if (!zone_reclaimable(zone)) continue; /* Avoid waking kswapd by allocating pages_to_migrate pages. */ diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d374b29296dd..3750431b3cd8 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -36,8 +36,11 @@ #include #include #include +#include #include +#include "internal.h" + /* * Sleep at most 200ms at a time in balance_dirty_pages(). 
*/ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7b1b706a1ffa..ff2782576e39 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include @@ -647,7 +648,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, int to_free = count; spin_lock(&zone->lock); - zone->all_unreclaimable = 0; zone->pages_scanned = 0; while (to_free) { @@ -696,7 +696,6 @@ static void free_one_page(struct zone *zone, struct page *page, int order, int migratetype) { spin_lock(&zone->lock); - zone->all_unreclaimable = 0; zone->pages_scanned = 0; __free_one_page(page, zone, order, migratetype); @@ -3164,7 +3163,7 @@ void show_free_areas(unsigned int filter) K(zone_page_state(zone, NR_FREE_CMA_PAGES)), K(zone_page_state(zone, NR_WRITEBACK_TEMP)), zone->pages_scanned, - (zone->all_unreclaimable ? "yes" : "no") + (!zone_reclaimable(zone) ? "yes" : "no") ); printk("lowmem_reserve[]:"); for (i = 0; i < MAX_NR_ZONES; i++) diff --git a/mm/vmscan.c b/mm/vmscan.c index 44c072a7cba2..fe715daeb8bc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -146,6 +146,25 @@ static bool global_reclaim(struct scan_control *sc) } #endif +unsigned long zone_reclaimable_pages(struct zone *zone) +{ + int nr; + + nr = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_FILE); + + if (get_nr_swap_pages() > 0) + nr += zone_page_state(zone, NR_ACTIVE_ANON) + + zone_page_state(zone, NR_INACTIVE_ANON); + + return nr; +} + +bool zone_reclaimable(struct zone *zone) +{ + return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; +} + static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) { if (!mem_cgroup_disabled()) @@ -1789,7 +1808,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * latencies, so it's better to scan a minimum amount there as * well. */ - if (current_is_kswapd() && zone->all_unreclaimable) + if (current_is_kswapd() && !zone_reclaimable(zone)) force_scan = true; if (!global_reclaim(sc)) force_scan = true; @@ -2244,8 +2263,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) if (global_reclaim(sc)) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; - if (zone->all_unreclaimable && - sc->priority != DEF_PRIORITY) + if (sc->priority != DEF_PRIORITY && + !zone_reclaimable(zone)) continue; /* Let kswapd poll it */ if (IS_ENABLED(CONFIG_COMPACTION)) { /* @@ -2283,11 +2302,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) return aborted_reclaim; } -static bool zone_reclaimable(struct zone *zone) -{ - return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; -} - /* All zones in zonelist are unreclaimable? */ static bool all_unreclaimable(struct zonelist *zonelist, struct scan_control *sc) @@ -2301,7 +2315,7 @@ static bool all_unreclaimable(struct zonelist *zonelist, continue; if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; - if (!zone->all_unreclaimable) + if (zone_reclaimable(zone)) return false; } @@ -2712,7 +2726,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) * DEF_PRIORITY. Effectively, it considers them balanced so * they must be considered balanced here as well! 
*/ - if (zone->all_unreclaimable) { + if (!zone_reclaimable(zone)) { balanced_pages += zone->managed_pages; continue; } @@ -2773,7 +2787,6 @@ static bool kswapd_shrink_zone(struct zone *zone, unsigned long lru_pages, unsigned long *nr_attempted) { - unsigned long nr_slab; int testorder = sc->order; unsigned long balance_gap; struct reclaim_state *reclaim_state = current->reclaim_state; @@ -2818,15 +2831,12 @@ static bool kswapd_shrink_zone(struct zone *zone, shrink_zone(zone, sc); reclaim_state->reclaimed_slab = 0; - nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages); + shrink_slab(&shrink, sc->nr_scanned, lru_pages); sc->nr_reclaimed += reclaim_state->reclaimed_slab; /* Account for the number of pages attempted to reclaim */ *nr_attempted += sc->nr_to_reclaim; - if (nr_slab == 0 && !zone_reclaimable(zone)) - zone->all_unreclaimable = 1; - zone_clear_flag(zone, ZONE_WRITEBACK); /* @@ -2835,7 +2845,7 @@ static bool kswapd_shrink_zone(struct zone *zone, * BDIs but as pressure is relieved, speculatively avoid congestion * waits. */ - if (!zone->all_unreclaimable && + if (zone_reclaimable(zone) && zone_balanced(zone, testorder, 0, classzone_idx)) { zone_clear_flag(zone, ZONE_CONGESTED); zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); @@ -2901,8 +2911,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, if (!populated_zone(zone)) continue; - if (zone->all_unreclaimable && - sc.priority != DEF_PRIORITY) + if (sc.priority != DEF_PRIORITY && + !zone_reclaimable(zone)) continue; /* @@ -2980,8 +2990,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, if (!populated_zone(zone)) continue; - if (zone->all_unreclaimable && - sc.priority != DEF_PRIORITY) + if (sc.priority != DEF_PRIORITY && + !zone_reclaimable(zone)) continue; sc.nr_scanned = 0; @@ -3265,20 +3275,6 @@ unsigned long global_reclaimable_pages(void) return nr; } -unsigned long zone_reclaimable_pages(struct zone *zone) -{ - int nr; - - nr = zone_page_state(zone, NR_ACTIVE_FILE) + - zone_page_state(zone, NR_INACTIVE_FILE); - - if (get_nr_swap_pages() > 0) - nr += zone_page_state(zone, NR_ACTIVE_ANON) + - zone_page_state(zone, NR_INACTIVE_ANON); - - return nr; -} - #ifdef CONFIG_HIBERNATION /* * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of @@ -3576,7 +3572,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) return ZONE_RECLAIM_FULL; - if (zone->all_unreclaimable) + if (!zone_reclaimable(zone)) return ZONE_RECLAIM_FULL; /* diff --git a/mm/vmstat.c b/mm/vmstat.c index d57a09143bf9..9bb314577911 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -19,6 +19,9 @@ #include #include #include +#include + +#include "internal.h" #ifdef CONFIG_VM_EVENT_COUNTERS DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; @@ -1088,7 +1091,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n all_unreclaimable: %u" "\n start_pfn: %lu" "\n inactive_ratio: %u", - zone->all_unreclaimable, + !zone_reclaimable(zone), zone->zone_start_pfn, zone->inactive_ratio); seq_putc(m, '\n'); From 187320932dcece9c4b93f38f56d1f888bd5c325f Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 11 Sep 2013 14:22:38 -0700 Subject: [PATCH 119/303] mm/sparse: introduce alloc_usemap_and_memmap After commit 9bdac9142407 ("sparsemem: Put mem map for one node together."), vmemmap for one node will be allocated together, its logic is similar as memory allocation for pageblock flags. 
This patch introduces alloc_usemap_and_memmap to extract the same logic of memory alloction for pageblock flags and vmemmap. Signed-off-by: Wanpeng Li Cc: Dave Hansen Cc: Rik van Riel Cc: Fengguang Wu Cc: Joonsoo Kim Cc: Johannes Weiner Cc: Tejun Heo Cc: Yasuaki Ishimatsu Cc: David Rientjes Cc: KOSAKI Motohiro Cc: Jiri Kosina Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 147 +++++++++++++++++++++++----------------------------- 1 file changed, 64 insertions(+), 83 deletions(-) diff --git a/mm/sparse.c b/mm/sparse.c index 308d50331bc3..4ac1d7ef548f 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -339,13 +339,14 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) } #endif /* CONFIG_MEMORY_HOTREMOVE */ -static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map, +static void __init sparse_early_usemaps_alloc_node(void *data, unsigned long pnum_begin, unsigned long pnum_end, unsigned long usemap_count, int nodeid) { void *usemap; unsigned long pnum; + unsigned long **usemap_map = (unsigned long **)data; int size = usemap_size(); usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), @@ -430,11 +431,12 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER -static void __init sparse_early_mem_maps_alloc_node(struct page **map_map, +static void __init sparse_early_mem_maps_alloc_node(void *data, unsigned long pnum_begin, unsigned long pnum_end, unsigned long map_count, int nodeid) { + struct page **map_map = (struct page **)data; sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end, map_count, nodeid); } @@ -460,88 +462,18 @@ void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) { } -/* - * Allocate the accumulated non-linear sections, allocate a mem_map - * for each and record the physical to section mapping. +/** + * alloc_usemap_and_memmap - memory alloction for pageblock flags and vmemmap + * @map: usemap_map for pageblock flags or mmap_map for vmemmap */ -void __init sparse_init(void) +static void __init alloc_usemap_and_memmap(void (*alloc_func) + (void *, unsigned long, unsigned long, + unsigned long, int), void *data) { unsigned long pnum; - struct page *map; - unsigned long *usemap; - unsigned long **usemap_map; - int size; + unsigned long map_count; int nodeid_begin = 0; unsigned long pnum_begin = 0; - unsigned long usemap_count; -#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER - unsigned long map_count; - int size2; - struct page **map_map; -#endif - - /* see include/linux/mmzone.h 'struct mem_section' definition */ - BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section))); - - /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */ - set_pageblock_order(); - - /* - * map is using big page (aka 2M in x86 64 bit) - * usemap is less one page (aka 24 bytes) - * so alloc 2M (with 2M align) and 24 bytes in turn will - * make next 2M slip to one more 2M later. - * then in big system, the memory will have a lot of holes... - * here try to allocate 2M pages continuously. - * - * powerpc need to call sparse_init_one_section right after each - * sparse_early_mem_map_alloc, so allocate usemap_map at first. 
- */ - size = sizeof(unsigned long *) * NR_MEM_SECTIONS; - usemap_map = alloc_bootmem(size); - if (!usemap_map) - panic("can not allocate usemap_map\n"); - - for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { - struct mem_section *ms; - - if (!present_section_nr(pnum)) - continue; - ms = __nr_to_section(pnum); - nodeid_begin = sparse_early_nid(ms); - pnum_begin = pnum; - break; - } - usemap_count = 1; - for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) { - struct mem_section *ms; - int nodeid; - - if (!present_section_nr(pnum)) - continue; - ms = __nr_to_section(pnum); - nodeid = sparse_early_nid(ms); - if (nodeid == nodeid_begin) { - usemap_count++; - continue; - } - /* ok, we need to take cake of from pnum_begin to pnum - 1*/ - sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, pnum, - usemap_count, nodeid_begin); - /* new start, update count etc*/ - nodeid_begin = nodeid; - pnum_begin = pnum; - usemap_count = 1; - } - /* ok, last chunk */ - sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, NR_MEM_SECTIONS, - usemap_count, nodeid_begin); - -#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER - size2 = sizeof(struct page *) * NR_MEM_SECTIONS; - map_map = alloc_bootmem(size2); - if (!map_map) - panic("can not allocate map_map\n"); for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { struct mem_section *ms; @@ -567,16 +499,65 @@ void __init sparse_init(void) continue; } /* ok, we need to take cake of from pnum_begin to pnum - 1*/ - sparse_early_mem_maps_alloc_node(map_map, pnum_begin, pnum, - map_count, nodeid_begin); + alloc_func(data, pnum_begin, pnum, + map_count, nodeid_begin); /* new start, update count etc*/ nodeid_begin = nodeid; pnum_begin = pnum; map_count = 1; } /* ok, last chunk */ - sparse_early_mem_maps_alloc_node(map_map, pnum_begin, NR_MEM_SECTIONS, - map_count, nodeid_begin); + alloc_func(data, pnum_begin, NR_MEM_SECTIONS, + map_count, nodeid_begin); +} + +/* + * Allocate the accumulated non-linear sections, allocate a mem_map + * for each and record the physical to section mapping. + */ +void __init sparse_init(void) +{ + unsigned long pnum; + struct page *map; + unsigned long *usemap; + unsigned long **usemap_map; + int size; +#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER + int size2; + struct page **map_map; +#endif + + /* see include/linux/mmzone.h 'struct mem_section' definition */ + BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section))); + + /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */ + set_pageblock_order(); + + /* + * map is using big page (aka 2M in x86 64 bit) + * usemap is less one page (aka 24 bytes) + * so alloc 2M (with 2M align) and 24 bytes in turn will + * make next 2M slip to one more 2M later. + * then in big system, the memory will have a lot of holes... + * here try to allocate 2M pages continuously. + * + * powerpc need to call sparse_init_one_section right after each + * sparse_early_mem_map_alloc, so allocate usemap_map at first. 
+ */ + size = sizeof(unsigned long *) * NR_MEM_SECTIONS; + usemap_map = alloc_bootmem(size); + if (!usemap_map) + panic("can not allocate usemap_map\n"); + alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node, + (void *)usemap_map); + +#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER + size2 = sizeof(struct page *) * NR_MEM_SECTIONS; + map_map = alloc_bootmem(size2); + if (!map_map) + panic("can not allocate map_map\n"); + alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node, + (void *)map_map); #endif for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { From 7d9f073b8da45a894bb7148433bd84d21eed6757 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 11 Sep 2013 14:22:40 -0700 Subject: [PATCH 120/303] mm/writeback: make writeback_inodes_wb static It's not used globally, so it can be static. Signed-off-by: Wanpeng Li Cc: Dave Hansen Cc: Rik van Riel Cc: Fengguang Wu Cc: Joonsoo Kim Cc: Johannes Weiner Cc: Tejun Heo Cc: Yasuaki Ishimatsu Cc: David Rientjes Cc: KOSAKI Motohiro Cc: Jiri Kosina Cc: Wanpeng Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 2 +- include/linux/writeback.h | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 87d778118027..54b3c31c2f0d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -723,7 +723,7 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb, return wrote; } -long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, +static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, enum wb_reason reason) { struct wb_writeback_work work = { diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 4e198ca1f685..021b8a319b9e 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -98,8 +98,6 @@ int try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason); int try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr, enum wb_reason reason); void sync_inodes_sb(struct super_block *); -long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, - enum wb_reason reason); void wakeup_flusher_threads(long nr_pages, enum wb_reason reason); void inode_wait_for_writeback(struct inode *inode); From 762216ab4e175f49d17bc7ad778c57b9028184e6 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 11 Sep 2013 14:22:42 -0700 Subject: [PATCH 121/303] mm/vmalloc: use wrapper function get_vm_area_size to calculate size of vm area Use the wrapper function get_vm_area_size() to calculate the size of a vm area.
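For reference, the wrapper being switched to is essentially the following (as it stood in include/linux/vmalloc.h around this time); the point of the conversion is that the guard-page subtraction lives in one place instead of being open-coded as area->size - PAGE_SIZE at each call site:

	static inline size_t get_vm_area_size(const struct vm_struct *area)
	{
		/* vm_struct::size includes the trailing guard page */
		return area->size - PAGE_SIZE;
	}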
Signed-off-by: Wanpeng Li Cc: Dave Hansen Cc: Rik van Riel Cc: Fengguang Wu Cc: Joonsoo Kim Cc: Johannes Weiner Cc: Tejun Heo Cc: Yasuaki Ishimatsu Cc: David Rientjes Cc: KOSAKI Motohiro Cc: Jiri Kosina Cc: Wanpeng Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 93d3182c3300..107454312d5e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1258,7 +1258,7 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) { unsigned long addr = (unsigned long)area->addr; - unsigned long end = addr + area->size - PAGE_SIZE; + unsigned long end = addr + get_vm_area_size(area); int err; err = vmap_page_range(addr, end, prot, *pages); @@ -1553,7 +1553,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, unsigned int nr_pages, array_size, i; gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; - nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT; + nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; array_size = (nr_pages * sizeof(struct page *)); area->nr_pages = nr_pages; @@ -1985,7 +1985,7 @@ long vread(char *buf, char *addr, unsigned long count) vm = va->vm; vaddr = (char *) vm->addr; - if (addr >= vaddr + vm->size - PAGE_SIZE) + if (addr >= vaddr + get_vm_area_size(vm)) continue; while (addr < vaddr) { if (count == 0) @@ -1995,7 +1995,7 @@ long vread(char *buf, char *addr, unsigned long count) addr++; count--; } - n = vaddr + vm->size - PAGE_SIZE - addr; + n = vaddr + get_vm_area_size(vm) - addr; if (n > count) n = count; if (!(vm->flags & VM_IOREMAP)) @@ -2067,7 +2067,7 @@ long vwrite(char *buf, char *addr, unsigned long count) vm = va->vm; vaddr = (char *) vm->addr; - if (addr >= vaddr + vm->size - PAGE_SIZE) + if (addr >= vaddr + get_vm_area_size(vm)) continue; while (addr < vaddr) { if (count == 0) @@ -2076,7 +2076,7 @@ long vwrite(char *buf, char *addr, unsigned long count) addr++; count--; } - n = vaddr + vm->size - PAGE_SIZE - addr; + n = vaddr + get_vm_area_size(vm) - addr; if (n > count) n = count; if (!(vm->flags & VM_IOREMAP)) { From 1ecfd533f4c528b0b4cc5bc115c4c47f0b5e4828 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Wed, 11 Sep 2013 14:22:43 -0700 Subject: [PATCH 122/303] mm/mremap.c: call pud_free() after fail calling pmd_alloc() In alloc_new_pmd(), if pud_alloc() was called successfully, but pmd_alloc() fails, avoid leaking `pud'. Signed-off-by: Chen Gang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mremap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/mremap.c b/mm/mremap.c index 0843feb66f3d..91b13d6a16d4 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "internal.h" @@ -62,8 +63,10 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, return NULL; pmd = pmd_alloc(mm, pud, addr); - if (!pmd) + if (!pmd) { + pud_free(mm, pud); return NULL; + } VM_BUG_ON(pmd_trans_huge(*pmd)); From 4c3bffc272755c98728c2b58b1a8148cf9e9fd1f Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Wed, 11 Sep 2013 14:22:44 -0700 Subject: [PATCH 123/303] mm/backing-dev.c: check user buffer length before copying data to the related user buffer '*lenp' may be less than "sizeof(kbuf)" so we must check this before the next copy_to_user(). 
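To make the failure mode concrete, here is a minimal sketch of the handler pattern (hypothetical handler name, simplified from the real code): without the *lenp bound, a 1-byte user buffer would be asked to receive sizeof(kbuf) == 3 bytes.

	/*
	 * Sketch (hypothetical handler): a sysctl read handler that
	 * returns a fixed string must bound its copy by *lenp, the
	 * caller-supplied buffer length.
	 */
	static int dummy_proc_handler(struct ctl_table *table, int write,
				      void __user *buffer, size_t *lenp,
				      loff_t *ppos)
	{
		char kbuf[] = "0\n";

		if (*ppos || *lenp < sizeof(kbuf)) {
			*lenp = 0;	/* nothing (more) to report */
			return 0;
		}
		if (copy_to_user(buffer, kbuf, sizeof(kbuf)))
			return -EFAULT;

		*lenp = sizeof(kbuf);
		*ppos += *lenp;
		return 0;
	}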
pdflush_proc_obsolete() is called by sysctl for the 'procname' "nr_pdflush_threads"; if the user passes a buffer length less than sizeof(kbuf), it will cause an issue. Signed-off-by: Chen Gang Reviewed-by: Jan Kara Cc: Tejun Heo Cc: Jeff Moyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/backing-dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 37d9edcd14cf..ce682f7a4f29 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -652,7 +652,7 @@ int pdflush_proc_obsolete(struct ctl_table *table, int write, { char kbuf[] = "0\n"; - if (*ppos) { + if (*ppos || *lenp < sizeof(kbuf)) { *lenp = 0; return 0; } From 5a53748568f79641eaf40e41081a2f4987f005c2 Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Wed, 11 Sep 2013 14:22:46 -0700 Subject: [PATCH 124/303] mm/page-writeback.c: add strictlimit feature The feature prevents mistrusted filesystems (i.e. FUSE mounts created by unprivileged users) from growing a large number of dirty pages before throttling. For such filesystems balance_dirty_pages always checks bdi counters against bdi limits. I.e. even if global "nr_dirty" is under "freerun", it's not allowed to skip bdi checks. The only use case for now is fuse: it sets bdi max_ratio to 1% by default and system administrators are supposed to expect that this limit won't be exceeded. The feature is on if a BDI is marked with the BDI_CAP_STRICTLIMIT flag. A filesystem may set the flag when it initializes its BDI. The problematic scenario comes from the fact that nobody pays attention to the NR_WRITEBACK_TEMP counter (i.e. the number of pages under fuse writeback). The implementation of fuse writeback releases the original page (by calling end_page_writeback) almost immediately. A fuse request queued for real processing bears a copy of the original page. Hence, if the userspace fuse daemon doesn't finalize write requests in a timely manner, an aggressive mmap writer can pollute virtually all memory with those temporary fuse page copies. They are carefully accounted in NR_WRITEBACK_TEMP, but nobody cares. To make further explanations shorter, let me use "NR_WRITEBACK_TEMP problem" as a shortcut for "the possibility of uncontrolled growth of the amount of RAM consumed by temporary pages allocated by kernel fuse to process writeback". The problem was very easy to reproduce. There is a trivial example filesystem implementation in the fuse userspace distribution: fusexmp_fh.c. I added "sleep(1);" to the write methods, then recompiled and mounted it. Then I created a huge file on the mount point and ran a simple program which mmap-ed the file to a memory region, then wrote data to the region. An hour later I observed almost all RAM consumed by fuse writeback. Since then some unrelated changes in kernel fuse have made it more difficult to reproduce, but it is still possible now. Putting this theoretical happens-in-the-lab thing aside, there is another thing that really hurts real-world (FUSE) users. This is the write-through page cache policy FUSE currently uses. I.e. when handling write(2), kernel fuse populates the page cache and flushes user data to the server synchronously. This is excessively suboptimal. Pavel Emelyanov's patches ("writeback cache policy") solve the problem, but they also make resolving the NR_WRITEBACK_TEMP problem absolutely necessary. Otherwise, simply copying a huge file to a fuse mount would result in memory starvation. Miklos, the maintainer of FUSE, believes the strictlimit feature is the way to go.
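To show how a filesystem opts in, here is a minimal sketch of BDI initialization (mirroring the fuse hunk in this patch; "myfs" and the surrounding function are hypothetical):

	static int myfs_bdi_init(struct backing_dev_info *bdi)
	{
		bdi->name = "myfs";
		/* opt this BDI into per-bdi strict dirty limiting */
		bdi->capabilities |= BDI_CAP_STRICTLIMIT;
		return bdi_init(bdi);
	}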
And eventually putting FUSE topics aside, there is one more use-case for strictlimit feature. Using a slow USB stick (mass storage) in a machine with huge amount of RAM installed is a well-known pain. Let's make simple computations. Assuming 64GB of RAM installed, existing implementation of balance_dirty_pages will start throttling only after 9.6GB of RAM becomes dirty (freerun == 15% of total RAM). So, the command "cp 9GB_file /media/my-usb-storage/" may return in a few seconds, but subsequent "umount /media/my-usb-storage/" will take more than two hours if effective throughput of the storage is, to say, 1MB/sec. After inclusion of strictlimit feature, it will be trivial to add a knob (e.g. /sys/devices/virtual/bdi/x:y/strictlimit) to enable it on demand. Manually or via udev rule. May be I'm wrong, but it seems to be quite a natural desire to limit the amount of dirty memory for some devices we are not fully trust (in the sense of sustainable throughput). [akpm@linux-foundation.org: fix warning in page-writeback.c] Signed-off-by: Maxim Patlasov Cc: Jan Kara Cc: Miklos Szeredi Cc: Wu Fengguang Cc: Pavel Emelyanov Cc: James Bottomley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/inode.c | 2 +- include/linux/backing-dev.h | 3 + mm/page-writeback.c | 265 +++++++++++++++++++++++++++--------- 3 files changed, 207 insertions(+), 63 deletions(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index e0fe703ee3d6..84434594e80e 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -930,7 +930,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) fc->bdi.name = "fuse"; fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; /* fuse does it's own writeback accounting */ - fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB; + fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT; err = bdi_init(&fc->bdi); if (err) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index c3881553f7d1..5f66d519a726 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -243,6 +243,8 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); * BDI_CAP_EXEC_MAP: Can be mapped for execution * * BDI_CAP_SWAP_BACKED: Count shmem/tmpfs objects as swap-backed. + * + * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold. 
*/ #define BDI_CAP_NO_ACCT_DIRTY 0x00000001 #define BDI_CAP_NO_WRITEBACK 0x00000002 @@ -254,6 +256,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); #define BDI_CAP_NO_ACCT_WB 0x00000080 #define BDI_CAP_SWAP_BACKED 0x00000100 #define BDI_CAP_STABLE_WRITES 0x00000200 +#define BDI_CAP_STRICTLIMIT 0x00000400 #define BDI_CAP_VMFLAGS \ (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3750431b3cd8..6c7b0187be8e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -584,6 +584,37 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) return bdi_dirty; } +/* + * setpoint - dirty 3 + * f(dirty) := 1.0 + (----------------) + * limit - setpoint + * + * it's a 3rd order polynomial that subjects to + * + * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast + * (2) f(setpoint) = 1.0 => the balance point + * (3) f(limit) = 0 => the hard limit + * (4) df/dx <= 0 => negative feedback control + * (5) the closer to setpoint, the smaller |df/dx| (and the reverse) + * => fast response on large errors; small oscillation near setpoint + */ +static inline long long pos_ratio_polynom(unsigned long setpoint, + unsigned long dirty, + unsigned long limit) +{ + long long pos_ratio; + long x; + + x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, + limit - setpoint + 1); + pos_ratio = x; + pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; + pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; + pos_ratio += 1 << RATELIMIT_CALC_SHIFT; + + return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT); +} + /* * Dirty position control. * @@ -682,26 +713,80 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, /* * global setpoint * - * setpoint - dirty 3 - * f(dirty) := 1.0 + (----------------) - * limit - setpoint - * - * it's a 3rd order polynomial that subjects to - * - * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast - * (2) f(setpoint) = 1.0 => the balance point - * (3) f(limit) = 0 => the hard limit - * (4) df/dx <= 0 => negative feedback control - * (5) the closer to setpoint, the smaller |df/dx| (and the reverse) - * => fast response on large errors; small oscillation near setpoint + * See comment for pos_ratio_polynom(). */ setpoint = (freerun + limit) / 2; - x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, - limit - setpoint + 1); - pos_ratio = x; - pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; - pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; - pos_ratio += 1 << RATELIMIT_CALC_SHIFT; + pos_ratio = pos_ratio_polynom(setpoint, dirty, limit); + + /* + * The strictlimit feature is a tool preventing mistrusted filesystems + * from growing a large number of dirty pages before throttling. For + * such filesystems balance_dirty_pages always checks bdi counters + * against bdi limits. Even if global "nr_dirty" is under "freerun". + * This is especially important for fuse which sets bdi->max_ratio to + * 1% by default. Without strictlimit feature, fuse writeback may + * consume arbitrary amount of RAM because it is accounted in + * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty". + * + * Here, in bdi_position_ratio(), we calculate pos_ratio based on + * two values: bdi_dirty and bdi_thresh. Let's consider an example: + * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global + * limits are set by default to 10% and 20% (background and throttle). 
+ * Then bdi_thresh is 1% of 20% of 16GB. This amounts to ~8K pages. + * bdi_dirty_limit(bdi, bg_thresh) is about ~4K pages. bdi_setpoint is + * about ~6K pages (as the average of background and throttle bdi + * limits). The 3rd order polynomial will provide positive feedback if + * bdi_dirty is under bdi_setpoint and vice versa. + * + * Note, that we cannot use global counters in these calculations + * because we want to throttle process writing to a strictlimit BDI + * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB + * in the example above). + */ + if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) { + long long bdi_pos_ratio; + unsigned long bdi_bg_thresh; + + if (bdi_dirty < 8) + return min_t(long long, pos_ratio * 2, + 2 << RATELIMIT_CALC_SHIFT); + + if (bdi_dirty >= bdi_thresh) + return 0; + + bdi_bg_thresh = div_u64((u64)bdi_thresh * bg_thresh, thresh); + bdi_setpoint = dirty_freerun_ceiling(bdi_thresh, + bdi_bg_thresh); + + if (bdi_setpoint == 0 || bdi_setpoint == bdi_thresh) + return 0; + + bdi_pos_ratio = pos_ratio_polynom(bdi_setpoint, bdi_dirty, + bdi_thresh); + + /* + * Typically, for strictlimit case, bdi_setpoint << setpoint + * and pos_ratio >> bdi_pos_ratio. In the other words global + * state ("dirty") is not limiting factor and we have to + * make decision based on bdi counters. But there is an + * important case when global pos_ratio should get precedence: + * global limits are exceeded (e.g. due to activities on other + * BDIs) while given strictlimit BDI is below limit. + * + * "pos_ratio * bdi_pos_ratio" would work for the case above, + * but it would look too non-natural for the case of all + * activity in the system coming from a single strictlimit BDI + * with bdi->max_ratio == 100%. + * + * Note that min() below somewhat changes the dynamics of the + * control system. Normally, pos_ratio value can be well over 3 + * (when globally we are at freerun and bdi is well below bdi + * setpoint). Now the maximum pos_ratio in the same situation + * is 2. We might want to tweak this if we observe the control + * system is too slow to adapt. + */ + return min(pos_ratio, bdi_pos_ratio); + } /* * We have computed basic pos_ratio above based on global situation. If @@ -994,6 +1079,27 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, * keep that period small to reduce time lags). */ step = 0; + + /* + * For strictlimit case, calculations above were based on bdi counters + * and limits (starting from pos_ratio = bdi_position_ratio() and up to + * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate). + * Hence, to calculate "step" properly, we have to use bdi_dirty as + * "dirty" and bdi_setpoint as "setpoint". + * + * We rampup dirty_ratelimit forcibly if bdi_dirty is low because + * it's possible that bdi_thresh is close to zero due to inactivity + * of backing device (see the implementation of bdi_dirty_limit()). + */ + if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) { + dirty = bdi_dirty; + if (bdi_dirty < 8) + setpoint = bdi_dirty + 1; + else + setpoint = (bdi_thresh + + bdi_dirty_limit(bdi, bg_thresh)) / 2; + } + if (dirty < setpoint) { x = min(bdi->balanced_dirty_ratelimit, min(balanced_dirty_ratelimit, task_ratelimit)); @@ -1198,6 +1304,56 @@ static long bdi_min_pause(struct backing_dev_info *bdi, return pages >= DIRTY_POLL_THRESH ? 
1 + t / 2 : t; } +static inline void bdi_dirty_limits(struct backing_dev_info *bdi, + unsigned long dirty_thresh, + unsigned long background_thresh, + unsigned long *bdi_dirty, + unsigned long *bdi_thresh, + unsigned long *bdi_bg_thresh) +{ + unsigned long bdi_reclaimable; + + /* + * bdi_thresh is not treated as some limiting factor as + * dirty_thresh, due to reasons + * - in JBOD setup, bdi_thresh can fluctuate a lot + * - in a system with HDD and USB key, the USB key may somehow + * go into state (bdi_dirty >> bdi_thresh) either because + * bdi_dirty starts high, or because bdi_thresh drops low. + * In this case we don't want to hard throttle the USB key + * dirtiers for 100 seconds until bdi_dirty drops under + * bdi_thresh. Instead the auxiliary bdi control line in + * bdi_position_ratio() will let the dirtier task progress + * at some rate <= (write_bw / 2) for bringing down bdi_dirty. + */ + *bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); + + if (bdi_bg_thresh) + *bdi_bg_thresh = div_u64((u64)*bdi_thresh * + background_thresh, + dirty_thresh); + + /* + * In order to avoid the stacked BDI deadlock we need + * to ensure we accurately count the 'dirty' pages when + * the threshold is low. + * + * Otherwise it would be possible to get thresh+n pages + * reported dirty, even though there are thresh-m pages + * actually dirty; with m+n sitting in the percpu + * deltas. + */ + if (*bdi_thresh < 2 * bdi_stat_error(bdi)) { + bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); + *bdi_dirty = bdi_reclaimable + + bdi_stat_sum(bdi, BDI_WRITEBACK); + } else { + bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); + *bdi_dirty = bdi_reclaimable + + bdi_stat(bdi, BDI_WRITEBACK); + } +} + /* * balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force @@ -1209,13 +1365,9 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long pages_dirtied) { unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */ - unsigned long bdi_reclaimable; unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ - unsigned long bdi_dirty; - unsigned long freerun; unsigned long background_thresh; unsigned long dirty_thresh; - unsigned long bdi_thresh; long period; long pause; long max_pause; @@ -1226,10 +1378,16 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long dirty_ratelimit; unsigned long pos_ratio; struct backing_dev_info *bdi = mapping->backing_dev_info; + bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; unsigned long start_time = jiffies; for (;;) { unsigned long now = jiffies; + unsigned long uninitialized_var(bdi_thresh); + unsigned long thresh; + unsigned long uninitialized_var(bdi_dirty); + unsigned long dirty; + unsigned long bg_thresh; /* * Unstable writes are a feature of certain networked @@ -1243,61 +1401,44 @@ static void balance_dirty_pages(struct address_space *mapping, global_dirty_limits(&background_thresh, &dirty_thresh); + if (unlikely(strictlimit)) { + bdi_dirty_limits(bdi, dirty_thresh, background_thresh, + &bdi_dirty, &bdi_thresh, &bg_thresh); + + dirty = bdi_dirty; + thresh = bdi_thresh; + } else { + dirty = nr_dirty; + thresh = dirty_thresh; + bg_thresh = background_thresh; + } + /* * Throttle it only when the background writeback cannot * catch-up. This avoids (excessively) small writeouts - * when the bdi limits are ramping up. + * when the bdi limits are ramping up in case of !strictlimit. 
+ * + * In strictlimit case make decision based on the bdi counters + * and limits. Small writeouts when the bdi limits are ramping + * up are the price we consciously pay for strictlimit-ing. */ - freerun = dirty_freerun_ceiling(dirty_thresh, - background_thresh); - if (nr_dirty <= freerun) { + if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh)) { current->dirty_paused_when = now; current->nr_dirtied = 0; current->nr_dirtied_pause = - dirty_poll_interval(nr_dirty, dirty_thresh); + dirty_poll_interval(dirty, thresh); break; } if (unlikely(!writeback_in_progress(bdi))) bdi_start_background_writeback(bdi); - /* - * bdi_thresh is not treated as some limiting factor as - * dirty_thresh, due to reasons - * - in JBOD setup, bdi_thresh can fluctuate a lot - * - in a system with HDD and USB key, the USB key may somehow - * go into state (bdi_dirty >> bdi_thresh) either because - * bdi_dirty starts high, or because bdi_thresh drops low. - * In this case we don't want to hard throttle the USB key - * dirtiers for 100 seconds until bdi_dirty drops under - * bdi_thresh. Instead the auxiliary bdi control line in - * bdi_position_ratio() will let the dirtier task progress - * at some rate <= (write_bw / 2) for bringing down bdi_dirty. - */ - bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); - - /* - * In order to avoid the stacked BDI deadlock we need - * to ensure we accurately count the 'dirty' pages when - * the threshold is low. - * - * Otherwise it would be possible to get thresh+n pages - * reported dirty, even though there are thresh-m pages - * actually dirty; with m+n sitting in the percpu - * deltas. - */ - if (bdi_thresh < 2 * bdi_stat_error(bdi)) { - bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); - bdi_dirty = bdi_reclaimable + - bdi_stat_sum(bdi, BDI_WRITEBACK); - } else { - bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); - bdi_dirty = bdi_reclaimable + - bdi_stat(bdi, BDI_WRITEBACK); - } + if (!strictlimit) + bdi_dirty_limits(bdi, dirty_thresh, background_thresh, + &bdi_dirty, &bdi_thresh, NULL); dirty_exceeded = (bdi_dirty > bdi_thresh) && - (nr_dirty > dirty_thresh); + ((nr_dirty > dirty_thresh) || strictlimit); if (dirty_exceeded && !bdi->dirty_exceeded) bdi->dirty_exceeded = 1; From fa0f281cf9de8e6877e6536f18a3fc77368df64d Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 11 Sep 2013 14:22:47 -0700 Subject: [PATCH 125/303] mm: make sure _PAGE_SWP_SOFT_DIRTY bit is not set on present pte _PAGE_SOFT_DIRTY bit should never be set on present pte so add VM_BUG_ON to catch any potential future abuse. Also add a comment on _PAGE_SWP_SOFT_DIRTY definition explaining scope of its usage. 
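For illustration, the intended call pattern looks like the sketch below (simplified; the helper name is hypothetical): the swap variant of the soft-dirty bit is only ever applied to a pte that is already a non-present swap entry, which is exactly the invariant the new VM_BUG_ON()s assert.

	/*
	 * Sketch: transfer soft-dirty from a present pte into the swap
	 * pte that replaces it during unmap (hypothetical helper).
	 */
	static pte_t make_swp_pte(swp_entry_t entry, pte_t old_pte)
	{
		/* non-present by construction */
		pte_t swp_pte = swp_entry_to_pte(entry);

		if (pte_soft_dirty(old_pte))
			swp_pte = pte_swp_mksoft_dirty(swp_pte);
		return swp_pte;
	}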
Signed-off-by: Cyrill Gorcunov Acked-by: Pavel Emelyanov Acked-by: Jan Beulich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable.h | 34 ++++++++++++++++------------ arch/x86/include/asm/pgtable_types.h | 3 +++ 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 8d16befdec88..3d1999458709 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -315,21 +315,6 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); } -static inline pte_t pte_swp_mksoft_dirty(pte_t pte) -{ - return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); -} - -static inline int pte_swp_soft_dirty(pte_t pte) -{ - return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; -} - -static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) -{ - return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); -} - static inline pte_t pte_file_clear_soft_dirty(pte_t pte) { return pte_clear_flags(pte, _PAGE_SOFT_DIRTY); @@ -446,6 +431,7 @@ pte_t *populate_extra_pte(unsigned long vaddr); #ifndef __ASSEMBLY__ #include +#include #include static inline int pte_none(pte_t pte) @@ -864,6 +850,24 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, { } +static inline pte_t pte_swp_mksoft_dirty(pte_t pte) +{ + VM_BUG_ON(pte_present(pte)); + return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); +} + +static inline int pte_swp_soft_dirty(pte_t pte) +{ + VM_BUG_ON(pte_present(pte)); + return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; +} + +static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) +{ + VM_BUG_ON(pte_present(pte)); + return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); +} + #include #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index f4843e031131..0ecac257fb26 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -75,6 +75,9 @@ * with swap entry format. On x86 bits 6 and 7 are *not* involved * into swap entry computation, but bit 6 is used for nonlinear * file mapping, so we borrow bit 7 for soft dirty tracking. + * + * Please note that this bit must be treated as swap dirty page + * mark if and only if the PTE has present bit clear! */ #ifdef CONFIG_MEM_SOFT_DIRTY #define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE From cf6fe945389e674130dc7564392930cf7fb9f5e8 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Wed, 11 Sep 2013 14:22:48 -0700 Subject: [PATCH 126/303] mm: correct the comment about the value for buddy _mapcount Set _mapcount PAGE_BUDDY_MAPCOUNT_VALUE to make the page buddy. Not the magic number -2. Signed-off-by: Wang Sheng-Hui Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ff2782576e39..0ee638f76ebe 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -489,8 +489,10 @@ __find_buddy_index(unsigned long page_idx, unsigned int order) * (c) a page and its buddy have the same order && * (d) a page and its buddy are in the same zone. * - * For recording whether a page is in the buddy system, we set ->_mapcount -2. - * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock. + * For recording whether a page is in the buddy system, we set ->_mapcount + * PAGE_BUDDY_MAPCOUNT_VALUE. + * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is + * serialized by zone->lock. 
* * For recording page's order, we use page_private(page). */ @@ -528,8 +530,9 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, * as necessary, plus some accounting needed to play nicely with other * parts of the VM system. * At each level, we keep a list of pages, which are heads of continuous - * free pages of length of (1 << order) and marked with _mapcount -2. Page's - * order is recorded in page_private(page) field. + * free pages of length of (1 << order) and marked with _mapcount + * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page) + * field. * So when we are allocating or freeing one, we can derive the state of the * other. That is, if we allocate a small block, and both were * free, the remainder of the region must be split into blocks. From 0d6fdbdb2a651f0c9bb979e1d92b1e15dadffc4f Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Sep 2013 14:22:49 -0700 Subject: [PATCH 127/303] hwpoison: always unset MIGRATE_ISOLATE before returning from soft_offline_page() Soft offline code expects that MIGRATE_ISOLATE is set on the target page only during soft offlining work. But currently it doesn't work as expected when get_any_page() fails and returns a negative value. As a result, end users can have unexpectedly isolated pages. This patch just fixes it. Signed-off-by: Naoya Horiguchi Reviewed-by: Wanpeng Li Cc: Andi Kleen Cc: Fengguang Wu Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index e05ed31c0f61..c8cc57ed7dcd 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1553,7 +1553,7 @@ int soft_offline_page(struct page *page, int flags) ret = get_any_page(page, pfn, flags); if (ret < 0) - return ret; + goto unset; if (ret) { /* for in-use pages */ if (PageHuge(page)) ret = soft_offline_huge_page(page, flags); @@ -1570,6 +1570,7 @@ int soft_offline_page(struct page *page, int flags) atomic_long_inc(&num_poisoned_pages); } } +unset: unset_migratetype_isolate(page, MIGRATE_MOVABLE); return ret; } From 841fcc583f81c632d20a27e17beccb20320530a1 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 11 Sep 2013 14:22:50 -0700 Subject: [PATCH 128/303] mm/hwpoison: fix loss of PG_dirty for errors on mlocked pages memory_failure() stores the page flags of the error page before doing the unmap, and (only) if the first check against the live page flags decides the error page is unknown, it does a second check against the stored page flags; the snapshot is needed because memory_failure() unmaps the error page before calling page_action(). This unmapping changes the page state; in particular, page_remove_rmap() (called from try_to_unmap_one()) clears PG_mlocked, so page_action() can't catch mlocked pages after that. However, memory_failure() can't handle memory errors on dirty mlocked pages correctly: try_to_unmap_one() moves the dirty bit from the pte to the physical page, and the second check loses it because it consults the stored page flags. This patch fixes it by restoring the PG_dirty flag into the stored page flags if the page is dirty.
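In sketch form, the resulting classification flow inside memory_failure() (simplified, not a literal excerpt):

	/* simplified flow, not a literal excerpt from memory_failure() */
	page_flags = p->flags;		/* snapshot before unmap */
	hwpoison_user_mappings(p, pfn, trapno, flags);
					/* unmap may clear PG_mlocked on p and
					 * transfer the pte dirty bit to PG_dirty */
	page_flags |= (p->flags & (1UL << PG_dirty));
					/* fold the transferred dirty bit into the
					 * snapshot used by the second check */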
Testcase: #define _GNU_SOURCE #include #include #include #include #include #define PAGES_TO_TEST 2 #define PAGE_SIZE 4096 int main(void) { char *mem; int i; mem = mmap(NULL, PAGES_TO_TEST * PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, 0, 0); for (i = 0; i < PAGES_TO_TEST; i++) mem[i * PAGE_SIZE] = 'a'; if (madvise(mem, PAGES_TO_TEST * PAGE_SIZE, MADV_HWPOISON) == -1) return -1; return 0; } Before patch: [ 912.839247] Injecting memory failure for page 7dfb8 at 7f6b4e37b000 [ 912.839257] MCE 0x7dfb8: clean mlocked LRU page recovery: Recovered [ 912.845550] MCE 0x7dfb8: clean mlocked LRU page still referenced by 1 users [ 912.852586] Injecting memory failure for page 7e6aa at 7f6b4e37c000 [ 912.852594] MCE 0x7e6aa: clean mlocked LRU page recovery: Recovered [ 912.858936] MCE 0x7e6aa: clean mlocked LRU page still referenced by 1 users After patch: [ 163.590225] Injecting memory failure for page 91bc2f at 7f9f5b0e5000 [ 163.590264] MCE 0x91bc2f: dirty mlocked LRU page recovery: Recovered [ 163.596680] MCE 0x91bc2f: dirty mlocked LRU page still referenced by 1 users [ 163.603831] Injecting memory failure for page 91cdd3 at 7f9f5b0e6000 [ 163.603852] MCE 0x91cdd3: dirty mlocked LRU page recovery: Recovered [ 163.610305] MCE 0x91cdd3: dirty mlocked LRU page still referenced by 1 users Signed-off-by: Wanpeng Li Reviewed-by: Naoya Horiguchi Cc: Andi Kleen Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index c8cc57ed7dcd..ec9ad5270d32 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1204,6 +1204,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags) for (ps = error_states;; ps++) if ((p->flags & ps->mask) == ps->res) break; + + page_flags |= (p->flags & (1UL << PG_dirty)); + if (!ps->mask) for (ps = error_states;; ps++) if ((page_flags & ps->mask) == ps->res From f9121153fdfbfaa930bf65077a5597e20d3ac608 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 11 Sep 2013 14:22:52 -0700 Subject: [PATCH 129/303] mm/hwpoison: don't need to hold compound lock for hugetlbfs page The compound lock was introduced by commit e9da73d67 ("thp: compound_lock."); it is used to serialize put_page() against __split_huge_page_refcount(). In addition, transparent hugepages are split in the hwpoison handler and just one subpage is poisoned. It is therefore unnecessary to hold the compound lock for a hugetlbfs page. This patch replaces compound_trans_order() with compound_order() in the places where the page is a hugetlbfs page.
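The contrast, in sketch form (simplified): compound_trans_order(), removed below, took the compound lock so the order could not change underneath it due to a concurrent THP split; for hugetlbfs pages no such split is possible, so a plain read suffices.

	/*
	 * Sketch: for a hugetlbfs page the compound order is stable, so
	 * the number of subpages can be read without the compound lock
	 * (hypothetical helper name).
	 */
	static unsigned long huge_nr_pages(struct page *hpage)
	{
		return 1UL << compound_order(compound_head(hpage));
	}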
Signed-off-by: Wanpeng Li Reviewed-by: Naoya Horiguchi Cc: Andi Kleen Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 14 -------------- mm/memory-failure.c | 12 ++++++------ 2 files changed, 6 insertions(+), 20 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 03f84b8d7359..caf543c7eaa7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -495,20 +495,6 @@ static inline int compound_order(struct page *page) return (unsigned long)page[1].lru.prev; } -static inline int compound_trans_order(struct page *page) -{ - int order; - unsigned long flags; - - if (!PageHead(page)) - return 0; - - flags = compound_lock_irqsave(page); - order = compound_order(page); - compound_unlock_irqrestore(page, flags); - return order; -} - static inline void set_compound_order(struct page *page, unsigned long order) { page[1].lru.prev = (void *)order; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ec9ad5270d32..7b5d32507c35 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -206,7 +206,7 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno, #ifdef __ARCH_SI_TRAPNO si.si_trapno = trapno; #endif - si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT; + si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; if ((flags & MF_ACTION_REQUIRED) && t == current) { si.si_code = BUS_MCEERR_AR; @@ -983,7 +983,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, static void set_page_hwpoison_huge_page(struct page *hpage) { int i; - int nr_pages = 1 << compound_trans_order(hpage); + int nr_pages = 1 << compound_order(hpage); for (i = 0; i < nr_pages; i++) SetPageHWPoison(hpage + i); } @@ -991,7 +991,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage) static void clear_page_hwpoison_huge_page(struct page *hpage) { int i; - int nr_pages = 1 << compound_trans_order(hpage); + int nr_pages = 1 << compound_order(hpage); for (i = 0; i < nr_pages; i++) ClearPageHWPoison(hpage + i); } @@ -1342,7 +1342,7 @@ int unpoison_memory(unsigned long pfn) return 0; } - nr_pages = 1 << compound_trans_order(page); + nr_pages = 1 << compound_order(page); if (!get_page_unless_zero(page)) { /* @@ -1506,7 +1506,7 @@ static int soft_offline_huge_page(struct page *page, int flags) } else { set_page_hwpoison_huge_page(hpage); dequeue_hwpoisoned_huge_page(hpage); - atomic_long_add(1 << compound_trans_order(hpage), + atomic_long_add(1 << compound_order(hpage), &num_poisoned_pages); } return ret; @@ -1566,7 +1566,7 @@ int soft_offline_page(struct page *page, int flags) if (PageHuge(page)) { set_page_hwpoison_huge_page(hpage); dequeue_hwpoisoned_huge_page(hpage); - atomic_long_add(1 << compound_trans_order(hpage), + atomic_long_add(1 << compound_order(hpage), &num_poisoned_pages); } else { SetPageHWPoison(page); From 0cea3fdc416d593072c602725ed2ca02b889f31b Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 11 Sep 2013 14:22:53 -0700 Subject: [PATCH 130/303] mm/hwpoison: fix race against poison thp There is a race between poisoning and unpoisoning a page: memory_failure() sets PG_hwpoison and increases num_poisoned_pages without holding the page lock, and only one page is accounted in num_poisoned_pages even for a thp. However, unpoison can occur before memory_failure() takes the page lock and splits the transparent hugepage; unpoison then decreases num_poisoned_pages by 1 << compound_order, since memory_failure() has not yet split the transparent hugepage under the page lock.
That means we account one page for hwpoison and 1 << compound_order for unpoison. This patch fixes it by inserting a PageTransHuge check before TestClearPageHWPoison: unpoison now fails without clearing PageHWPoison or decreasing num_poisoned_pages. A B memory_failure TestSetPageHWPoison(p); if (PageHuge(p)) nr_pages = 1 << compound_order(hpage); else nr_pages = 1; atomic_long_add(nr_pages, &num_poisoned_pages); unpoison_memory nr_pages = 1 << compound_trans_order(page); if (TestClearPageHWPoison(p)) atomic_long_sub(nr_pages, &num_poisoned_pages); lock page if (!PageHWPoison(p)) unlock page and return hwpoison_user_mappings if (PageTransHuge(hpage)) split_huge_page(hpage); Signed-off-by: Wanpeng Li Suggested-by: Naoya Horiguchi Cc: Andi Kleen Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 7b5d32507c35..32351ec32048 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1342,6 +1342,16 @@ int unpoison_memory(unsigned long pfn) return 0; } + /* + * unpoison_memory() can encounter thp only when the thp is being + * worked by memory_failure() and the page lock is not held yet. + * In such case, we yield to memory_failure() and make unpoison fail. + */ + if (PageTransHuge(page)) { + pr_info("MCE: Memory failure is now running on %#lx\n", pfn); + return 0; + } + nr_pages = 1 << compound_order(page); if (!get_page_unless_zero(page)) { From dd9538a597f9ccd9a65be1cc3f71059a12b5b4ff Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 11 Sep 2013 14:22:54 -0700 Subject: [PATCH 131/303] mm/hwpoison: replace atomic_long_sub() with atomic_long_dec() Replace atomic_long_sub() with atomic_long_dec() since the page is a normal page rather than a hugetlbfs page or thp. Signed-off-by: Wanpeng Li Reviewed-by: Naoya Horiguchi Cc: Andi Kleen Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 32351ec32048..c69217c07faa 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1366,7 +1366,7 @@ int unpoison_memory(unsigned long pfn) return 0; } if (TestClearPageHWPoison(p)) - atomic_long_sub(nr_pages, &num_poisoned_pages); + atomic_long_dec(&num_poisoned_pages); pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); return 0; } From 0be35096a145290cc0771d52adb3b241dca22604 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 11 Sep 2013 14:22:55 -0700 Subject: [PATCH 132/303] mm/hwpoison: don't set migration type twice to avoid holding heavily contended zone->lock Setting the pageblock migration type takes zone->lock, which is heavily contended on some systems, to avoid races. However, soft offlining a page sets the pageblock migration type twice during get page if the page is in use, is not a hugetlbfs page, and is not on the LRU list. It is unnecessary to set the pageblock migration type and take the heavily contended zone->lock again if the first round of get page has already set the pageblock to the right migration type. The trick here is that the migration type is MIGRATE_ISOLATE. Only two other places besides hwpoison can change a pageblock to MIGRATE_ISOLATE. One is memory hotplug; however, we hold lock_memory_hotplug(), which avoids the race. The second is CMA, which unmovable page allocation requests cannot fall back to. So it's safe here.
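A sketch of the cost being avoided (simplified; not the exact kernel path): the isolation transition serializes on zone->lock, so checking the current migratetype first skips a redundant trip through a heavily contended lock.

	static void isolate_pageblock_sketch(struct page *page)
	{
		struct zone *zone = page_zone(page);
		unsigned long flags;

		/* already isolated: nothing to do, skip the hot lock */
		if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE)
			return;

		spin_lock_irqsave(&zone->lock, flags);	/* heavily contended */
		set_pageblock_migratetype(page, MIGRATE_ISOLATE);
		spin_unlock_irqrestore(&zone->lock, flags);
	}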
Signed-off-by: Wanpeng Li Cc: Naoya Horiguchi Cc: Andi Kleen Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index c69217c07faa..784a1e17c905 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1429,7 +1429,8 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags) * was free. This flag should be kept set until the source page * is freed and PG_hwpoison on it is set. */ - set_migratetype_isolate(p, true); + if (get_pageblock_migratetype(p) != MIGRATE_ISOLATE) + set_migratetype_isolate(p, true); /* * When the target page is a free hugepage, just remove it * from free hugepage list. From 86e057734bd1c460c48ae69f8fcc3ed90eb40d59 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 11 Sep 2013 14:22:56 -0700 Subject: [PATCH 133/303] mm/hwpoison: drop forward reference declarations __soft_offline_page() Drop forward reference declarations __soft_offline_page. Signed-off-by: Wanpeng Li Reviewed-by: Naoya Horiguchi Cc: Andi Kleen Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 130 ++++++++++++++++++++++---------------------- 1 file changed, 64 insertions(+), 66 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 784a1e17c905..d04f99004c9f 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1523,72 +1523,6 @@ static int soft_offline_huge_page(struct page *page, int flags) return ret; } -static int __soft_offline_page(struct page *page, int flags); - -/** - * soft_offline_page - Soft offline a page. - * @page: page to offline - * @flags: flags. Same as memory_failure(). - * - * Returns 0 on success, otherwise negated errno. - * - * Soft offline a page, by migration or invalidation, - * without killing anything. This is for the case when - * a page is not corrupted yet (so it's still valid to access), - * but has had a number of corrected errors and is better taken - * out. - * - * The actual policy on when to do that is maintained by - * user space. - * - * This should never impact any application or cause data loss, - * however it might take some time. - * - * This is not a 100% solution for all memory, but tries to be - * ``good enough'' for the majority of memory. 
- */ -int soft_offline_page(struct page *page, int flags) -{ - int ret; - unsigned long pfn = page_to_pfn(page); - struct page *hpage = compound_trans_head(page); - - if (PageHWPoison(page)) { - pr_info("soft offline: %#lx page already poisoned\n", pfn); - return -EBUSY; - } - if (!PageHuge(page) && PageTransHuge(hpage)) { - if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { - pr_info("soft offline: %#lx: failed to split THP\n", - pfn); - return -EBUSY; - } - } - - ret = get_any_page(page, pfn, flags); - if (ret < 0) - goto unset; - if (ret) { /* for in-use pages */ - if (PageHuge(page)) - ret = soft_offline_huge_page(page, flags); - else - ret = __soft_offline_page(page, flags); - } else { /* for free pages */ - if (PageHuge(page)) { - set_page_hwpoison_huge_page(hpage); - dequeue_hwpoisoned_huge_page(hpage); - atomic_long_add(1 << compound_order(hpage), - &num_poisoned_pages); - } else { - SetPageHWPoison(page); - atomic_long_inc(&num_poisoned_pages); - } - } -unset: - unset_migratetype_isolate(page, MIGRATE_MOVABLE); - return ret; -} - static int __soft_offline_page(struct page *page, int flags) { int ret; @@ -1675,3 +1609,67 @@ static int __soft_offline_page(struct page *page, int flags) } return ret; } + +/** + * soft_offline_page - Soft offline a page. + * @page: page to offline + * @flags: flags. Same as memory_failure(). + * + * Returns 0 on success, otherwise negated errno. + * + * Soft offline a page, by migration or invalidation, + * without killing anything. This is for the case when + * a page is not corrupted yet (so it's still valid to access), + * but has had a number of corrected errors and is better taken + * out. + * + * The actual policy on when to do that is maintained by + * user space. + * + * This should never impact any application or cause data loss, + * however it might take some time. + * + * This is not a 100% solution for all memory, but tries to be + * ``good enough'' for the majority of memory. + */ +int soft_offline_page(struct page *page, int flags) +{ + int ret; + unsigned long pfn = page_to_pfn(page); + struct page *hpage = compound_trans_head(page); + + if (PageHWPoison(page)) { + pr_info("soft offline: %#lx page already poisoned\n", pfn); + return -EBUSY; + } + if (!PageHuge(page) && PageTransHuge(hpage)) { + if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { + pr_info("soft offline: %#lx: failed to split THP\n", + pfn); + return -EBUSY; + } + } + + ret = get_any_page(page, pfn, flags); + if (ret < 0) + goto unset; + if (ret) { /* for in-use pages */ + if (PageHuge(page)) + ret = soft_offline_huge_page(page, flags); + else + ret = __soft_offline_page(page, flags); + } else { /* for free pages */ + if (PageHuge(page)) { + set_page_hwpoison_huge_page(hpage); + dequeue_hwpoisoned_huge_page(hpage); + atomic_long_add(1 << compound_order(hpage), + &num_poisoned_pages); + } else { + SetPageHWPoison(page); + atomic_long_inc(&num_poisoned_pages); + } + } +unset: + unset_migratetype_isolate(page, MIGRATE_MOVABLE); + return ret; +} From b194b8cdb83daafd2405fb902193b8e904107614 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 11 Sep 2013 14:22:57 -0700 Subject: [PATCH 134/303] mm/hwpoison: add '#' to madvise_hwpoison Add '#' to madvise_hwpoison. 
Before patch: [ 95.892866] Injecting memory failure for page 19d0 at b7786000 [ 95.893151] MCE 0x19d0: non LRU page recovery: Ignored After patch: [ 95.892866] Injecting memory failure for page 0x19d0 at 0xb7786000 [ 95.893151] MCE 0x19d0: non LRU page recovery: Ignored Signed-off-by: Wanpeng Li Reviewed-by: Naoya Horiguchi Cc: Andi Kleen Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 936799f042cc..9b1c7be182d7 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -353,14 +353,14 @@ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) if (ret != 1) return ret; if (bhv == MADV_SOFT_OFFLINE) { - printk(KERN_INFO "Soft offlining page %lx at %lx\n", + pr_info("Soft offlining page %#lx at %#lx\n", page_to_pfn(p), start); ret = soft_offline_page(p, MF_COUNT_INCREASED); if (ret) break; continue; } - printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", + pr_info("Injecting memory failure for page %#lx at %#lx\n", page_to_pfn(p), start); /* Ignore return value for now */ memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); From 29b4eedee67b449534214058e1bcb36307a7f1dc Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 11 Sep 2013 14:22:59 -0700 Subject: [PATCH 135/303] mm/hwpoison.c: fix held reference count after unpoisoning empty zero page madvise hwpoison inject poisons the read-only empty zero page if there is no write access before the poisoning. The empty zero page's reference count is increased for hwpoison; a subsequent attempt to poison the zero page returns directly since the page has already been marked PG_hwpoison, yet the page reference count is still increased by get_user_pages_fast. The unpoison process unpoisons the empty zero page and decreases the reference count successfully the first time; however, subsequent unpoisons of the empty zero page return directly since the page has already been unpoisoned, without decreasing the page reference count of the empty zero page. This patch fixes it by making madvise_hwpoison() put the page and return immediately (without calling memory_failure() or soft_offline_page()) when the page is already hwpoisoned.
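The reference counting being balanced, in sketch form (a simplified fragment of the loop body this patch changes, not a literal excerpt): every iteration takes one reference via get_user_pages_fast(), so an early-exit path that hands the page to nobody must drop that reference itself.

	ret = get_user_pages_fast(start, 1, 0, &p);	/* takes one page ref */
	if (ret != 1)
		return ret;
	if (PageHWPoison(p)) {
		put_page(p);	/* balance the reference; nobody else will */
		continue;	/* already poisoned: skip this page */
	}
	/* otherwise memory_failure()/soft_offline_page() consume the ref */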
Testcase: #define _GNU_SOURCE #include #include #include #include #include #include #include #define PAGES_TO_TEST 3 #define PAGE_SIZE 4096 int main(void) { char *mem; int i; mem = mmap(NULL, PAGES_TO_TEST * PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); if (madvise(mem, PAGES_TO_TEST * PAGE_SIZE, MADV_HWPOISON) == -1) return -1; munmap(mem, PAGES_TO_TEST * PAGE_SIZE); return 0; } Add printk to dump page reference count: [ 93.075959] Injecting memory failure for page 0x19d0 at 0xb77d8000 [ 93.076207] MCE 0x19d0: non LRU page recovery: Ignored [ 93.076209] pfn 0x19d0, page count = 1 after memory failure [ 93.076220] Injecting memory failure for page 0x19d0 at 0xb77d9000 [ 93.076221] MCE 0x19d0: already hardware poisoned [ 93.076222] pfn 0x19d0, page count = 2 after memory failure [ 93.076224] Injecting memory failure for page 0x19d0 at 0xb77da000 [ 93.076224] MCE 0x19d0: already hardware poisoned [ 93.076225] pfn 0x19d0, page count = 3 after memory failure Signed-off-by: Wanpeng Li Suggested-by: Naoya Horiguchi Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/madvise.c b/mm/madvise.c index 9b1c7be182d7..30293ab95b06 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -352,6 +352,10 @@ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) int ret = get_user_pages_fast(start, 1, 0, &p); if (ret != 1) return ret; + if (PageHWPoison(p)) { + put_page(p); + continue; + } if (bhv == MADV_SOFT_OFFLINE) { pr_info("Soft offlining page %#lx at %#lx\n", page_to_pfn(p), start); From 2d1e8b3f1acc36101dbe6c5fc14e88e3f6af0b1c Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 11 Sep 2013 14:23:00 -0700 Subject: [PATCH 136/303] mm/hwpoison-inject.c: change permission of corrupt-pfn/unpoison-pfn to 0200 Hwpoison injection doesn't implement read method for corrupt-pfn/unpoison-pfn attributes: # cat /sys/kernel/debug/hwpoison/corrupt-pfn cat: /sys/kernel/debug/hwpoison/corrupt-pfn: Permission denied # cat /sys/kernel/debug/hwpoison/unpoison-pfn cat: /sys/kernel/debug/hwpoison/unpoison-pfn: Permission denied This patch changes the permission of corrupt-pfn/unpoison-pfn to 0200. Signed-off-by: Wanpeng Li Reviewed-by: Naoya Horiguchi Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hwpoison-inject.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 3a61efc518d5..afc2daa91c60 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -88,12 +88,12 @@ static int pfn_inject_init(void) * hardware status change, hence do not require hardware support. * They are mainly for testing hwpoison in software level. 
 */
-	dentry = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir,
+	dentry = debugfs_create_file("corrupt-pfn", 0200, hwpoison_dir,
 				    NULL, &hwpoison_fops);
 	if (!dentry)
 		goto fail;
-	dentry = debugfs_create_file("unpoison-pfn", 0600, hwpoison_dir,
+	dentry = debugfs_create_file("unpoison-pfn", 0200, hwpoison_dir,
 				    NULL, &unpoison_fops);
 	if (!dentry)
 		goto fail;

From 3ba5eebc40a9839226e5f0d81a3e9f8fcfb8ebae Mon Sep 17 00:00:00 2001
From: Wanpeng Li
Date: Wed, 11 Sep 2013 14:23:01 -0700
Subject: [PATCH 137/303] mm/memory-failure.c: fix bug triggered by unpoisoning empty zero page

Injecting memory failure for page 0x19d0 at 0xb77d2000
MCE 0x19d0: non LRU page recovery: Ignored
MCE: Software-unpoisoned page 0x19d0
BUG: Bad page state in process bash pfn:019d0
page:f3461a00 count:0 mapcount:0 mapping: (null) index:0x0
page flags: 0x40000404(referenced|reserved)
Modules linked in: nfsd auth_rpcgss i915 nfs_acl nfs lockd video drm_kms_helper drm bnep rfcomm sunrpc bluetooth psmouse parport_pc ppdev lp serio_raw fscache parport gpio_ich lpc_ich mac_hid i2c_algo_bit tpm_tis wmi usb_storage hid_generic usbhid hid e1000e firewire_ohci firewire_core ahci ptp libahci pps_core crc_itu_t
CPU: 3 PID: 2123 Comm: bash Not tainted 3.11.0-rc6+ #12
Hardware name: LENOVO 7034DD7/ , BIOS 9HKT47AUS 01//2012
 00000000 00000000 e9625ea0 c15ec49b f3461a00 e9625eb8 c15ea119 c17cbf18
 ef084314 000019d0 f3461a00 e9625ed8 c110dc8a f3461a00 00000001 00000000
 f3461a00 40000404 00000000 e9625ef8 c110dcc1 f3461a00 f3461a00 000019d0
Call Trace:
  dump_stack+0x41/0x52
  bad_page+0xcf/0xeb
  free_pages_prepare+0x12a/0x140
  free_hot_cold_page+0x21/0x110
  __put_single_page+0x21/0x30
  put_page+0x25/0x40
  unpoison_memory+0x107/0x200
  hwpoison_unpoison+0x20/0x30
  simple_attr_write+0xb6/0xd0
  vfs_write+0xa0/0x1b0
  SyS_write+0x4f/0x90
  sysenter_do_call+0x12/0x22
Disabling lock debugging due to kernel taint

Testcase:

#define _GNU_SOURCE
#include
#include
#include
#include
#include
#include
#include

#define PAGES_TO_TEST 1
#define PAGE_SIZE 4096

int main(void)
{
	char *mem;

	mem = mmap(NULL, PAGES_TO_TEST * PAGE_SIZE,
		   PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);

	if (madvise(mem, PAGES_TO_TEST * PAGE_SIZE, MADV_HWPOISON) == -1)
		return -1;

	munmap(mem, PAGES_TO_TEST * PAGE_SIZE);

	return 0;
}

There is one page reference count for the default empty zero page;
madvise_hwpoison adds another one via get_user_pages_fast.
memory_hwpoison drops one page reference count since it is a non-LRU
page. unpoison_memory then releases the last page reference count and
frees the empty zero page to the buddy system, which is not correct
since the empty zero page has the PG_reserved flag. This patch fixes it
by not reducing the page reference count below 1 for the empty zero
page.
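As a minimal sketch of the reference discipline involved (illustrative
only, not part of the patch; the helper name is hypothetical), every
successful get_user_pages_fast() reference must be balanced by a
put_page() on each exit path:

static int touch_one_user_page(unsigned long start)
{
	struct page *p;
	int ret = get_user_pages_fast(start, 1, 0, &p);

	if (ret != 1)
		return ret;	/* no reference was taken */

	/* ... inspect or poison p ... */

	put_page(p);		/* balance the gup reference */
	return 0;
}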
Signed-off-by: Wanpeng Li
Reviewed-by: Naoya Horiguchi
Cc: Andi Kleen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory-failure.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index d04f99004c9f..d472e14c6808 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1388,7 +1388,7 @@ int unpoison_memory(unsigned long pfn)
 	unlock_page(page);
 
 	put_page(page);
-	if (freeit)
+	if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
 		put_page(page);
 
 	return 0;

From 8302423b8e85ad6caa8687f06157d43f684a42e2 Mon Sep 17 00:00:00 2001
From: Wanpeng Li
Date: Wed, 11 Sep 2013 14:23:02 -0700
Subject: [PATCH 138/303] mm/madvise.c: fix return value of madvise_hwpoison()

The return value outside the for loop is always zero, which means
madvise_hwpoison() returns success; however, this is not true for
soft_offline_page() with a failure return value.

Signed-off-by: Wanpeng Li
Reviewed-by: Naoya Horiguchi
Cc: Andi Kleen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/madvise.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index 30293ab95b06..51bffa414027 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -361,7 +361,7 @@ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
 				page_to_pfn(p), start);
 			ret = soft_offline_page(p, MF_COUNT_INCREASED);
 			if (ret)
-				break;
+				return ret;
 			continue;
 		}
 		pr_info("Injecting memory failure for page %#lx at %#lx\n",

From 325c4ef5c4b17372c3222d896040d7848e67fbdb Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Wed, 11 Sep 2013 14:23:03 -0700
Subject: [PATCH 139/303] mm/madvise.c:madvise_hwpoison(): remove local `ret'

madvise_hwpoison() has two locals called "ret". Fix it all up.

Cc: Wanpeng Li
Cc: Naoya Horiguchi
Cc: Andi Kleen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/madvise.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index 51bffa414027..6975bc812542 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -343,15 +343,16 @@ static long madvise_remove(struct vm_area_struct *vma,
  */
 static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
 {
-	int ret = 0;
-
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	for (; start < end; start += PAGE_SIZE) {
 		struct page *p;
-		int ret = get_user_pages_fast(start, 1, 0, &p);
+		int ret;
+
+		ret = get_user_pages_fast(start, 1, 0, &p);
 		if (ret != 1)
 			return ret;
+
 		if (PageHWPoison(p)) {
 			put_page(p);
 			continue;
@@ -369,7 +370,7 @@ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
 		/* Ignore return value for now */
 		memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
 	}
-	return ret;
+	return 0;
 }
 #endif

From 146d7009b45cdb45ec3be8ad73177dae58f4bc91 Mon Sep 17 00:00:00 2001
From: Junxiao Bi
Date: Wed, 11 Sep 2013 14:23:04 -0700
Subject: [PATCH 140/303] writeback: fix race that cause writeback hung

There is a race between marking an inode dirty and the writeback
thread; see the following scenario. In this case, the writeback thread
will not run even though there is dirty IO.

__mark_inode_dirty()                          bdi_writeback_workfn()
    ...                                           ...
    spin_lock(&inode->i_lock);
    ...
    if (bdi_cap_writeback_dirty(bdi)) {
        <<< assume wb has dirty_io, so wakeup_bdi is false.
        <<< the following inode_dirty also has wakeup_bdi false.
        if (!wb_has_dirty_io(&bdi->wb))
            wakeup_bdi = true;
    }
    spin_unlock(&inode->i_lock);
                                                  <<< assume last dirty_io is removed here.
                                                  pages_written = wb_do_writeback(wb);
                                                  ...
                                                  <<< work_list empty and wb has no dirty_io,
                                                  <<< delayed_work will not be queued.
                                                  if (!list_empty(&bdi->work_list) ||
                                                      (wb_has_dirty_io(wb) && dirty_writeback_interval))
                                                      queue_delayed_work(bdi_wq, &wb->dwork,
                                                          msecs_to_jiffies(dirty_writeback_interval * 10));
    spin_lock(&bdi->wb.list_lock);
    inode->dirtied_when = jiffies;
    <<< new dirty_io is added.
    list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
    spin_unlock(&bdi->wb.list_lock);

    <<< though there is dirty_io, wakeup_bdi is false,
    <<< so the writeback thread will not be woken up and
    <<< the new dirty_io will not be flushed.
    if (wakeup_bdi)
        bdi_wakeup_thread_delayed(bdi);

Writeback will not run until a new flush work is queued. This may cause
a lot of dirty pages to stay in memory for a long time.

Signed-off-by: Junxiao Bi
Reviewed-by: Jan Kara
Cc: Fengguang Wu
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/fs-writeback.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 54b3c31c2f0d..30f6f27d5a59 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1171,6 +1171,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 			bool wakeup_bdi = false;
 			bdi = inode_to_bdi(inode);
 
+			spin_unlock(&inode->i_lock);
+			spin_lock(&bdi->wb.list_lock);
 			if (bdi_cap_writeback_dirty(bdi)) {
 				WARN(!test_bit(BDI_registered, &bdi->state),
 				     "bdi-%s not registered\n", bdi->name);
@@ -1185,8 +1187,6 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 				wakeup_bdi = true;
 		}
 
-		spin_unlock(&inode->i_lock);
-		spin_lock(&bdi->wb.list_lock);
 		inode->dirtied_when = jiffies;
 		list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
 		spin_unlock(&bdi->wb.list_lock);

From 2d8a17813ec817fa58addd2c92b4ca8cae5bafbb Mon Sep 17 00:00:00 2001
From: Yanchuan Nian
Date: Wed, 11 Sep 2013 14:23:05 -0700
Subject: [PATCH 141/303] mm/mmap: remove unnecessary assignment

pgoff is not used after the statement "pgoff = vma->vm_pgoff;", so the
assignment is redundant.

Signed-off-by: Yanchuan Nian
Acked-by: David Rientjes
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/mmap.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index 51958d192a48..9d548512ff8a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1566,7 +1566,6 @@ munmap_back:
 		WARN_ON_ONCE(addr != vma->vm_start);
 
 		addr = vma->vm_start;
-		pgoff = vma->vm_pgoff;
 		vm_flags = vma->vm_flags;
 	} else if (vm_flags & VM_SHARED) {
 		error = shmem_zero_setup(vma);

From ade34a35722fab0c8a1d162a15b919d20373a894 Mon Sep 17 00:00:00 2001
From: Joe Perches
Date: Wed, 11 Sep 2013 14:23:06 -0700
Subject: [PATCH 142/303] lib/genalloc.c: convert kmalloc_node(...GFP_ZERO...) to kzalloc_node(...)

Use the helper function instead of __GFP_ZERO.

Signed-off-by: Joe Perches
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 lib/genalloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/genalloc.c b/lib/genalloc.c
index 2a39bf62d8c1..c522facfa3e5 100644
--- a/lib/genalloc.c
+++ b/lib/genalloc.c
@@ -187,7 +187,7 @@ int gen_pool_add_virt(struct gen_pool *pool, unsigned long virt, phys_addr_t phy
 	int nbytes = sizeof(struct gen_pool_chunk) +
 				BITS_TO_LONGS(nbits) * sizeof(long);
 
-	chunk = kmalloc_node(nbytes, GFP_KERNEL | __GFP_ZERO, nid);
+	chunk = kzalloc_node(nbytes, GFP_KERNEL, nid);
 	if (unlikely(chunk == NULL))
 		return -ENOMEM;

From 7b5219db00d0afaf3d2b0e8c443ffa892455ba75 Mon Sep 17 00:00:00 2001
From: Joe Perches
Date: Wed, 11 Sep 2013 14:23:07 -0700
Subject: [PATCH 143/303] mm/mempool.c: convert kmalloc_node(...GFP_ZERO...)
 to kzalloc_node(...)

Use the helper function instead of __GFP_ZERO.

Signed-off-by: Joe Perches
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/mempool.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/mempool.c b/mm/mempool.c
index 54990476c049..659aa42bad16 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -73,7 +73,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
 			       gfp_t gfp_mask, int node_id)
 {
 	mempool_t *pool;
-	pool = kmalloc_node(sizeof(*pool), gfp_mask | __GFP_ZERO, node_id);
+	pool = kzalloc_node(sizeof(*pool), gfp_mask, node_id);
 	if (!pool)
 		return NULL;
 	pool->elements = kmalloc_node(min_nr * sizeof(void *),

From 2bff24a3707093c435ab3241c47dcdb5f16e432b Mon Sep 17 00:00:00 2001
From: Greg Thelen
Date: Wed, 11 Sep 2013 14:23:08 -0700
Subject: [PATCH 144/303] memcg: fix multiple large threshold notifications

A memory cgroup with (1) multiple threshold notifications and (2) at
least one threshold >=2G was not reliable. Specifically the
notifications would either not fire or would not fire in the proper
order.

The __mem_cgroup_threshold() signaling logic depends on keeping 64 bit
thresholds in sorted order. mem_cgroup_usage_register_event() sorts
them with compare_thresholds(), which returns the difference of two 64
bit thresholds as an int. If the difference is positive but has
bit[31] set, then sort() treats the difference as negative and breaks
sort order.

This fix compares the two arbitrary 64 bit thresholds returning the
classic -1, 0, 1 result.

The test below sets two notifications (at 0x1000 and 0x81001000):

  cd /sys/fs/cgroup/memory
  mkdir x
  for x in 4096 2164264960; do
    cgroup_event_listener x/memory.usage_in_bytes $x |
      sed "s/^/$x listener:/" &
  done
  echo $$ > x/cgroup.procs
  anon_leaker 500M

v3.11-rc7 fails to signal the 4096 event listener:
  Leaking...
  Done leaking pages.

Patched v3.11-rc7 properly notifies:
  Leaking...
  4096 listener:2013:8:31:14:13:36
  Done leaking pages.

The fixed bug is old. It appears to date back to the introduction of
memcg threshold notifications in v2.6.34-rc1-116-g2e72b6347c94 "memcg:
implement memory thresholds"

Signed-off-by: Greg Thelen
Acked-by: Michal Hocko
Acked-by: Kirill A. Shutemov
Acked-by: Johannes Weiner
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memcontrol.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5ca1dcf77ce9..c6bd28edd533 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5591,7 +5591,13 @@ static int compare_thresholds(const void *a, const void *b)
 	const struct mem_cgroup_threshold *_a = a;
 	const struct mem_cgroup_threshold *_b = b;
 
-	return _a->threshold - _b->threshold;
+	if (_a->threshold > _b->threshold)
+		return 1;
+
+	if (_a->threshold < _b->threshold)
+		return -1;
+
+	return 0;
 }
 
 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)

From 729377d559607ea40d714e8f7092f40f643cf01f Mon Sep 17 00:00:00 2001
From: Shuah Khan
Date: Wed, 11 Sep 2013 14:23:09 -0700
Subject: [PATCH 145/303] pnp: change pnp bus pm_ops to invoke pnp driver dev_pm_ops if specified
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

pnp_bus_suspend() and pnp_bus_resume() invoke the legacy pm_ops from
the pnp_driver. Change pnp_bus_suspend() and pnp_bus_resume() to check
whether the pnp driver has dev_pm_ops and, if so, call those; only if
dev_pm_ops don't exist do they fall back to the legacy pm_ops. Without
this change, a pnp_driver's dev_pm_ops will not get called.
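As a hedged illustration of what this enables (the driver and symbol
names below are hypothetical, not taken from the patch set), a pnp
driver can then publish dev_pm_ops and drop its legacy callbacks:

#ifdef CONFIG_PM_SLEEP
static int foo_suspend(struct device *dev)
{
	/* ... quiesce the device ... */
	return 0;
}

static int foo_resume(struct device *dev)
{
	/* ... reinitialize the device ... */
	return 0;
}
#endif

static SIMPLE_DEV_PM_OPS(foo_pm_ops, foo_suspend, foo_resume);

static struct pnp_driver foo_pnp_driver = {
	.name	= "foo",
	.driver	= {
		.pm = &foo_pm_ops,	/* found by the new bus check */
	},
	/* legacy .suspend/.resume callbacks no longer needed */
};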
In addition to the pnp driver bus pm_ops change to invoke driver
dev_pm_ops, this patch set contains changes to the rtc-cmos, tpm_tis,
and apple-gmux pnp drivers to convert them from legacy pm_ops to
dev_pm_ops.

This patch (of 4):

pnp_bus_suspend() and pnp_bus_resume() invoke the legacy pm_ops from
the pnp_driver. Change pnp_bus_suspend() and pnp_bus_resume() to check
whether the pnp driver has dev_pm_ops and, if so, call those; only if
dev_pm_ops don't exist do they fall back to the legacy pm_ops. Without
this change, a pnp_driver's dev_pm_ops will not get called.

Signed-off-by: Shuah Khan
Cc: Matthew Garrett
Cc: Leonidas Da Silva Barbosa
Cc: Ashley Lai
Cc: Rajiv Andrade
Cc: Marcel Selhorst
Cc: Sirrix AG
Cc: Alessandro Zummo
Cc: "Rafael J. Wysocki"
Cc: Bjorn Helgaas
Cc: Grant Likely
Cc: Rob Herring
Cc: Peter Hüwe
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/pnp/driver.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/drivers/pnp/driver.c b/drivers/pnp/driver.c
index 12adb43a0693..a39ee38a9414 100644
--- a/drivers/pnp/driver.c
+++ b/drivers/pnp/driver.c
@@ -163,6 +163,13 @@ static int __pnp_bus_suspend(struct device *dev, pm_message_t state)
 	if (!pnp_drv)
 		return 0;
 
+	if (pnp_drv->driver.pm && pnp_drv->driver.pm->suspend) {
+		error = pnp_drv->driver.pm->suspend(dev);
+		suspend_report_result(pnp_drv->driver.pm->suspend, error);
+		if (error)
+			return error;
+	}
+
 	if (pnp_drv->suspend) {
 		error = pnp_drv->suspend(pnp_dev, state);
 		if (error)
@@ -211,6 +218,12 @@ static int pnp_bus_resume(struct device *dev)
 			return error;
 	}
 
+	if (pnp_drv->driver.pm && pnp_drv->driver.pm->resume) {
+		error = pnp_drv->driver.pm->resume(dev);
+		if (error)
+			return error;
+	}
+
 	if (pnp_drv->resume) {
 		error = pnp_drv->resume(pnp_dev);
 		if (error)

From a8a3808b43a077fbc738b26dc84d18b5db3044f9 Mon Sep 17 00:00:00 2001
From: Shuah Khan
Date: Wed, 11 Sep 2013 14:23:11 -0700
Subject: [PATCH 146/303] rtc: convert rtc-cmos to dev_pm_ops from legacy pm_ops
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Convert drivers/rtc/rtc-cmos to use dev_pm_ops instead of legacy
pm_ops. This patch depends on the pnp driver bus ops change to invoke
pnp_driver dev_pm_ops.

Signed-off-by: Shuah Khan
Cc: Matthew Garrett
Cc: Leonidas Da Silva Barbosa
Cc: Ashley Lai
Cc: Rajiv Andrade
Cc: Marcel Selhorst
Cc: Sirrix AG
Cc: Alessandro Zummo
Cc: "Rafael J.
Wysocki" Cc: Bjorn Helgaas Cc: Grant Likely Cc: Rob Herring Cc: Peter Hüwe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-cmos.c | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index be06d7150de5..24e733c98f8b 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -1018,23 +1018,6 @@ static void __exit cmos_pnp_remove(struct pnp_dev *pnp) cmos_do_remove(&pnp->dev); } -#ifdef CONFIG_PM - -static int cmos_pnp_suspend(struct pnp_dev *pnp, pm_message_t mesg) -{ - return cmos_suspend(&pnp->dev); -} - -static int cmos_pnp_resume(struct pnp_dev *pnp) -{ - return cmos_resume(&pnp->dev); -} - -#else -#define cmos_pnp_suspend NULL -#define cmos_pnp_resume NULL -#endif - static void cmos_pnp_shutdown(struct pnp_dev *pnp) { if (system_state == SYSTEM_POWER_OFF && !cmos_poweroff(&pnp->dev)) @@ -1060,8 +1043,11 @@ static struct pnp_driver cmos_pnp_driver = { /* flag ensures resume() gets called, and stops syslog spam */ .flags = PNP_DRIVER_RES_DO_NOT_CHANGE, - .suspend = cmos_pnp_suspend, - .resume = cmos_pnp_resume, +#ifdef CONFIG_PM_SLEEP + .driver = { + .pm = &cmos_pm_ops, + }, +#endif }; #endif /* CONFIG_PNP */ From a2fa3fb0d9a0169b10789ea3e5ea7168494df93c Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 11 Sep 2013 14:23:13 -0700 Subject: [PATCH 147/303] tpm: convert tpm_tis driver to use dev_pm_ops from legacy pm_ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert drivers/char/tpm/tpm_tis.c to use dev_pm_ops instead of legacy pm_ops. This patch depends on pnp driver bus ops change to invoke pnp_driver dev_pm_ops. Signed-off-by: Shuah Khan Cc: Matthew Garrett Cc: Leonidas Da Silva Barbosa Cc: Ashley Lai Cc: Rajiv Andrade Cc: Marcel Selhorst Cc: Sirrix AG Cc: Alessandro Zummo Cc: "Rafael J. 
Wysocki" Cc: Bjorn Helgaas Cc: Grant Likely Cc: Rob Herring Cc: Peter Hüwe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/tpm/tpm_tis.c | 60 +++++++++++++++----------------------- 1 file changed, 24 insertions(+), 36 deletions(-) diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c index 4519cb332987..5796d0157ce0 100644 --- a/drivers/char/tpm/tpm_tis.c +++ b/drivers/char/tpm/tpm_tis.c @@ -766,6 +766,25 @@ static void tpm_tis_reenable_interrupts(struct tpm_chip *chip) } #endif +#ifdef CONFIG_PM_SLEEP +static int tpm_tis_resume(struct device *dev) +{ + struct tpm_chip *chip = dev_get_drvdata(dev); + int ret; + + if (chip->vendor.irq) + tpm_tis_reenable_interrupts(chip); + + ret = tpm_pm_resume(dev); + if (!ret) + tpm_do_selftest(chip); + + return ret; +} +#endif + +static SIMPLE_DEV_PM_OPS(tpm_tis_pm, tpm_pm_suspend, tpm_tis_resume); + #ifdef CONFIG_PNP static int tpm_tis_pnp_init(struct pnp_dev *pnp_dev, const struct pnp_device_id *pnp_id) @@ -787,26 +806,6 @@ static int tpm_tis_pnp_init(struct pnp_dev *pnp_dev, return tpm_tis_init(&pnp_dev->dev, start, len, irq); } -static int tpm_tis_pnp_suspend(struct pnp_dev *dev, pm_message_t msg) -{ - return tpm_pm_suspend(&dev->dev); -} - -static int tpm_tis_pnp_resume(struct pnp_dev *dev) -{ - struct tpm_chip *chip = pnp_get_drvdata(dev); - int ret; - - if (chip->vendor.irq) - tpm_tis_reenable_interrupts(chip); - - ret = tpm_pm_resume(&dev->dev); - if (!ret) - tpm_do_selftest(chip); - - return ret; -} - static struct pnp_device_id tpm_pnp_tbl[] = { {"PNP0C31", 0}, /* TPM */ {"ATM1200", 0}, /* Atmel */ @@ -835,9 +834,12 @@ static struct pnp_driver tis_pnp_driver = { .name = "tpm_tis", .id_table = tpm_pnp_tbl, .probe = tpm_tis_pnp_init, - .suspend = tpm_tis_pnp_suspend, - .resume = tpm_tis_pnp_resume, .remove = tpm_tis_pnp_remove, +#ifdef CONFIG_PM_SLEEP + .driver = { + .pm = &tpm_tis_pm, + }, +#endif }; #define TIS_HID_USR_IDX sizeof(tpm_pnp_tbl)/sizeof(struct pnp_device_id) -2 @@ -846,20 +848,6 @@ module_param_string(hid, tpm_pnp_tbl[TIS_HID_USR_IDX].id, MODULE_PARM_DESC(hid, "Set additional specific HID for this driver to probe"); #endif -#ifdef CONFIG_PM_SLEEP -static int tpm_tis_resume(struct device *dev) -{ - struct tpm_chip *chip = dev_get_drvdata(dev); - - if (chip->vendor.irq) - tpm_tis_reenable_interrupts(chip); - - return tpm_pm_resume(dev); -} -#endif - -static SIMPLE_DEV_PM_OPS(tpm_tis_pm, tpm_pm_suspend, tpm_tis_resume); - static struct platform_driver tis_drv = { .driver = { .name = "tpm_tis", From 8aa6c2166b5184fb2344062cf2fa229b197c1f84 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 11 Sep 2013 14:23:15 -0700 Subject: [PATCH 148/303] platform: convert apple-gmux driver to dev_pm_ops from legacy pm_ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert drivers/platform/x86/apple-gmux to use dev_pm_ops instead of legacy pm_ops. This patch depends on pnp driver bus ops change to invoke pnp_driver dev_pm_ops. Signed-off-by: Shuah Khan Cc: Matthew Garrett Cc: Leonidas Da Silva Barbosa Cc: Ashley Lai Cc: Rajiv Andrade Cc: Marcel Selhorst Cc: Sirrix AG Cc: Alessandro Zummo Cc: "Rafael J. 
Wysocki" Cc: Bjorn Helgaas Cc: Grant Likely Cc: Rob Herring Cc: Peter Hüwe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/platform/x86/apple-gmux.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/platform/x86/apple-gmux.c b/drivers/platform/x86/apple-gmux.c index f74bfcbb7bad..8eea2efbbb6d 100644 --- a/drivers/platform/x86/apple-gmux.c +++ b/drivers/platform/x86/apple-gmux.c @@ -393,17 +393,21 @@ static void gmux_notify_handler(acpi_handle device, u32 value, void *context) complete(&gmux_data->powerchange_done); } -static int gmux_suspend(struct pnp_dev *pnp, pm_message_t state) +static int gmux_suspend(struct device *dev) { + struct pnp_dev *pnp = to_pnp_dev(dev); struct apple_gmux_data *gmux_data = pnp_get_drvdata(pnp); + gmux_data->resume_client_id = gmux_active_client(gmux_data); gmux_disable_interrupts(gmux_data); return 0; } -static int gmux_resume(struct pnp_dev *pnp) +static int gmux_resume(struct device *dev) { + struct pnp_dev *pnp = to_pnp_dev(dev); struct apple_gmux_data *gmux_data = pnp_get_drvdata(pnp); + gmux_enable_interrupts(gmux_data); gmux_switchto(gmux_data->resume_client_id); if (gmux_data->power_state == VGA_SWITCHEROO_OFF) @@ -605,13 +609,19 @@ static const struct pnp_device_id gmux_device_ids[] = { {"", 0} }; +static const struct dev_pm_ops gmux_dev_pm_ops = { + .suspend = gmux_suspend, + .resume = gmux_resume, +}; + static struct pnp_driver gmux_pnp_driver = { .name = "apple-gmux", .probe = gmux_probe, .remove = gmux_remove, .id_table = gmux_device_ids, - .suspend = gmux_suspend, - .resume = gmux_resume + .driver = { + .pm = &gmux_dev_pm_ops, + }, }; static int __init apple_gmux_init(void) From 20d0e57017b69e7e4ae7166c43f3a3f023ab9702 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Wed, 11 Sep 2013 14:23:17 -0700 Subject: [PATCH 149/303] drivers/firmware/google/gsmi.c: replace strict_strtoul() with kstrtoul() The use of strict_strtoul() is not preferred, because strict_strtoul() is obsolete. Thus, kstrtoul() should be used. Signed-off-by: Jingoo Han Cc: Matt Fleming Cc: Tom Gundersen Cc: Mike Waychison Acked-by: Mike Waychison Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/firmware/google/gsmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/google/gsmi.c b/drivers/firmware/google/gsmi.c index acba0b9f4406..6eb535ffeddc 100644 --- a/drivers/firmware/google/gsmi.c +++ b/drivers/firmware/google/gsmi.c @@ -525,7 +525,7 @@ static ssize_t gsmi_clear_eventlog_store(struct kobject *kobj, u32 data_type; } param; - rc = strict_strtoul(buf, 0, &val); + rc = kstrtoul(buf, 0, &val); if (rc) return rc; From 3ddc5b46a8e90f3c9251338b60191d0a804b0d92 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 11 Sep 2013 14:23:18 -0700 Subject: [PATCH 150/303] kernel-wide: fix missing validations on __get/__put/__copy_to/__copy_from_user() I found the following pattern that leads in to interesting findings: grep -r "ret.*|=.*__put_user" * grep -r "ret.*|=.*__get_user" * grep -r "ret.*|=.*__copy" * The __put_user() calls in compat_ioctl.c, ptrace compat, signal compat, since those appear in compat code, we could probably expect the kernel addresses not to be reachable in the lower 32-bit range, so I think they might not be exploitable. For the "__get_user" cases, I don't think those are exploitable: the worse that can happen is that the kernel will copy kernel memory into in-kernel buffers, and will fail immediately afterward. 
The alpha csum_partial_copy_from_user() seems to be missing the access_ok() check entirely. The fix is inspired from x86. This could lead to information leak on alpha. I also noticed that many architectures map csum_partial_copy_from_user() to csum_partial_copy_generic(), but I wonder if the latter is performing the access checks on every architectures. Signed-off-by: Mathieu Desnoyers Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Jens Axboe Cc: Oleg Nesterov Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/lib/csum_partial_copy.c | 5 +++ arch/sparc/kernel/sys_sparc32.c | 12 +++---- block/compat_ioctl.c | 2 +- kernel/signal.c | 4 +-- net/socket.c | 50 +++++++++++++++--------------- 5 files changed, 39 insertions(+), 34 deletions(-) diff --git a/arch/alpha/lib/csum_partial_copy.c b/arch/alpha/lib/csum_partial_copy.c index 40736da9bea8..ffb19b7da999 100644 --- a/arch/alpha/lib/csum_partial_copy.c +++ b/arch/alpha/lib/csum_partial_copy.c @@ -338,6 +338,11 @@ csum_partial_copy_from_user(const void __user *src, void *dst, int len, unsigned long doff = 7 & (unsigned long) dst; if (len) { + if (!access_ok(VERIFY_READ, src, len)) { + *errp = -EFAULT; + memset(dst, 0, len); + return sum; + } if (!doff) { if (!soff) checksum = csum_partial_cfu_aligned( diff --git a/arch/sparc/kernel/sys_sparc32.c b/arch/sparc/kernel/sys_sparc32.c index 3d0ddbc005fe..71368850dfc0 100644 --- a/arch/sparc/kernel/sys_sparc32.c +++ b/arch/sparc/kernel/sys_sparc32.c @@ -169,10 +169,10 @@ COMPAT_SYSCALL_DEFINE5(rt_sigaction, int, sig, new_ka.ka_restorer = restorer; ret = get_user(u_handler, &act->sa_handler); new_ka.sa.sa_handler = compat_ptr(u_handler); - ret |= __copy_from_user(&set32, &act->sa_mask, sizeof(compat_sigset_t)); + ret |= copy_from_user(&set32, &act->sa_mask, sizeof(compat_sigset_t)); sigset_from_compat(&new_ka.sa.sa_mask, &set32); - ret |= __get_user(new_ka.sa.sa_flags, &act->sa_flags); - ret |= __get_user(u_restorer, &act->sa_restorer); + ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags); + ret |= get_user(u_restorer, &act->sa_restorer); new_ka.sa.sa_restorer = compat_ptr(u_restorer); if (ret) return -EFAULT; @@ -183,9 +183,9 @@ COMPAT_SYSCALL_DEFINE5(rt_sigaction, int, sig, if (!ret && oact) { sigset_to_compat(&set32, &old_ka.sa.sa_mask); ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler); - ret |= __copy_to_user(&oact->sa_mask, &set32, sizeof(compat_sigset_t)); - ret |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags); - ret |= __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer); + ret |= copy_to_user(&oact->sa_mask, &set32, sizeof(compat_sigset_t)); + ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags); + ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer); if (ret) ret = -EFAULT; } diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 7e5d474dc6ba..fbd5a67cb773 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -70,7 +70,7 @@ static int compat_hdio_getgeo(struct gendisk *disk, struct block_device *bdev, return ret; ret = copy_to_user(ugeo, &geo, 4); - ret |= __put_user(geo.start, &ugeo->start); + ret |= put_user(geo.start, &ugeo->start); if (ret) ret = -EFAULT; diff --git a/kernel/signal.c b/kernel/signal.c index 50e41075ac77..ded28b91fa53 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3394,7 +3394,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, new_ka.sa.sa_restorer = compat_ptr(restorer); #endif ret |= copy_from_user(&mask, &act->sa_mask, 
sizeof(mask)); - ret |= __get_user(new_ka.sa.sa_flags, &act->sa_flags); + ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags); if (ret) return -EFAULT; sigset_from_compat(&new_ka.sa.sa_mask, &mask); @@ -3406,7 +3406,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler); ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask)); - ret |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags); + ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags); #ifdef __ARCH_HAS_SA_RESTORER ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer); diff --git a/net/socket.c b/net/socket.c index b2d7c629eeb9..0ceaa5cb9ead 100644 --- a/net/socket.c +++ b/net/socket.c @@ -3072,12 +3072,12 @@ static int compat_sioc_ifmap(struct net *net, unsigned int cmd, uifmap32 = &uifr32->ifr_ifru.ifru_map; err = copy_from_user(&ifr, uifr32, sizeof(ifr.ifr_name)); - err |= __get_user(ifr.ifr_map.mem_start, &uifmap32->mem_start); - err |= __get_user(ifr.ifr_map.mem_end, &uifmap32->mem_end); - err |= __get_user(ifr.ifr_map.base_addr, &uifmap32->base_addr); - err |= __get_user(ifr.ifr_map.irq, &uifmap32->irq); - err |= __get_user(ifr.ifr_map.dma, &uifmap32->dma); - err |= __get_user(ifr.ifr_map.port, &uifmap32->port); + err |= get_user(ifr.ifr_map.mem_start, &uifmap32->mem_start); + err |= get_user(ifr.ifr_map.mem_end, &uifmap32->mem_end); + err |= get_user(ifr.ifr_map.base_addr, &uifmap32->base_addr); + err |= get_user(ifr.ifr_map.irq, &uifmap32->irq); + err |= get_user(ifr.ifr_map.dma, &uifmap32->dma); + err |= get_user(ifr.ifr_map.port, &uifmap32->port); if (err) return -EFAULT; @@ -3088,12 +3088,12 @@ static int compat_sioc_ifmap(struct net *net, unsigned int cmd, if (cmd == SIOCGIFMAP && !err) { err = copy_to_user(uifr32, &ifr, sizeof(ifr.ifr_name)); - err |= __put_user(ifr.ifr_map.mem_start, &uifmap32->mem_start); - err |= __put_user(ifr.ifr_map.mem_end, &uifmap32->mem_end); - err |= __put_user(ifr.ifr_map.base_addr, &uifmap32->base_addr); - err |= __put_user(ifr.ifr_map.irq, &uifmap32->irq); - err |= __put_user(ifr.ifr_map.dma, &uifmap32->dma); - err |= __put_user(ifr.ifr_map.port, &uifmap32->port); + err |= put_user(ifr.ifr_map.mem_start, &uifmap32->mem_start); + err |= put_user(ifr.ifr_map.mem_end, &uifmap32->mem_end); + err |= put_user(ifr.ifr_map.base_addr, &uifmap32->base_addr); + err |= put_user(ifr.ifr_map.irq, &uifmap32->irq); + err |= put_user(ifr.ifr_map.dma, &uifmap32->dma); + err |= put_user(ifr.ifr_map.port, &uifmap32->port); if (err) err = -EFAULT; } @@ -3167,25 +3167,25 @@ static int routing_ioctl(struct net *net, struct socket *sock, struct in6_rtmsg32 __user *ur6 = argp; ret = copy_from_user(&r6.rtmsg_dst, &(ur6->rtmsg_dst), 3 * sizeof(struct in6_addr)); - ret |= __get_user(r6.rtmsg_type, &(ur6->rtmsg_type)); - ret |= __get_user(r6.rtmsg_dst_len, &(ur6->rtmsg_dst_len)); - ret |= __get_user(r6.rtmsg_src_len, &(ur6->rtmsg_src_len)); - ret |= __get_user(r6.rtmsg_metric, &(ur6->rtmsg_metric)); - ret |= __get_user(r6.rtmsg_info, &(ur6->rtmsg_info)); - ret |= __get_user(r6.rtmsg_flags, &(ur6->rtmsg_flags)); - ret |= __get_user(r6.rtmsg_ifindex, &(ur6->rtmsg_ifindex)); + ret |= get_user(r6.rtmsg_type, &(ur6->rtmsg_type)); + ret |= get_user(r6.rtmsg_dst_len, &(ur6->rtmsg_dst_len)); + ret |= get_user(r6.rtmsg_src_len, &(ur6->rtmsg_src_len)); + ret |= get_user(r6.rtmsg_metric, &(ur6->rtmsg_metric)); + ret |= get_user(r6.rtmsg_info, &(ur6->rtmsg_info)); + ret |= get_user(r6.rtmsg_flags, &(ur6->rtmsg_flags)); + ret |= 
get_user(r6.rtmsg_ifindex, &(ur6->rtmsg_ifindex)); r = (void *) &r6; } else { /* ipv4 */ struct rtentry32 __user *ur4 = argp; ret = copy_from_user(&r4.rt_dst, &(ur4->rt_dst), 3 * sizeof(struct sockaddr)); - ret |= __get_user(r4.rt_flags, &(ur4->rt_flags)); - ret |= __get_user(r4.rt_metric, &(ur4->rt_metric)); - ret |= __get_user(r4.rt_mtu, &(ur4->rt_mtu)); - ret |= __get_user(r4.rt_window, &(ur4->rt_window)); - ret |= __get_user(r4.rt_irtt, &(ur4->rt_irtt)); - ret |= __get_user(rtdev, &(ur4->rt_dev)); + ret |= get_user(r4.rt_flags, &(ur4->rt_flags)); + ret |= get_user(r4.rt_metric, &(ur4->rt_metric)); + ret |= get_user(r4.rt_mtu, &(ur4->rt_mtu)); + ret |= get_user(r4.rt_window, &(ur4->rt_window)); + ret |= get_user(r4.rt_irtt, &(ur4->rt_irtt)); + ret |= get_user(rtdev, &(ur4->rt_dev)); if (rtdev) { ret |= copy_from_user(devname, compat_ptr(rtdev), 15); r4.rt_dev = (char __user __force *)devname; From 54a33b1b1470ada14fa2998e8b48ad4a0ef6a916 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 11 Sep 2013 14:23:19 -0700 Subject: [PATCH 151/303] kernel/modsign_pubkey.c: fix init const for module signing code const has to use __initconst, not __initdata Signed-off-by: Andi Kleen Acked-by: David Howells Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/modsign_pubkey.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c index 2b6e69909c39..7cbd4507a7e6 100644 --- a/kernel/modsign_pubkey.c +++ b/kernel/modsign_pubkey.c @@ -18,14 +18,14 @@ struct key *modsign_keyring; -extern __initdata const u8 modsign_certificate_list[]; -extern __initdata const u8 modsign_certificate_list_end[]; +extern __initconst const u8 modsign_certificate_list[]; +extern __initconst const u8 modsign_certificate_list_end[]; /* * We need to make sure ccache doesn't cache the .o file as it doesn't notice * if modsign.pub changes. */ -static __initdata const char annoy_ccache[] = __TIME__ "foo"; +static __initconst const char annoy_ccache[] = __TIME__ "foo"; /* * Load the compiled-in keys From a6b088875b5cfc2be95242826f31523214c083a7 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 11 Sep 2013 14:23:20 -0700 Subject: [PATCH 152/303] lto, watchdog/hpwdt.c: make assembler label global We cannot assume that the inline assembler code always ends up in the same file as the original C file. 
So make global any assembler labels that are referenced from C via
"extern".

Signed-off-by: Andi Kleen
Cc: Wim Van Sebroeck
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/watchdog/hpwdt.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c
index de7e4f497222..5be5e3d14f79 100644
--- a/drivers/watchdog/hpwdt.c
+++ b/drivers/watchdog/hpwdt.c
@@ -162,7 +162,8 @@ extern asmlinkage void asminline_call(struct cmn_registers *pi86Regs,
 #define HPWDT_ARCH 32
 
 asm(".text \n\t"
-    ".align 4 \n"
+    ".align 4 \n\t"
+    ".globl asminline_call \n"
     "asminline_call: \n\t"
     "pushl %ebp \n\t"
     "movl %esp, %ebp \n\t"
@@ -352,7 +353,8 @@ static int detect_cru_service(void)
 #define HPWDT_ARCH 64
 
 asm(".text \n\t"
-    ".align 4 \n"
+    ".align 4 \n\t"
+    ".globl asminline_call \n"
     "asminline_call: \n\t"
     "pushq %rbp \n\t"
     "movq %rsp, %rbp \n\t"

From bc5c8f0783a4a2b43d05155782e71a22a91b26a5 Mon Sep 17 00:00:00 2001
From: Gu Zheng
Date: Wed, 11 Sep 2013 14:23:21 -0700
Subject: [PATCH 153/303] fs/bio-integrity: fix a potential mem leak

Free bio_integrity_pool in the failure path of biovec_create_pool() in
bioset_integrity_create().

Signed-off-by: Gu Zheng
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/bio-integrity.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 8fb42916d8a2..60250847929f 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -716,14 +716,15 @@ int bioset_integrity_create(struct bio_set *bs, int pool_size)
 		return 0;
 
 	bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab);
-
-	bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size);
-	if (!bs->bvec_integrity_pool)
-		return -1;
-
 	if (!bs->bio_integrity_pool)
 		return -1;
 
+	bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size);
+	if (!bs->bvec_integrity_pool) {
+		mempool_destroy(bs->bio_integrity_pool);
+		return -1;
+	}
+
 	return 0;
 }
 EXPORT_SYMBOL(bioset_integrity_create);

From 60c323699bb308404dcb60e8808531e02651578a Mon Sep 17 00:00:00 2001
From: Chen Gang
Date: Wed, 11 Sep 2013 14:23:22 -0700
Subject: [PATCH 154/303] kernel/smp.c: free related resources when failure occurs in hotplug_cfd()

When a failure occurs in hotplug_cfd(), the resources already
allocated must be released; otherwise memory is leaked.
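The patch releases the partial allocations inline; the same unwind
ordering is often written with goto labels instead. A sketch of that
idiom over the same allocation sequence (illustrative only, not the
patch itself):

static int cfd_alloc(struct call_function_data *cfd, int cpu)
{
	if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
				     cpu_to_node(cpu)))
		return -ENOMEM;

	if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
				     cpu_to_node(cpu)))
		goto free_cpumask;

	cfd->csd = alloc_percpu(struct call_single_data);
	if (!cfd->csd)
		goto free_cpumask_ipi;

	return 0;

free_cpumask_ipi:
	free_cpumask_var(cfd->cpumask_ipi);	/* unwind in reverse order */
free_cpumask:
	free_cpumask_var(cfd->cpumask);
	return -ENOMEM;
}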
Signed-off-by: Chen Gang Acked-by: Wang YanQing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/smp.c b/kernel/smp.c index 449b707fc20d..3bb6ae533cdf 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -48,10 +48,13 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) cpu_to_node(cpu))) return notifier_from_errno(-ENOMEM); if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, - cpu_to_node(cpu))) + cpu_to_node(cpu))) { + free_cpumask_var(cfd->cpumask); return notifier_from_errno(-ENOMEM); + } cfd->csd = alloc_percpu(struct call_single_data); if (!cfd->csd) { + free_cpumask_var(cfd->cpumask_ipi); free_cpumask_var(cfd->cpumask); return notifier_from_errno(-ENOMEM); } From c14c338cb05c700a260480c197cfd6da8f8b7d2e Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 11 Sep 2013 14:23:23 -0700 Subject: [PATCH 155/303] kernel/spinlock.c: add default arch_*_relax definitions for GENERIC_LOCKBREAK When running with GENERIC_LOCKBREAK=y, the locking implementations emit calls to arch_{read,write,spin}_relax when spinning on a contended lock in order to allow architectures to favour the CPU owning the lock if possible. In reality, everybody apart from PowerPC and S390 just does cpu_relax() here, so make that the default behaviour and allow it to be overridden if required. Signed-off-by: Will Deacon Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/spinlock.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 5cdd8065a3ce..4b082b5cac9e 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -34,6 +34,20 @@ #else #define raw_read_can_lock(l) read_can_lock(l) #define raw_write_can_lock(l) write_can_lock(l) + +/* + * Some architectures can relax in favour of the CPU owning the lock. + */ +#ifndef arch_read_relax +# define arch_read_relax(l) cpu_relax() +#endif +#ifndef arch_write_relax +# define arch_write_relax(l) cpu_relax() +#endif +#ifndef arch_spin_relax +# define arch_spin_relax(l) cpu_relax() +#endif + /* * We build the __lock_function inlines here. They are too large for * inlining all over the place, but here is only one user per function From fa688207c9db48b64ab6538abc3fcdf26110b9ec Mon Sep 17 00:00:00 2001 From: David Daney Date: Wed, 11 Sep 2013 14:23:24 -0700 Subject: [PATCH 156/303] smp: quit unconditionally enabling irq in on_each_cpu_mask and on_each_cpu_cond As in commit f21afc25f9ed ("smp.h: Use local_irq_{save,restore}() in !SMP version of on_each_cpu()"), we don't want to enable irqs if they are not already enabled. There are currently no known problematical callers of these functions, but since it is a known failure pattern, we preemptively fix them. Since they are not trivial functions, make them non-inline by moving them to up.c. This also makes it so we don't have to fix #include dependancies for preempt_{disable,enable}. 
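A hypothetical caller of the cond variant, to show the intended use of
the interface described above (all names below are illustrative, not
from the patch):

static DEFINE_PER_CPU(int, pending_count);

static bool cpu_has_pending(int cpu, void *info)
{
	return per_cpu(pending_count, cpu) != 0;
}

static void flush_pending(void *info)
{
	__this_cpu_write(pending_count, 0);
	/* ... flush CPU-local state ... */
}

static void flush_all_pending(void)
{
	/* Runs flush_pending() only on CPUs where cpu_has_pending()
	 * returns true, possibly including the local CPU. */
	on_each_cpu_cond(cpu_has_pending, flush_pending, NULL, true,
			 GFP_KERNEL);
}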
Signed-off-by: David Daney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/smp.h | 62 ++++++++++++--------------------------------- kernel/up.c | 39 ++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 46 deletions(-) diff --git a/include/linux/smp.h b/include/linux/smp.h index c8488763277f..3724a9070907 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -29,6 +29,22 @@ extern unsigned int total_cpus; int smp_call_function_single(int cpuid, smp_call_func_t func, void *info, int wait); +/* + * Call a function on processors specified by mask, which might include + * the local one. + */ +void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, + void *info, bool wait); + +/* + * Call a function on each processor for which the supplied function + * cond_func returns a positive value. This may include the local + * processor. + */ +void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), + smp_call_func_t func, void *info, bool wait, + gfp_t gfp_flags); + #ifdef CONFIG_SMP #include @@ -100,22 +116,6 @@ static inline void call_function_init(void) { } */ int on_each_cpu(smp_call_func_t func, void *info, int wait); -/* - * Call a function on processors specified by mask, which might include - * the local one. - */ -void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, - void *info, bool wait); - -/* - * Call a function on each processor for which the supplied function - * cond_func returns a positive value. This may include the local - * processor. - */ -void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), - smp_call_func_t func, void *info, bool wait, - gfp_t gfp_flags); - /* * Mark the boot cpu "online" so that it can call console drivers in * printk() and can access its per-cpu storage. @@ -151,36 +151,6 @@ static inline int on_each_cpu(smp_call_func_t func, void *info, int wait) return 0; } -/* - * Note we still need to test the mask even for UP - * because we actually can get an empty mask from - * code that on SMP might call us without the local - * CPU in the mask. - */ -#define on_each_cpu_mask(mask, func, info, wait) \ - do { \ - if (cpumask_test_cpu(0, (mask))) { \ - local_irq_disable(); \ - (func)(info); \ - local_irq_enable(); \ - } \ - } while (0) -/* - * Preemption is disabled here to make sure the cond_func is called under the - * same condtions in UP and SMP. - */ -#define on_each_cpu_cond(cond_func, func, info, wait, gfp_flags)\ - do { \ - void *__info = (info); \ - preempt_disable(); \ - if ((cond_func)(0, __info)) { \ - local_irq_disable(); \ - (func)(__info); \ - local_irq_enable(); \ - } \ - preempt_enable(); \ - } while (0) - static inline void smp_send_reschedule(int cpu) { } #define smp_prepare_boot_cpu() do {} while (0) #define smp_call_function_many(mask, func, info, wait) \ diff --git a/kernel/up.c b/kernel/up.c index c54c75e9faf7..144e57255234 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -19,3 +19,42 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, return 0; } EXPORT_SYMBOL(smp_call_function_single); + +/* + * Note we still need to test the mask even for UP + * because we actually can get an empty mask from + * code that on SMP might call us without the local + * CPU in the mask. 
+ */ +void on_each_cpu_mask(const struct cpumask *mask, + smp_call_func_t func, void *info, bool wait) +{ + unsigned long flags; + + if (cpumask_test_cpu(0, mask)) { + local_irq_save(flags); + func(info); + local_irq_restore(flags); + } +} +EXPORT_SYMBOL(on_each_cpu_mask); + +/* + * Preemption is disabled here to make sure the cond_func is called under the + * same condtions in UP and SMP. + */ +void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), + smp_call_func_t func, void *info, bool wait, + gfp_t gfp_flags) +{ + unsigned long flags; + + preempt_disable(); + if (cond_func(0, info)) { + local_irq_save(flags); + func(info); + local_irq_restore(flags); + } + preempt_enable(); +} +EXPORT_SYMBOL(on_each_cpu_cond); From 081192b25c2d4620b5f5838620624d3daee94b66 Mon Sep 17 00:00:00 2001 From: David Daney Date: Wed, 11 Sep 2013 14:23:25 -0700 Subject: [PATCH 157/303] up.c: use local_irq_{save,restore}() in smp_call_function_single. The SMP version of this function doesn't unconditionally enable irqs, so neither should this !SMP version. There are no know problems caused by this, but we make the change for consistency's sake. Signed-off-by: David Daney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/up.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/up.c b/kernel/up.c index 144e57255234..b1cf036255f3 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -10,11 +10,13 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, int wait) { + unsigned long flags; + WARN_ON(cpu != 0); - local_irq_disable(); - (func)(info); - local_irq_enable(); + local_irq_save(flags); + func(info); + local_irq_restore(flags); return 0; } From bff2dc42bcafdd75c0296987747f782965d691a0 Mon Sep 17 00:00:00 2001 From: David Daney Date: Wed, 11 Sep 2013 14:23:26 -0700 Subject: [PATCH 158/303] smp.h: move !SMP version of on_each_cpu() out-of-line All of the other non-trivial !SMP versions of functions in smp.h are out-of-line in up.c. Move on_each_cpu() there as well. This allows us to get rid of the #include . The drawback is that this makes both the x86_64 and i386 defconfig !SMP kernels about 200 bytes larger each. Signed-off-by: David Daney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/smp.h | 21 +++++---------------- kernel/up.c | 11 +++++++++++ 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/include/linux/smp.h b/include/linux/smp.h index 3724a9070907..cfb7ca094b38 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -11,7 +11,6 @@ #include #include #include -#include extern void cpu_idle(void); @@ -29,6 +28,11 @@ extern unsigned int total_cpus; int smp_call_function_single(int cpuid, smp_call_func_t func, void *info, int wait); +/* + * Call a function on all processors + */ +int on_each_cpu(smp_call_func_t func, void *info, int wait); + /* * Call a function on processors specified by mask, which might include * the local one. @@ -111,11 +115,6 @@ void generic_smp_call_function_single_interrupt(void); static inline void call_function_init(void) { } #endif -/* - * Call a function on all processors - */ -int on_each_cpu(smp_call_func_t func, void *info, int wait); - /* * Mark the boot cpu "online" so that it can call console drivers in * printk() and can access its per-cpu storage. 
@@ -141,16 +140,6 @@ static inline int up_smp_call_function(smp_call_func_t func, void *info) #define smp_call_function(func, info, wait) \ (up_smp_call_function(func, info)) -static inline int on_each_cpu(smp_call_func_t func, void *info, int wait) -{ - unsigned long flags; - - local_irq_save(flags); - func(info); - local_irq_restore(flags); - return 0; -} - static inline void smp_send_reschedule(int cpu) { } #define smp_prepare_boot_cpu() do {} while (0) #define smp_call_function_many(mask, func, info, wait) \ diff --git a/kernel/up.c b/kernel/up.c index b1cf036255f3..630d72bf7e41 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -22,6 +22,17 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, } EXPORT_SYMBOL(smp_call_function_single); +int on_each_cpu(smp_call_func_t func, void *info, int wait) +{ + unsigned long flags; + + local_irq_save(flags); + func(info); + local_irq_restore(flags); + return 0; +} +EXPORT_SYMBOL(on_each_cpu); + /* * Note we still need to test the mask even for UP * because we actually can get an empty mask from From e656a634118285142063527b2cd40c749036de82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 11 Sep 2013 14:23:27 -0700 Subject: [PATCH 159/303] extable: skip sorting if the table is empty MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At least on ARM no-MMU the extable is empty and so there is nothing to sort. So add a check for the table to be empty which effectively only changes that the misleading pr_notice is suppressed. Signed-off-by: Uwe Kleine-König Cc: Ingo Molnar Cc: David Daney Cc: "H. Peter Anvin" Cc: Borislav Petkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/extable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/extable.c b/kernel/extable.c index 67460b93b1a1..832cb28105bb 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -41,7 +41,7 @@ u32 __initdata main_extable_sort_needed = 1; /* Sort the kernel's built-in exception table */ void __init sort_main_extable(void) { - if (main_extable_sort_needed) { + if (main_extable_sort_needed && __stop___ex_table > __start___ex_table) { pr_notice("Sorting __ex_table...\n"); sort_extable(__start___ex_table, __stop___ex_table); } From f9597f24c089dcbddbd2d9e99fbf00df57fb70c6 Mon Sep 17 00:00:00 2001 From: Sergei Trofimovich Date: Wed, 11 Sep 2013 14:23:28 -0700 Subject: [PATCH 160/303] syscalls.h: add forward declarations for inplace syscall wrappers Unclutter -Wmissing-prototypes warning types (enabled at make W=1) linux/include/linux/syscalls.h:190:18: warning: no previous prototype for 'SyS_semctl' [-Wmissing-prototypes] asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ ^ linux/include/linux/syscalls.h:183:2: note: in expansion of macro '__SYSCALL_DEFINEx' __SYSCALL_DEFINEx(x, sname, __VA_ARGS__) ^ by adding forward declarations right before definitions. Signed-off-by: Sergei Trofimovich Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compat.h | 1 + include/linux/syscalls.h | 1 + 2 files changed, 2 insertions(+) diff --git a/include/linux/compat.h b/include/linux/compat.h index ec1aee4aec9c..345da00a86e0 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -43,6 +43,7 @@ #define COMPAT_SYSCALL_DEFINEx(x, name, ...) 
\ asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ static inline long C_SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ + asmlinkage long compat_SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));\ asmlinkage long compat_SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))\ { \ return C_SYSC##name(__MAP(x,__SC_DELOUSE,__VA_ARGS__)); \ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 84662ecc7b51..7fac04e7ff6e 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -186,6 +186,7 @@ extern struct trace_event_functions exit_syscall_print_funcs; #define __SYSCALL_DEFINEx(x, name, ...) \ asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ + asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ { \ long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \ From 202da400570d991bacda4a06e878cb901e96a783 Mon Sep 17 00:00:00 2001 From: David Daney Date: Wed, 11 Sep 2013 14:23:29 -0700 Subject: [PATCH 161/303] kernel/smp.c: quit unconditionally enabling irqs in on_each_cpu_mask(). As in commit f21afc25f9ed ("smp.h: Use local_irq_{save,restore}() in !SMP version of on_each_cpu()"), we don't want to enable irqs if they are not already enabled. I don't know of any bugs currently caused by this unconditional local_irq_enable(), but I want to use this function in MIPS/OCTEON early boot (when we have early_boot_irqs_disabled). This also makes this function have similar semantics to on_each_cpu() which is good in itself. Signed-off-by: David Daney Cc: Gilad Ben-Yossef Cc: Christoph Lameter Cc: Chris Metcalf Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smp.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/smp.c b/kernel/smp.c index 3bb6ae533cdf..0564571dcdf7 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -575,8 +575,10 @@ EXPORT_SYMBOL(on_each_cpu); * * If @wait is true, then returns once @func has returned. * - * You must not call this function with disabled interrupts or - * from a hardware interrupt handler or from a bottom half handler. + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. The + * exception is that it may be used during early boot while + * early_boot_irqs_disabled is set. */ void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait) @@ -585,9 +587,10 @@ void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, smp_call_function_many(mask, func, info, wait); if (cpumask_test_cpu(cpu, mask)) { - local_irq_disable(); + unsigned long flags; + local_irq_save(flags); func(info); - local_irq_enable(); + local_irq_restore(flags); } put_cpu(); } From 205e550a0fb469ae73f91a903f27f4f63e774037 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:23:30 -0700 Subject: [PATCH 162/303] task_work: minor cleanups Trivial. Remove the unnecessary "work = NULL" initialization and turn read_barrier_depends() into smp_read_barrier_depends() in task_work_cancel(). 
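For reference, a hedged usage sketch of the task_work interface being
cleaned up here (names are illustrative; the callback runs when the
target task next returns to user mode, or when it exits):

static void my_twork_func(struct callback_head *head)
{
	/* executes in the context of the target task */
}

static int queue_my_twork(struct task_struct *task,
			  struct callback_head *twork)
{
	init_task_work(twork, my_twork_func);
	return task_work_add(task, twork, true);  /* -ESRCH if exiting */
}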
Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/task_work.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/task_work.c b/kernel/task_work.c index 65bd3c92d6f3..6ee09856f725 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -25,7 +25,7 @@ struct callback_head * task_work_cancel(struct task_struct *task, task_work_func_t func) { struct callback_head **pprev = &task->task_works; - struct callback_head *work = NULL; + struct callback_head *work; unsigned long flags; /* * If cmpxchg() fails we continue without updating pprev. @@ -35,7 +35,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func) */ raw_spin_lock_irqsave(&task->pi_lock, flags); while ((work = ACCESS_ONCE(*pprev))) { - read_barrier_depends(); + smp_read_barrier_depends(); if (work->func != func) pprev = &work->next; else if (cmpxchg(pprev, work, work->next) == work) From 892f6668f3a7088c7e30049c3d8e1844531602dc Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:23:31 -0700 Subject: [PATCH 163/303] task_work: documentation No functional changes, just comments. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/task_work.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/kernel/task_work.c b/kernel/task_work.c index 6ee09856f725..8727032e3a6f 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -4,6 +4,23 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ +/** + * task_work_add - ask the @task to execute @work->func() + * @task: the task which should run the callback + * @work: the callback to run + * @notify: send the notification if true + * + * Queue @work for task_work_run() below and notify the @task if @notify. + * Fails if the @task is exiting/exited and thus it can't process this @work. + * Otherwise @work->func() will be called when the @task returns from kernel + * mode or exits. + * + * This is like the signal handler which runs in kernel mode, but it doesn't + * try to wake up the @task. + * + * RETURNS: + * 0 if succeeds or -ESRCH. + */ int task_work_add(struct task_struct *task, struct callback_head *work, bool notify) { @@ -21,6 +38,17 @@ task_work_add(struct task_struct *task, struct callback_head *work, bool notify) return 0; } +/** + * task_work_cancel - cancel a pending work added by task_work_add() + * @task: the task which should execute the work + * @func: identifies the work to remove + * + * Find the last queued pending work with ->func == @func and remove + * it from queue. + * + * RETURNS: + * The found work or NULL if not found. + */ struct callback_head * task_work_cancel(struct task_struct *task, task_work_func_t func) { @@ -46,6 +74,14 @@ task_work_cancel(struct task_struct *task, task_work_func_t func) return work; } +/** + * task_work_run - execute the works added by task_work_add() + * + * Flush the pending works. Should be used by the core kernel code. + * Called before the task returns to the user-mode or stops, or when + * it exits. In the latter case task_work_add() can no longer add the + * new work after task_work_run() returns. 
+ */ void task_work_run(void) { struct task_struct *task = current; From 9cedc3d51f30a7695264d0c5629b584b45e3938d Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:32 -0700 Subject: [PATCH 164/303] MAINTAINERS: EXYNOS: remove board files Commit ca9143501c30 ("ARM: EXYNOS: Remove unused board files") removed the files, remove the patterns too. Signed-off-by: Joe Perches Cc: Tomasz Figa Acked-by: Kyungmin Park Cc: Kukjin Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 -- 1 file changed, 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index d721af119ff9..c281e0f7cf80 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1179,8 +1179,6 @@ L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: arch/arm/mach-s5pv210/mach-aquila.c F: arch/arm/mach-s5pv210/mach-goni.c -F: arch/arm/mach-exynos/mach-universal_c210.c -F: arch/arm/mach-exynos/mach-nuri.c ARM/SAMSUNG S5P SERIES 2D GRAPHICS ACCELERATION (G2D) SUPPORT M: Kyungmin Park From 34b273ecb60c48eb8d9da129696ed216e60071a8 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:33 -0700 Subject: [PATCH 165/303] MAINTAINERS: ARM: OMAP2/3: remove unused clockdomain files Commit 4bd5259e53ac ("ARM: OMAP2/3: clockdomain/PRM/CM: move the low-level clockdomain functions into PRM/CM") deleted the files, update the pattern. Identical to a patch earlier sent by Cesar Eduardo Barros. Signed-off-by: Joe Perches Cc: Paul Walmsley Cc: Rajendra Nayak Cc: Santosh Shilimkar Cc: Cesar Eduardo Barros Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 -- 1 file changed, 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index c281e0f7cf80..930182bf3576 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5968,8 +5968,6 @@ L: linux-omap@vger.kernel.org S: Maintained F: arch/arm/mach-omap2/powerdomain2xxx_3xxx.c F: arch/arm/mach-omap2/powerdomain44xx.c -F: arch/arm/mach-omap2/clockdomain2xxx_3xxx.c -F: arch/arm/mach-omap2/clockdomain44xx.c OMAP AUDIO SUPPORT M: Peter Ujfalusi From d21db568d78cedc059d4f7a706a8502e688341e7 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:35 -0700 Subject: [PATCH 166/303] MAINTAINERS: OMAP POWERDOMAIN, update patterns Commit 498153995b9f ("ARM: OMAP2+: powerdomain/PRM: move the low-level powerdomain") renamed the files, update the patterns. Identical to a patch earlier sent by Cesar Eduardo Barros. Signed-off-by: Joe Perches Cc: Paul Walmsley Cc: Rajendra Nayak Cc: Santosh Shilimkar Cc: Cesar Eduardo Barros Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 930182bf3576..440d5b112377 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5961,13 +5961,12 @@ S: Maintained F: arch/arm/*omap*/*pm* F: drivers/cpufreq/omap-cpufreq.c -OMAP POWERDOMAIN/CLOCKDOMAIN SOC ADAPTATION LAYER SUPPORT +OMAP POWERDOMAIN SOC ADAPTATION LAYER SUPPORT M: Rajendra Nayak M: Paul Walmsley L: linux-omap@vger.kernel.org S: Maintained -F: arch/arm/mach-omap2/powerdomain2xxx_3xxx.c -F: arch/arm/mach-omap2/powerdomain44xx.c +F: arch/arm/mach-omap2/prm* OMAP AUDIO SUPPORT M: Peter Ujfalusi From 15dba38737af134ddcac3270448c5e15eff2e323 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:36 -0700 Subject: [PATCH 167/303] MAINTAINERS: ARM: S3C2410: update patterns Commit 85fd6d63bf29 ("ARM: S3C2410: move mach-s3c2410/* into mach-s3c24xx/") moved the files, update the patterns. 
Signed-off-by: Joe Perches Acked-by: Kukjin Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 440d5b112377..342fe744ac96 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7540,9 +7540,9 @@ P: Vincent Sanders M: Simtec Linux Team W: http://www.simtec.co.uk/products/EB2410ITX/ S: Supported -F: arch/arm/mach-s3c2410/mach-bast.c -F: arch/arm/mach-s3c2410/bast-ide.c -F: arch/arm/mach-s3c2410/bast-irq.c +F: arch/arm/mach-s3c24xx/mach-bast.c +F: arch/arm/mach-s3c24xx/bast-ide.c +F: arch/arm/mach-s3c24xx/bast-irq.c TI DAVINCI MACHINE SUPPORT M: Sekhar Nori From 281e192f5e4db2a0fefcc04de2fabaa7a16294c8 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:37 -0700 Subject: [PATCH 168/303] MAINTAINERS: ARM: spear: consolidate sections Commit a7ed099ffc8e ("ARM: spear: move all files to mach-spear") moved all the files into a single directory, delete the now unnecessary duplicate sections and update the pattern. Signed-off-by: Joe Perches Cc: Arnd Bergmann Acked-by: Viresh Kumar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 342fe744ac96..3f83a6de463e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7815,35 +7815,7 @@ L: spear-devel@list.st.com L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) W: http://www.st.com/spear S: Maintained -F: arch/arm/plat-spear/ - -SPEAR13XX MACHINE SUPPORT -M: Viresh Kumar -M: Shiraz Hashim -L: spear-devel@list.st.com -L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -W: http://www.st.com/spear -S: Maintained -F: arch/arm/mach-spear13xx/ - -SPEAR3XX MACHINE SUPPORT -M: Viresh Kumar -M: Shiraz Hashim -L: spear-devel@list.st.com -L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -W: http://www.st.com/spear -S: Maintained -F: arch/arm/mach-spear3xx/ - -SPEAR6XX MACHINE SUPPORT -M: Rajeev Kumar -M: Shiraz Hashim -M: Viresh Kumar -L: spear-devel@list.st.com -L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -W: http://www.st.com/spear -S: Maintained -F: arch/arm/mach-spear6xx/ +F: arch/arm/mach-spear/ SPEAR CLOCK FRAMEWORK SUPPORT M: Viresh Kumar From 5173413a15775d65982ab8a5f8e905a2aa529a87 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:38 -0700 Subject: [PATCH 169/303] MAINTAINERS: ARM: plat-nomadik: update patterns Commit 694e33a7f42d ("ARM: plat-nomadik: move MTU, kill plat-nomadik") moved the files, update the patterns. 
Signed-off-by: Joe Perches Reviewed-by: Linus Walleij Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 3f83a6de463e..f1d176cd68e0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1048,7 +1048,6 @@ M: STEricsson L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: arch/arm/mach-nomadik/ -F: arch/arm/plat-nomadik/ F: drivers/i2c/busses/i2c-nomadik.c T: git git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-nomadik.git From c54ec9d3698b006fad5e942ae169fb5f079ccfc4 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:39 -0700 Subject: [PATCH 170/303] MAINTAINERS: ARM: S3C24XX: remove plat-s3c24xx Commit 09ec1d7ea67f ("ARM: S3C24XX: Remove plat-s3c24xx directory in arch/arm/") moved the files, remove the pattern. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index f1d176cd68e0..064daa840244 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1155,7 +1155,6 @@ L: linux-samsung-soc@vger.kernel.org (moderated for non-subscribers) W: http://www.fluff.org/ben/linux/ S: Maintained F: arch/arm/plat-samsung/ -F: arch/arm/plat-s3c24xx/ F: arch/arm/mach-s3c24*/ F: arch/arm/mach-s3c64xx/ F: drivers/*/*s3c2410* From 2caa67a652c95c0175b7fb645e2924dad726b1ba Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:40 -0700 Subject: [PATCH 171/303] MAINTAINERS: ghes_edac: update pattern Commit 77c5f5d2f212 ("ghes_edac: Register at EDAC core the BIOS report") typoed the file pattern. Fix it. Signed-off-by: Joe Perches Cc: Mauro Carvalho Chehab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 064daa840244..141a0710c3ff 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3034,7 +3034,7 @@ M: Mauro Carvalho Chehab L: linux-edac@vger.kernel.org W: bluesmoke.sourceforge.net S: Maintained -F: drivers/edac/ghes-edac.c +F: drivers/edac/ghes_edac.c EDAC-I82443BXGX M: Tim Small From 81a66488596f81d412c0e391a081f8b6778cdc9c Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:41 -0700 Subject: [PATCH 172/303] MAINTAINERS: update SIANO drivers Commit 786baecfe78f ("[media] dvb-usb: move it to drivers/media/usb/dvb-usb") moved the files, update the pattern. Signed-off-by: Joe Perches Cc: Mauro Carvalho Chehab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 141a0710c3ff..082272821476 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7498,7 +7498,7 @@ W: http://linuxtv.org T: git git://linuxtv.org/media_tree.git S: Odd fixes F: drivers/media/common/siano/ -F: drivers/media/dvb/siano/ +F: drivers/media/usb/siano/ F: drivers/media/usb/siano/ F: drivers/media/mmc/siano From 559cdc828b51cace87bd297f5b92a9e8da9e1ba4 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:42 -0700 Subject: [PATCH 173/303] MAINTAINERS: SI4713: fix file pattern Commit c937ca034a03 ("[media] MAINTAINERS: Add maintainer entry for si4713 FM transmitter driver") typoed the pattern, fix it. 
Signed-off-by: Joe Perches Acked-by: Eduardo Valentin Cc: Mauro Carvalho Chehab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 082272821476..4835a9bdf95d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7489,7 +7489,7 @@ L: linux-media@vger.kernel.org T: git git://linuxtv.org/media_tree.git W: http://linuxtv.org S: Odd Fixes -F: drivers/media/radio/radio-si4713.h +F: drivers/media/radio/radio-si4713.c SIANO DVB DRIVER M: Mauro Carvalho Chehab From 9d9fb74499c3ae468691d1118d3169da8a5b3857 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:43 -0700 Subject: [PATCH 174/303] MAINTAINERS: update it913x patterns Commit d7104bffcfb7 ("[media] MAINTAINERS: add drivers/media/tuners/it913x*") used the incorrect file patterns. Fix it. Signed-off-by: Joe Perches Acked-by: Antti Palosaari Cc: Mauro Carvalho Chehab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 4835a9bdf95d..198a228caaaa 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4623,7 +4623,7 @@ W: http://palosaari.fi/linux/ Q: http://patchwork.linuxtv.org/project/linux-media/list/ T: git git://linuxtv.org/anttip/media_tree.git S: Maintained -F: drivers/media/tuners/it913x* +F: drivers/media/tuners/tuner_it913x* IVTV VIDEO4LINUX DRIVER M: Andy Walls From 31a12b317804720e75bf68accc8f12fac24fcf64 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:44 -0700 Subject: [PATCH 175/303] MAINTAINERS: update ssbi patterns Commit 45fcac1aad5d ("mfd: Move ssbi driver into drivers/mfd") moved the files, update the patterns. Signed-off-by: Joe Perches Cc: Arnd Bergmann Cc: Samuel Ortiz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 198a228caaaa..5804efdea32a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1069,7 +1069,7 @@ F: drivers/mmc/host/msm_sdcc.h F: drivers/tty/serial/msm_serial.h F: drivers/tty/serial/msm_serial.c F: drivers/*/pm8???-* -F: drivers/ssbi/ +F: drivers/mfd/ssbi/ F: include/linux/mfd/pm8xxx/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/davidb/linux-msm.git S: Maintained From c6a0fe4a08c5190c8e36547526529a21c3597149 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:45 -0700 Subject: [PATCH 176/303] MAINTAINERS: update file pattern for ARC uart Commit 6659a20a76e0 ("ARC: MAINTAINERS update for ARC") typoed the file pattern. Fix it. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 5804efdea32a..a26049f63e7f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8082,7 +8082,7 @@ M: Vineet Gupta S: Supported F: arch/arc/ F: Documentation/devicetree/bindings/arc/ -F: drivers/tty/serial/arc-uart.c +F: drivers/tty/serial/arc_uart.c SYSV FILESYSTEM M: Christoph Hellwig From 4f31102bbba356b907dc4ab2ddaf243a132b9ffd Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:46 -0700 Subject: [PATCH 177/303] MAINTAINERS: update USB EHCI platform pattern Commit f3bc64d6d1f2 ("USB: EHCI: DT support for generic bus glue") removed the ehci-vt8500.c file, update the file pattern to include ehci-platform.c.
Signed-off-by: Joe Perches Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index a26049f63e7f..0a8418219c61 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1321,7 +1321,7 @@ F: drivers/mmc/host/wmt-sdmmc.c F: drivers/pwm/pwm-vt8500.c F: drivers/rtc/rtc-vt8500.c F: drivers/tty/serial/vt8500_serial.c -F: drivers/usb/host/ehci-vt8500.c +F: drivers/usb/host/ehci-platform.c F: drivers/usb/host/uhci-platform.c F: drivers/video/vt8500lcdfb.* F: drivers/video/wm8505fb* From 89f55bd74de13b45a239b2088bfcd803b0cf53c3 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:47 -0700 Subject: [PATCH 178/303] MAINTAINERS: usb: phy: update patterns Commit a0e631235a04 ("usb: phy: move all PHY drivers to drivers/usb/phy/") deleted the files, remove the file pattern. Signed-off-by: Joe Perches Acked-by: Felipe Balbi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 0a8418219c61..5221b953ee19 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8772,7 +8772,6 @@ L: linux-usb@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/balbi/usb.git S: Maintained F: drivers/usb/phy/ -F: drivers/usb/otg/ USB PRINTER DRIVER (usblp) M: Pete Zaitcev From 11c26770eb0296956a6b17595e26f3a8eab3677a Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:48 -0700 Subject: [PATCH 179/303] MAINTAINERS: update GRE DEMUX patterns Commit c50cd357887a ("net: gre: move GSO functions to gre_offload") renamed and separated the file into multiple files. Update the patterns. Signed-off-by: Joe Perches Cc: Dmitry Kozlov Cc: Daniel Borkmann Cc: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 5221b953ee19..516777f7d6af 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3683,7 +3683,8 @@ GRE DEMULTIPLEXER DRIVER M: Dmitry Kozlov L: netdev@vger.kernel.org S: Maintained -F: net/ipv4/gre.c +F: net/ipv4/gre_demux.c +F: net/ipv4/gre_offload.c F: include/net/gre.h GRETH 10/100/1G Ethernet MAC device driver From af4b8e371b0f692a89f37d7ae5b23e8804c81a17 Mon Sep 17 00:00:00 2001 From: Christian Daudt Date: Wed, 11 Sep 2013 14:23:49 -0700 Subject: [PATCH 180/303] MAINTAINERS: add mach-bcm and drivers Add ownership to maintainers file for the mach-bcm related files, including drivers that are used for the SoCs defined in mach-bcm. 
Signed-off-by: Christian Daudt Cc: Olof Johansson Cc: Arnd Bergmann Cc: Stephen Warren Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 516777f7d6af..726d22258697 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1811,6 +1811,17 @@ L: netdev@vger.kernel.org S: Supported F: drivers/net/ethernet/broadcom/bnx2x/ +BROADCOM BCM281XX/BCM11XXX ARM ARCHITECTURE +M: Christian Daudt +T: git git://git.github.com/broadcom/bcm11351 +S: Maintained +F: arch/arm/mach-bcm/ +F: arch/arm/boot/dts/bcm113* +F: arch/arm/boot/dts/bcm281* +F: arch/arm/configs/bcm_defconfig +F: drivers/mmc/host/sdhci_bcm_kona.c +F: drivers/clocksource/bcm_kona_timer.c + BROADCOM BCM2835 ARM ARCHICTURE M: Stephen Warren L: linux-rpi-kernel@lists.infradead.org (moderated for non-subscribers) From 144308139cdce95e9c6cff3cc0ef12242181665f Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:50 -0700 Subject: [PATCH 181/303] MAINTAINERS: append "/" to directory patterns It's clearer to have patterns marked as directories. Change the directory patterns without terminating slashes. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 726d22258697..f8c41ae6c9a4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1028,7 +1028,7 @@ F: arch/arm/mach-orion5x/ts78xx-* ARM/MICREL KS8695 ARCHITECTURE M: Greg Ungerer L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -F: arch/arm/mach-ks8695 +F: arch/arm/mach-ks8695/ S: Odd Fixes ARM/MIOA701 MACHINE SUPPORT @@ -2042,10 +2042,10 @@ W: http://ceph.com/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git S: Supported F: Documentation/filesystems/ceph.txt -F: fs/ceph -F: net/ceph -F: include/linux/ceph -F: include/linux/crush +F: fs/ceph/ +F: net/ceph/ +F: include/linux/ceph/ +F: include/linux/crush/ CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM: L: linux-usb@vger.kernel.org @@ -2342,7 +2342,7 @@ CPU POWER MONITORING SUBSYSTEM M: Dominik Brodowski M: Thomas Renninger S: Maintained -F: tools/power/cpupower +F: tools/power/cpupower/ CPUSETS M: Li Zefan @@ -2780,7 +2780,7 @@ L: intel-gfx@lists.freedesktop.org L: dri-devel@lists.freedesktop.org T: git git://people.freedesktop.org/~danvet/drm-intel S: Supported -F: drivers/gpu/drm/i915 +F: drivers/gpu/drm/i915/ F: include/drm/i915* F: include/uapi/drm/i915* @@ -2792,7 +2792,7 @@ M: Kyungmin Park L: dri-devel@lists.freedesktop.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/daeinki/drm-exynos.git S: Supported -F: drivers/gpu/drm/exynos +F: drivers/gpu/drm/exynos/ F: include/drm/exynos* F: include/uapi/drm/exynos* @@ -3651,8 +3651,8 @@ M: Arnd Bergmann L: linux-arch@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/arnd/asm-generic.git S: Maintained -F: include/asm-generic -F: include/uapi/asm-generic +F: include/asm-generic/ +F: include/uapi/asm-generic/ GENERIC UIO DRIVER FOR PCI DEVICES M: "Michael S. 
Tsirkin" @@ -3773,7 +3773,7 @@ L: linux-media@vger.kernel.org T: git git://linuxtv.org/media_tree.git W: http://linuxtv.org S: Odd Fixes -F: drivers/media/usb/hdpvr +F: drivers/media/usb/hdpvr/ HWPOISON MEMORY FAILURE HANDLING M: Andi Kleen @@ -4581,7 +4581,7 @@ S: Supported W: http://www.openfabrics.org W: www.open-iscsi.org Q: http://patchwork.kernel.org/project/linux-rdma/list/ -F: drivers/infiniband/ulp/iser +F: drivers/infiniband/ulp/iser/ ISDN SUBSYSTEM M: Karsten Keil @@ -6142,7 +6142,7 @@ W: http://openrisc.net L: linux@lists.openrisc.net (moderated for non-subscribers) S: Maintained T: git git://openrisc.net/~jonas/linux -F: arch/openrisc +F: arch/openrisc/ OPENVSWITCH M: Jesse Gross @@ -6433,7 +6433,7 @@ M: Jamie Iles L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) T: git git://github.com/jamieiles/linux-2.6-ji.git S: Supported -F: arch/arm/mach-picoxcell +F: arch/arm/mach-picoxcell/ F: drivers/*/picoxcell* F: drivers/*/*/picoxcell* @@ -6706,7 +6706,7 @@ F: drivers/spi/spi-pxa2xx* F: drivers/usb/gadget/pxa2* F: include/sound/pxa2xx-lib.h F: sound/arm/pxa* -F: sound/soc/pxa +F: sound/soc/pxa/ MMP SUPPORT M: Eric Miao @@ -7159,7 +7159,7 @@ SAMSUNG AUDIO (ASoC) DRIVERS M: Sangbeom Kim L: alsa-devel@alsa-project.org (moderated for non-subscribers) S: Supported -F: sound/soc/samsung +F: sound/soc/samsung/ SAMSUNG FRAMEBUFFER DRIVER M: Jingoo Han @@ -7205,7 +7205,7 @@ SERIAL DRIVERS M: Greg Kroah-Hartman L: linux-serial@vger.kernel.org S: Maintained -F: drivers/tty/serial +F: drivers/tty/serial/ SYNOPSYS DESIGNWARE DMAC DRIVER M: Viresh Kumar @@ -7240,7 +7240,7 @@ TLG2300 VIDEO4LINUX-2 DRIVER M: Huang Shijie M: Hans Verkuil S: Odd Fixes -F: drivers/media/usb/tlg2300 +F: drivers/media/usb/tlg2300/ SC1200 WDT DRIVER M: Zwane Mwaikambo @@ -7512,7 +7512,7 @@ S: Odd fixes F: drivers/media/common/siano/ F: drivers/media/usb/siano/ F: drivers/media/usb/siano/ -F: drivers/media/mmc/siano +F: drivers/media/mmc/siano/ SH_VEU V4L2 MEM2MEM DRIVER M: Guennadi Liakhovetski @@ -7561,7 +7561,7 @@ L: davinci-linux-open-source@linux.davincidsp.com (moderated for non-subscribers T: git git://gitorious.org/linux-davinci/linux-davinci.git Q: http://patchwork.kernel.org/project/linux-davinci/list/ S: Supported -F: arch/arm/mach-davinci +F: arch/arm/mach-davinci/ F: drivers/i2c/busses/i2c-davinci.c TI DAVINCI SERIES MEDIA DRIVER @@ -7646,7 +7646,7 @@ SMIA AND SMIA++ IMAGE SENSOR DRIVER M: Sakari Ailus L: linux-media@vger.kernel.org S: Maintained -F: drivers/media/i2c/smiapp +F: drivers/media/i2c/smiapp/ F: include/media/smiapp.h F: drivers/media/i2c/smiapp-pll.c F: drivers/media/i2c/smiapp-pll.h @@ -9314,7 +9314,7 @@ M: Matthew Garrett L: platform-driver-x86@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/mjg59/platform-drivers-x86.git S: Maintained -F: drivers/platform/x86 +F: drivers/platform/x86/ X86 MCE INFRASTRUCTURE M: Tony Luck From 5ab58acc40ead5521cbafcd1cc7f35170ccceeee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emilio=20L=C3=B3pez?= Date: Wed, 11 Sep 2013 14:23:51 -0700 Subject: [PATCH 182/303] lib/genalloc.c: correct dev_get_gen_pool documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The documentation mentions a "name" parameter, which does not exist. This commit removes such mention from the function documentation. 
Signed-off-by: Emilio López Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/genalloc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/genalloc.c b/lib/genalloc.c index c522facfa3e5..26cf20be72b7 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -524,7 +524,6 @@ struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order, /** * dev_get_gen_pool - Obtain the gen_pool (if any) for a device * @dev: device to retrieve the gen_pool from - * @name: Optional name for the gen_pool, usually NULL * * Returns the gen_pool for the device if one is present, or NULL. */ From f2e1d2ac344bcee83f6496a641d37e86d55e33d4 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Wed, 11 Sep 2013 14:23:52 -0700 Subject: [PATCH 183/303] lib/crc32: update the comments of crc32_{be,le}_generic() [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Gu Zheng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/crc32.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index 072fbd8234d5..410093dbe51c 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -131,11 +131,14 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) #endif /** - * crc32_le() - Calculate bitwise little-endian Ethernet AUTODIN II CRC32 - * @crc: seed value for computation. ~0 for Ethernet, sometimes 0 for - * other uses, or the previous crc32 value if computing incrementally. - * @p: pointer to buffer over which CRC is run + * crc32_le_generic() - Calculate bitwise little-endian Ethernet AUTODIN II + * CRC32/CRC32C + * @crc: seed value for computation. ~0 for Ethernet, sometimes 0 for other + * uses, or the previous crc32/crc32c value if computing incrementally. + * @p: pointer to buffer over which CRC32/CRC32C is run * @len: length of buffer @p + * @tab: little-endian Ethernet table + * @polynomial: CRC32/CRC32c LE polynomial */ static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p, size_t len, const u32 (*tab)[256], @@ -201,11 +204,13 @@ EXPORT_SYMBOL(crc32_le); EXPORT_SYMBOL(__crc32c_le); /** - * crc32_be() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32 + * crc32_be_generic() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32 * @crc: seed value for computation. ~0 for Ethernet, sometimes 0 for * other uses, or the previous crc32 value if computing incrementally. - * @p: pointer to buffer over which CRC is run + * @p: pointer to buffer over which CRC32 is run * @len: length of buffer @p + * @tab: big-endian Ethernet table + * @polynomial: CRC32 BE polynomial */ static inline u32 __pure crc32_be_generic(u32 crc, unsigned char const *p, size_t len, const u32 (*tab)[256], From 1431574a1c4c669a0c198e4763627837416e4443 Mon Sep 17 00:00:00 2001 From: Alexandre Courbot Date: Wed, 11 Sep 2013 14:23:53 -0700 Subject: [PATCH 184/303] lib/decompressors: fix "no limit" output buffer length When decompressing into memory, the output buffer length is set to some arbitrarily high value (0x7fffffff) to indicate the output is, virtually, unlimited in size. The problem with this is that some platforms have their physical memory at high physical addresses (0x80000000 or more), and that the output buffer address and its "unlimited" length cannot be added without overflowing. An example of this can be found in inflate_fast(): /* next_out is the output buffer address */ out = strm->next_out - OFF; /* avail_out is the output buffer size. 
end will overflow if the output * address is >= 0x80000104 */ end = out + (strm->avail_out - 257); This has huge consequences on the performance of kernel decompression, since the following exit condition of inflate_fast() will be always true: } while (in < last && out < end); Indeed, "end" has overflowed and is now always lower than "out". As a result, inflate_fast() will return after processing one single byte of input data, and will thus need to be called an unreasonably high number of times. This probably went unnoticed because kernel decompression is fast enough even with this issue. Nonetheless, adjusting the output buffer length in such a way that the above pointer arithmetic never overflows results in a kernel decompression that is about 3 times faster on affected machines. Signed-off-by: Alexandre Courbot Tested-by: Jon Medhurst Cc: Stephen Warren Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/decompress_inflate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/decompress_inflate.c b/lib/decompress_inflate.c index 19ff89e34eec..d619b28c456f 100644 --- a/lib/decompress_inflate.c +++ b/lib/decompress_inflate.c @@ -48,7 +48,7 @@ STATIC int INIT gunzip(unsigned char *buf, int len, out_len = 0x8000; /* 32 K */ out_buf = malloc(out_len); } else { - out_len = 0x7fffffff; /* no limit */ + out_len = ((size_t)~0) - (size_t)out_buf; /* no limit */ } if (!out_buf) { error("Out of memory while allocating output buffer"); From d5e616fc1c1dd673c53b682877e2d35a2862263c Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:54 -0700 Subject: [PATCH 185/303] checkpatch: add a few more --fix corrections Suggest a few more single-line corrections. Remove DOS line endings Simplify removing trailing whitespace Remove global/static initializations to 0/NULL Convert pr_warning to pr_warn Add space after brace Convert binary constants to hex Remove whitespace after line continuation Use inline not __inline or __inline__ Use __printf and __scanf Use a single ; for statement terminations Convert __FUNCTION__ to __func__ Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 122 +++++++++++++++++++++++++++++------------- 1 file changed, 84 insertions(+), 38 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 2ee9eb750560..9163651edc50 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1845,15 +1845,17 @@ sub process { #trailing whitespace if ($line =~ /^\+.*\015/) { my $herevet = "$here\n" . cat_vet($rawline) . "\n"; - ERROR("DOS_LINE_ENDINGS", - "DOS line endings\n" . $herevet); - + if (ERROR("DOS_LINE_ENDINGS", + "DOS line endings\n" . $herevet) && + $fix) { + $fixed[$linenr - 1] =~ s/[\s\015]+$//; + } } elsif ($rawline =~ /^\+.*\S\s+$/ || $rawline =~ /^\+\s+$/) { my $herevet = "$here\n" . cat_vet($rawline) . "\n"; if (ERROR("TRAILING_WHITESPACE", "trailing whitespace\n" . $herevet) && $fix) { - $fixed[$linenr - 1] =~ s/^(\+.*?)\s+$/$1/; + $fixed[$linenr - 1] =~ s/\s+$//; } $rpt_cleaners = 1; @@ -2486,16 +2488,22 @@ sub process { } # check for global initialisers. - if ($line =~ /^.$Type\s*$Ident\s*(?:\s+$Modifier)*\s*=\s*(0|NULL|false)\s*;/) { - ERROR("GLOBAL_INITIALISERS", - "do not initialise globals to 0 or NULL\n" . - $herecurr); + if ($line =~ /^\+(\s*$Type\s*$Ident\s*(?:\s+$Modifier))*\s*=\s*(0|NULL|false)\s*;/) { + if (ERROR("GLOBAL_INITIALISERS", + "do not initialise globals to 0 or NULL\n" . 
+ $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ s/($Type\s*$Ident\s*(?:\s+$Modifier))*\s*=\s*(0|NULL|false)\s*;/$1;/; + } } # check for static initialisers. - if ($line =~ /\bstatic\s.*=\s*(0|NULL|false)\s*;/) { - ERROR("INITIALISED_STATIC", - "do not initialise statics to 0 or NULL\n" . - $herecurr); + if ($line =~ /^\+.*\bstatic\s.*=\s*(0|NULL|false)\s*;/) { + if (ERROR("INITIALISED_STATIC", + "do not initialise statics to 0 or NULL\n" . + $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ s/(\bstatic\s.*?)\s*=\s*(0|NULL|false)\s*;/$1;/; + } } # check for static const char * arrays. @@ -2638,8 +2646,12 @@ sub process { } if ($line =~ /\bpr_warning\s*\(/) { - WARN("PREFER_PR_LEVEL", - "Prefer pr_warn(... to pr_warning(...\n" . $herecurr); + if (WARN("PREFER_PR_LEVEL", + "Prefer pr_warn(... to pr_warning(...\n" . $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ + s/\bpr_warning\b/pr_warn/; + } } if ($line =~ /\bdev_printk\s*\(\s*KERN_([A-Z]+)/) { @@ -3031,8 +3043,7 @@ sub process { if (ERROR("SPACING", "space required before the open brace '{'\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ - s/^(\+.*(?:do|\))){/$1 {/; + $fixed[$linenr - 1] =~ s/^(\+.*(?:do|\))){/$1 {/; } } @@ -3047,8 +3058,12 @@ sub process { # closing brace should have a space following it when it has anything # on the line if ($line =~ /}(?!(?:,|;|\)))\S/) { - ERROR("SPACING", - "space required after that close brace '}'\n" . $herecurr); + if (ERROR("SPACING", + "space required after that close brace '}'\n" . $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ + s/}((?!(?:,|;|\)))\S)/} $1/; + } } # check spacing on square brackets @@ -3271,8 +3286,13 @@ sub process { #gcc binary extension if ($var =~ /^$Binary$/) { - WARN("GCC_BINARY_CONSTANT", - "Avoid gcc v4.3+ binary constant extension: <$var>\n" . $herecurr); + if (WARN("GCC_BINARY_CONSTANT", + "Avoid gcc v4.3+ binary constant extension: <$var>\n" . $herecurr) && + $fix) { + my $hexval = sprintf("0x%x", oct($var)); + $fixed[$linenr - 1] =~ + s/\b$var\b/$hexval/; + } } #CamelCase @@ -3292,9 +3312,12 @@ sub process { } #no spaces allowed after \ in define - if ($line=~/\#\s*define.*\\\s$/) { - WARN("WHITESPACE_AFTER_LINE_CONTINUATION", - "Whitepspace after \\ makes next lines useless\n" . $herecurr); + if ($line =~ /\#\s*define.*\\\s+$/) { + if (WARN("WHITESPACE_AFTER_LINE_CONTINUATION", + "Whitespace after \\ makes next lines useless\n" . $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ s/\s+$//; + } } #warn if is #included and is available (uses RAW line) @@ -3691,8 +3714,12 @@ sub process { # Check for __inline__ and __inline, prefer inline if ($line =~ /\b(__inline__|__inline)\b/) { - WARN("INLINE", - "plain inline is preferred over $1\n" . $herecurr); + if (WARN("INLINE", + "plain inline is preferred over $1\n" . $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ s/\b(__inline__|__inline)\b/inline/; + + } } # Check for __attribute__ packed, prefer __packed @@ -3709,14 +3736,21 @@ sub process { # Check for __attribute__ format(printf, prefer __printf if ($line =~ /\b__attribute__\s*\(\s*\(\s*format\s*\(\s*printf/) { - WARN("PREFER_PRINTF", - "__printf(string-index, first-to-check) is preferred over __attribute__((format(printf, string-index, first-to-check)))\n" . $herecurr); + if (WARN("PREFER_PRINTF", + "__printf(string-index, first-to-check) is preferred over __attribute__((format(printf, string-index, first-to-check)))\n" . $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ s/\b__attribute__\s*\(\s*\(\s*format\s*\(\s*printf\s*,\s*(.*)\)\s*\)\s*\)/"__printf(" . 
trim($1) . ")"/ex; + + } } # Check for __attribute__ format(scanf, prefer __scanf if ($line =~ /\b__attribute__\s*\(\s*\(\s*format\s*\(\s*scanf\b/) { - WARN("PREFER_SCANF", - "__scanf(string-index, first-to-check) is preferred over __attribute__((format(scanf, string-index, first-to-check)))\n" . $herecurr); + if (WARN("PREFER_SCANF", + "__scanf(string-index, first-to-check) is preferred over __attribute__((format(scanf, string-index, first-to-check)))\n" . $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ s/\b__attribute__\s*\(\s*\(\s*format\s*\(\s*scanf\s*,\s*(.*)\)\s*\)\s*\)/"__scanf(" . trim($1) . ")"/ex; + } } # check for sizeof(&) @@ -3727,8 +3761,11 @@ sub process { # check for sizeof without parenthesis if ($line =~ /\bsizeof\s+((?:\*\s*|)$Lval|$Type(?:\s+$Lval|))/) { - WARN("SIZEOF_PARENTHESIS", - "sizeof $1 should be sizeof($1)\n" . $herecurr); + if (WARN("SIZEOF_PARENTHESIS", + "sizeof $1 should be sizeof($1)\n" . $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ s/\bsizeof\s+((?:\*\s*|)$Lval|$Type(?:\s+$Lval|))/"sizeof(" . trim($1) . ")"/ex; + } } # check for line continuations in quoted strings with odd counts of " @@ -3747,8 +3784,11 @@ sub process { if ($line =~ /\bseq_printf\s*\(/) { my $fmt = get_quoted_string($line, $rawline); if ($fmt !~ /[^\\]\%/) { - WARN("PREFER_SEQ_PUTS", - "Prefer seq_puts to seq_printf\n" . $herecurr); + if (WARN("PREFER_SEQ_PUTS", + "Prefer seq_puts to seq_printf\n" . $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ s/\bseq_printf\b/seq_puts/; + } } } @@ -3879,8 +3919,11 @@ sub process { # check for multiple semicolons if ($line =~ /;\s*;\s*$/) { - WARN("ONE_SEMICOLON", - "Statements terminations use 1 semicolon\n" . $herecurr); + if (WARN("ONE_SEMICOLON", + "Statements terminations use 1 semicolon\n" . $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ s/(\s*;\s*){2,}$/;/g; + } } # check for switch/default statements without a break; @@ -3898,9 +3941,12 @@ sub process { } # check for gcc specific __FUNCTION__ - if ($line =~ /__FUNCTION__/) { - WARN("USE_FUNC", - "__func__ should be used instead of gcc specific __FUNCTION__\n" . $herecurr); + if ($line =~ /\b__FUNCTION__\b/) { + if (WARN("USE_FUNC", + "__func__ should be used instead of gcc specific __FUNCTION__\n" . $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ s/\b__FUNCTION__\b/__func__/g; + } } # check for use of yield() From 7e781f67df436b67753a65436c0fef0a0ebf5043 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:55 -0700 Subject: [PATCH 186/303] checkpatch: check CamelCase by word, not by $Lval $Lval is a test for complete name (ie: foo->bar.Baz[1]) If any of this is CamelCase, then the current test uses the entire $Lval. This isn't optimal because it can emit messages with foo->bar.Baz and bar.Baz when Baz is a variable specified in an include file. So instead, break the $Lval into words and check each word for CamelCase uses. 
Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 9163651edc50..6b409b0ad457 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3302,11 +3302,15 @@ sub process { $var !~ /^(?:Clear|Set|TestClear|TestSet|)Page[A-Z]/ && #Ignore SI style variants like nS, mV and dB (ie: max_uV, regulator_min_uA_show) $var !~ /^(?:[a-z_]*?)_?[a-z][A-Z](?:_[a-z_]+)?$/) { - seed_camelcase_includes() if ($check); - if (!defined $camelcase{$var}) { - $camelcase{$var} = 1; - CHK("CAMELCASE", - "Avoid CamelCase: <$var>\n" . $herecurr); + while ($var =~ m{($Ident)}g) { + my $word = $1; + next if ($word !~ /[A-Z][a-z]|[a-z][A-Z]/); + seed_camelcase_includes() if ($check); + if (!defined $camelcase{$word}) { + $camelcase{$word} = 1; + CHK("CAMELCASE", + "Avoid CamelCase: <$word>\n" . $herecurr); + } } } } From d62a201f24cba74e2fbf9f6f7af86ff5f5e276fc Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 11 Sep 2013 14:23:56 -0700 Subject: [PATCH 187/303] checkpatch: enforce sane perl version I got a bug report from a couple of users who said checkpatch.pl was broken for them. It was erroring out on fairly random lines most commonly with messages like: Nested quantifiers in regex; marked by <--HERE in m/(\((?:[^\(\)]++ <-- HERE |(?-1))*\))/ at ./checkpatch.pl line 340. The bug reporter was running a version of perl 5.8 which was end-of-lifed in 2008: http://www.cpan.org/src/. Versions of perl this old are at _best_ quite untested. At worst, they are crusty and known to be completely broken. If folks have a system _that_ old, then we should have mercy on them and give them a half-decent error message rather than fail with nutty error messages. This patch enforces that checkpatch.pl is run with perl 5.10, which was end-of-lifed in 2009. The new --ignore-perl-version command-line switch will let folks override this if they want. Signed-off-by: Dave Hansen Cc: Joe Perches Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 6b409b0ad457..c00e5108c0d2 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -37,6 +37,8 @@ my @ignore = (); my $help = 0; my $configuration_file = ".checkpatch.conf"; my $max_line_length = 80; +my $ignore_perl_version = 0; +my $minimum_perl_version = 5.10.0; sub help { my ($exitcode) = @_; @@ -71,6 +73,8 @@ Options: ".EXPERIMENTAL-checkpatch-fixes" with potential errors corrected to the preferred checkpatch style + --ignore-perl-version override checking of perl version. expect + runtime errors. -h, --help, --version display this help and exit When FILE is - read standard input. @@ -123,6 +127,7 @@ GetOptions( 'mailback!' => \$mailback, 'summary-file!' => \$summary_file, 'fix!' => \$fix, + 'ignore-perl-version!' 
=> \$ignore_perl_version, 'debug=s' => \%debug, 'test-only=s' => \$tst_only, 'h|help' => \$help, @@ -133,6 +138,13 @@ help(0) if ($help); my $exit = 0; +if ($^V && $^V lt $minimum_perl_version) { + printf "$P: requires at least perl version %vd\n", $minimum_perl_version; + if (!$ignore_perl_version) { + exit(1); + } +} + if ($#ARGV < 0) { print "$P: no input files\n"; exit(1); From 7e51f1979237e01bcd4e04e434c5da79151f08f8 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:57 -0700 Subject: [PATCH 188/303] checkpatch: check for duplicate signatures Emit a warning when a signature is used more than once. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index c00e5108c0d2..7c79c91662c8 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1544,6 +1544,7 @@ sub process { my %suppress_export; my $suppress_statement = 0; + my %signatures = (); # Pre-scan the patch sanitizing the lines. # Pre-scan the patch looking for any __setup documentation. @@ -1793,6 +1794,17 @@ sub process { "email address '$email' might be better as '$suggested_email$comment'\n" . $herecurr); } } + +# Check for duplicate signatures + my $sig_nospace = $line; + $sig_nospace =~ s/\s//g; + $sig_nospace = lc($sig_nospace); + if (defined $signatures{$sig_nospace}) { + WARN("BAD_SIGN_OFF", + "Duplicate signature\n" . $herecurr); + } else { + $signatures{$sig_nospace} = 1; + } } # Check for wrappage within a valid hunk of the file From 70dc8a48357ce630d8a76887a9a36f0d34c8caf2 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:58 -0700 Subject: [PATCH 189/303] checkpatch: warn when using extern with function prototypes in .h files Using the extern keyword on function prototypes is superfluous visual noise so suggest removing it. Using extern can cause unnecessary line wrapping at 80 columns and unnecessarily long multi-line function prototypes. Signed-off-by: Joe Perches Suggested-by: Hannes Frederic Sowa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 7c79c91662c8..e2cb1f4621b7 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3878,6 +3878,16 @@ sub process { } } +# check for new externs in .h files. + if ($realfile =~ /\.h$/ && + $line =~ /^\+\s*(extern\s+)$Type\s*$Ident\s*\(/s) { + if (WARN("AVOID_EXTERNS", + "extern prototypes should be avoided in .h files\n" . $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ s/(.*)\bextern\b\s*(.*)/$1$2/; + } + } + # check for new externs in .c files. if ($realfile =~ /\.c$/ && defined $stat && $stat =~ /^.\s*(?:extern\s+)?$Type\s+($Ident)(\s*)\(/s) From 61135e966367eda5056504ffd2f7518eaf77e25b Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:59 -0700 Subject: [PATCH 190/303] checkpatch: fix networking kernel-doc block comment defect checkpatch can generate a false positive when inserting a new kernel-doc block and function above an existing kernel-doc block. Fix it by checking that the context line is also a newly inserted line. 
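For reference, the comment style that check enforces in net/ and drivers/net/ looks like the sketch below (illustrative text only):

    /* Networking block comments start their text on the opening line
     * and begin each subsequent line with an asterisk, closing the
     * comment on a line of its own.
     */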
Signed-off-by: Joe Perches Reported-by: Darren Hart Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index e2cb1f4621b7..9185883f5885 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2086,6 +2086,7 @@ sub process { if ($realfile =~ m@^(drivers/net/|net/)@ && $prevrawline =~ /^\+[ \t]*\/\*/ && #starting /* $prevrawline !~ /\*\/[ \t]*$/ && #no trailing */ + $rawline =~ /^\+/ && #line is new $rawline !~ /^\+[ \t]*\*/) { #no leading * WARN("NETWORKING_BLOCK_COMMENT_STYLE", "networking block comments start with * on subsequent lines\n" . $hereprev); From 91bfe4843dff4426ca3a0dd1dab8454c1534022d Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:59 -0700 Subject: [PATCH 191/303] checkpatch: add --types option to report only specific message types Add a --types convenience option to show only specific message types. Combined with the --fix option, this can produce specific suggested formatting patches to files. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 56 +++++++++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 18 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 9185883f5885..3ba2db637384 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -31,8 +31,10 @@ my $show_types = 0; my $fix = 0; my $root; my %debug; -my %ignore_type = (); my %camelcase = (); +my %use_type = (); +my @use = (); +my %ignore_type = (); my @ignore = (); my $help = 0; my $configuration_file = ".checkpatch.conf"; @@ -56,6 +58,7 @@ Options: --terse one line per report -f, --file treat FILE as regular source file --subjective, --strict enable more subjective tests + --types TYPE(,TYPE2...) show only these comma separated message types --ignore TYPE(,TYPE2...) ignore various comma separated message types --max-line-length=n set the maximum line length, if exceeded, warn --show-types show the message "types" in the output @@ -120,6 +123,7 @@ GetOptions( 'subjective!' => \$check, 'strict!' => \$check, 'ignore=s' => \@ignore, + 'types=s' => \@use, 'show-types!' 
=> \$show_types, 'max-line-length=i' => \$max_line_length, 'root=s' => \$root, @@ -150,19 +154,38 @@ if ($#ARGV < 0) { exit(1); } -@ignore = split(/,/, join(',',@ignore)); -foreach my $word (@ignore) { - $word =~ s/\s*\n?$//g; - $word =~ s/^\s*//g; - $word =~ s/\s+/ /g; - $word =~ tr/[a-z]/[A-Z]/; +sub hash_save_array_words { + my ($hashRef, $arrayRef) = @_; - next if ($word =~ m/^\s*#/); - next if ($word =~ m/^\s*$/); + my @array = split(/,/, join(',', @$arrayRef)); + foreach my $word (@array) { + $word =~ s/\s*\n?$//g; + $word =~ s/^\s*//g; + $word =~ s/\s+/ /g; + $word =~ tr/[a-z]/[A-Z]/; - $ignore_type{$word}++; + next if ($word =~ m/^\s*#/); + next if ($word =~ m/^\s*$/); + + $hashRef->{$word}++; + } } +sub hash_show_words { + my ($hashRef, $prefix) = @_; + + if ($quiet == 0 && keys $hashRef) { + print "NOTE: $prefix message types:"; + foreach my $word (sort keys $hashRef) { + print " $word"; + } + print "\n\n"; + } +} + +hash_save_array_words(\%ignore_type, \@ignore); +hash_save_array_words(\%use_type, \@use); + my $dbg_values = 0; my $dbg_possible = 0; my $dbg_type = 0; @@ -1367,7 +1390,9 @@ sub possible { my $prefix = ''; sub show_type { - return !defined $ignore_type{$_[0]}; + return defined $use_type{$_[0]} if (scalar keys %use_type > 0); + + return !defined $ignore_type{$_[0]}; } sub report { @@ -4190,13 +4215,8 @@ sub process { } } - if ($quiet == 0 && keys %ignore_type) { - print "NOTE: Ignored message types:"; - foreach my $ignore (sort keys %ignore_type) { - print " $ignore"; - } - print "\n\n"; - } + hash_show_words(\%use_type, "Used"); + hash_show_words(\%ignore_type, "Ignored"); if ($clean == 0 && $fix && "@rawlines" ne "@fixed") { my $newfile = $filename . ".EXPERIMENTAL-checkpatch-fixes"; From f95a7e6a462ed1338bd576ccb557ff86772a0776 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:24:00 -0700 Subject: [PATCH 192/303] checkpatch: ignore #define TRACE_ macros The tracing subsystem uses slightly odd #defines to set path/directory locations for include files. These #defines can cause false positives for the complex macro tests so add exclusions for these specific #defines (TRACE_SYSTEM, TRACE_INCLUDE_FILE, TRACE_INCLUDE_PATH). Signed-off-by: Joe Perches Cc: Sarah Sharp Cc: Li Zefan Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 3ba2db637384..db7778a8f414 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3451,7 +3451,8 @@ sub process { $dstat !~ /^for\s*$Constant$/ && # for (...) $dstat !~ /^for\s*$Constant\s+(?:$Ident|-?$Constant)$/ && # for (...) bar() $dstat !~ /^do\s*{/ && # do {... - $dstat !~ /^\({/) # ({... + $dstat !~ /^\({/ && # ({... + $ctx !~ /^.\s*#\s*define\s+TRACE_(?:SYSTEM|INCLUDE_FILE|INCLUDE_PATH)\b/) { $ctx =~ s/\n*$//; my $herectx = $here . "\n"; From b34c648bb33ca143b98851fd7fe7250f1875c463 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:24:01 -0700 Subject: [PATCH 193/303] checkpatch: better --fix of SPACING errors. Previous attempt at fixing SPACING errors could make a hash of several defects. This patch should make --fix be a lot better at correcting these defects. Trim left and right sides of these defects appropriately instead of a somewhat random attempt at it. Trim left spaces from any following bit of the modified line when only a single space is required around an operator. 
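The rewrite below leans on three tiny string helpers (trim, ltrim, rtrim). As a language-neutral illustration, C equivalents might look like this; it is a sketch only, and unlike checkpatch's Perl versions, which return modified copies, these work in place:

    #include <ctype.h>
    #include <string.h>

    /* strip trailing whitespace in place */
    static char *rtrim(char *s)
    {
            size_t len = strlen(s);

            while (len && isspace((unsigned char)s[len - 1]))
                    s[--len] = '\0';
            return s;
    }

    /* skip leading whitespace; returns a pointer into s */
    static char *ltrim(char *s)
    {
            while (isspace((unsigned char)*s))
                    s++;
            return s;
    }

    /* strip both ends */
    static char *trim(char *s)
    {
            return ltrim(rtrim(s));
    }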
Signed-off-by: Joe Perches Cc: Phil Carmody Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 64 ++++++++++++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 22 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index db7778a8f414..e53df2b086b2 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1472,7 +1472,23 @@ sub check_absolute_file { sub trim { my ($string) = @_; - $string =~ s/(^\s+|\s+$)//g; + $string =~ s/^\s+|\s+$//g; + + return $string; +} + +sub ltrim { + my ($string) = @_; + + $string =~ s/^\s+//; + + return $string; +} + +sub rtrim { + my ($string) = @_; + + $string =~ s/\s+$//; return $string; } @@ -2821,6 +2837,7 @@ sub process { $off = 0; my $blank = copy_spacing($opline); + my $last_after = -1; for (my $n = 0; $n < $#elements; $n += 2) { @@ -2886,7 +2903,7 @@ sub process { $cc !~ /^\\/ && $cc !~ /^;/) { if (ERROR("SPACING", "space required after that '$op' $at\n" . $hereptr)) { - $good = trim($fix_elements[$n]) . " " . trim($fix_elements[$n + 1]) . " "; + $good = $fix_elements[$n] . trim($fix_elements[$n + 1]) . " "; $line_fixed = 1; } } @@ -2901,11 +2918,11 @@ sub process { if ($ctx =~ /Wx.|.xW/) { if (ERROR("SPACING", "spaces prohibited around that '$op' $at\n" . $hereptr)) { - $good = trim($fix_elements[$n]) . trim($fix_elements[$n + 1]); - $line_fixed = 1; + $good = rtrim($fix_elements[$n]) . trim($fix_elements[$n + 1]); if (defined $fix_elements[$n + 2]) { $fix_elements[$n + 2] =~ s/^\s+//; } + $line_fixed = 1; } } @@ -2914,8 +2931,9 @@ sub process { if ($ctx !~ /.x[WEC]/ && $cc !~ /^}/) { if (ERROR("SPACING", "space required after that '$op' $at\n" . $hereptr)) { - $good = trim($fix_elements[$n]) . trim($fix_elements[$n + 1]) . " "; + $good = $fix_elements[$n] . trim($fix_elements[$n + 1]) . " "; $line_fixed = 1; + $last_after = $n; } } @@ -2932,8 +2950,10 @@ sub process { if ($ctx !~ /[WEBC]x./ && $ca !~ /(?:\)|!|~|\*|-|\&|\||\+\+|\-\-|\{)$/) { if (ERROR("SPACING", "space required before that '$op' $at\n" . $hereptr)) { - $good = trim($fix_elements[$n]) . " " . trim($fix_elements[$n + 1]); - $line_fixed = 1; + if ($n != $last_after + 2) { + $good = $fix_elements[$n] . " " . ltrim($fix_elements[$n + 1]); + $line_fixed = 1; + } } } if ($op eq '*' && $cc =~/\s*$Modifier\b/) { @@ -2942,12 +2962,11 @@ sub process { } elsif ($ctx =~ /.xW/) { if (ERROR("SPACING", "space prohibited after that '$op' $at\n" . $hereptr)) { - $fixed_line =~ s/\s+$//; - $good = trim($fix_elements[$n]) . trim($fix_elements[$n + 1]); - $line_fixed = 1; + $good = $fix_elements[$n] . rtrim($fix_elements[$n + 1]); if (defined $fix_elements[$n + 2]) { $fix_elements[$n + 2] =~ s/^\s+//; } + $line_fixed = 1; } } @@ -2956,8 +2975,7 @@ sub process { if ($ctx !~ /[WEOBC]x[^W]/ && $ctx !~ /[^W]x[WOBEC]/) { if (ERROR("SPACING", "space required one side of that '$op' $at\n" . $hereptr)) { - $fixed_line =~ s/\s+$//; - $good = trim($fix_elements[$n]) . trim($fix_elements[$n + 1]) . " "; + $good = $fix_elements[$n] . trim($fix_elements[$n + 1]) . " "; $line_fixed = 1; } } @@ -2965,20 +2983,18 @@ sub process { ($ctx =~ /Wx./ && $cc =~ /^;/)) { if (ERROR("SPACING", "space prohibited before that '$op' $at\n" . $hereptr)) { - $fixed_line =~ s/\s+$//; - $good = trim($fix_elements[$n]) . trim($fix_elements[$n + 1]); + $good = rtrim($fix_elements[$n]) . trim($fix_elements[$n + 1]); $line_fixed = 1; } } if ($ctx =~ /ExW/) { if (ERROR("SPACING", "space prohibited after that '$op' $at\n" . 
$hereptr)) { - $fixed_line =~ s/\s+$//; - $good = trim($fix_elements[$n]) . trim($fix_elements[$n + 1]); - $line_fixed = 1; + $good = $fix_elements[$n] . trim($fix_elements[$n + 1]); if (defined $fix_elements[$n + 2]) { $fix_elements[$n + 2] =~ s/^\s+//; } + $line_fixed = 1; } } @@ -2992,8 +3008,10 @@ sub process { if ($ctx =~ /Wx[^WCE]|[^WCE]xW/) { if (ERROR("SPACING", "need consistent spacing around '$op' $at\n" . $hereptr)) { - $fixed_line =~ s/\s+$//; - $good = trim($fix_elements[$n]) . " " . trim($fix_elements[$n + 1]) . " "; + $good = rtrim($fix_elements[$n]) . " " . trim($fix_elements[$n + 1]) . " "; + if (defined $fix_elements[$n + 2]) { + $fix_elements[$n + 2] =~ s/^\s+//; + } $line_fixed = 1; } } @@ -3004,7 +3022,7 @@ sub process { if ($ctx =~ /Wx./) { if (ERROR("SPACING", "space prohibited before that '$op' $at\n" . $hereptr)) { - $good = trim($fix_elements[$n]) . trim($fix_elements[$n + 1]); + $good = rtrim($fix_elements[$n]) . trim($fix_elements[$n + 1]); $line_fixed = 1; } } @@ -3031,8 +3049,10 @@ sub process { if ($ok == 0) { if (ERROR("SPACING", "spaces required around that '$op' $at\n" . $hereptr)) { - $good = trim($fix_elements[$n]) . " " . trim($fix_elements[$n + 1]) . " "; - $good = $fix_elements[$n] . " " . trim($fix_elements[$n + 1]) . " "; + $good = rtrim($fix_elements[$n]) . " " . trim($fix_elements[$n + 1]) . " "; + if (defined $fix_elements[$n + 2]) { + $fix_elements[$n + 2] =~ s/^\s+//; + } $line_fixed = 1; } } From 1b5539b1ffbdcf7113eebea7f37141d4468d9070 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:24:03 -0700 Subject: [PATCH 194/303] checkpatch: reduce runtime/cpu time used There are some cases where checkpatch can take a long time to complete. Reduce the likelihood of this long run-time by adding a new test for lines with and without comments and eliminating checks on lines with only comments. This reduces the number of "ctx_statement_block" calls, and also the number of tests of $stat, which is now undefined for these blank lines. One test in particular, the "check for switch/default statements without a break", could take an extremely long time to parse as it tries to skip interleaving comments within the ctx_statement_block/$stat and that could be done multiple times unnecessarily. A small test case taken from cfg80211.h before this patch would take 1000's of seconds to run, now it's just a couple seconds. 
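The core trick in the patch below is cheap pre-filtering: blank out comment text once per line, then invoke the expensive statement parser only when real code remains. A simplified C sketch of that idea, under the assumption that a comment closes on the line it opens (the real script also tracks multi-line comment state):

    #include <stdbool.h>
    #include <string.h>

    /* Overwrite comment spans with spaces so only code survives. */
    static void blank_comments(char *s)
    {
            char *open;

            while ((open = strstr(s, "/*")) != NULL) {
                    char *close = strstr(open + 2, "*/");

                    if (!close) {
                            memset(open, ' ', strlen(open));
                            return;
                    }
                    memset(open, ' ', (size_t)(close - open) + 2);
                    s = close + 2;
            }
    }

    /* Cheap guard: is anything left besides whitespace? */
    static bool has_code(const char *line)
    {
            for (; *line; line++)
                    if (*line != ' ' && *line != '\t')
                            return true;
            return false;
    }

A caller would run blank_comments() on a scratch copy of each line and skip the costly parsing work whenever has_code() returns false, which is analogous to the $sline guard the patch adds.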
Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index e53df2b086b2..55277a8e1527 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1678,6 +1678,8 @@ sub process { $linenr = 0; foreach my $line (@lines) { $linenr++; + my $sline = $line; #copy of $line + $sline =~ s/$;/ /g; #with comments as spaces my $rawline = $rawlines[$linenr - 1]; @@ -2194,7 +2196,7 @@ sub process { $realline_next); #print "LINE<$line>\n"; if ($linenr >= $suppress_statement && - $realcnt && $line =~ /.\s*\S/) { + $realcnt && $sline =~ /.\s*\S/) { ($stat, $cond, $line_nr_next, $remain_next, $off_next) = ctx_statement_block($linenr, $realcnt, 0); $stat =~ s/\n./\n /g; From 58cb3cf66cc6330910316abb1dc7a7aa78917a27 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:24:04 -0700 Subject: [PATCH 195/303] checkpatch: fix perl version 5.12 and earlier incompatibility A previous patch ("checkpatch: add --types option to report only specific message types") uses a perl syntax introduced in perl version 5.14. Use the backward compatible perl syntax instead. Signed-off-by: Joe Perches Reported-by: Julia Lawall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 55277a8e1527..9ba4fc44112a 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -174,9 +174,9 @@ sub hash_save_array_words { sub hash_show_words { my ($hashRef, $prefix) = @_; - if ($quiet == 0 && keys $hashRef) { + if ($quiet == 0 && keys %$hashRef) { print "NOTE: $prefix message types:"; - foreach my $word (sort keys $hashRef) { + foreach my $word (sort keys %$hashRef) { print " $word"; } print "\n\n"; From 8716de383b82f16d920513138f1691e40ef5a9e3 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:24:05 -0700 Subject: [PATCH 196/303] checkpatch: add test for positional misuse of section specifiers like __initdata As discussed recently on the arm [1] and lm-sensors [2] lists, it is possible to use section markers on variables in a way which gcc doesn't understand (or at least not the way the developer intended): static struct __initdata samsung_pll_clock exynos4_plls[nr_plls] = { does NOT put exynos4_plls in the .initdata section. The __initdata marker can be virtually anywhere on the line, EXCEPT right after "struct". The preferred location is before the "=" sign if there is one, or before the trailing ";" otherwise. [1] http://permalink.gmane.org/gmane.linux.ports.arm.kernel/258149 [2] http://lists.lm-sensors.org/pipermail/lm-sensors/2013-August/039836.html So, update checkpatch to find these misuses and report an error when it's immediately after struct or union, and a warning when it's otherwise not immediately before the ; or =. 
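Concretely, with an illustrative struct and variable name, the difference the new check polices is:

    /* WRONG: right after "struct", gcc ties the attribute to the type
     * reference and ignores it, so my_plls does NOT land in the init
     * data section.
     */
    static struct __initdata pll_desc my_plls[4];

    /* PREFERRED: after the variable, just before the '=' or the ';'. */
    static struct pll_desc my_plls[4] __initdata;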
A similar patch was suggested by Andi Kleen https://lkml.org/lkml/2013/8/5/648 Signed-off-by: Joe Perches Suggested-by: Jean Delvare Tested-by: Guenter Roeck Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 47 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 9ba4fc44112a..47016c304c84 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -242,6 +242,8 @@ our $Sparse = qr{ __rcu }x; +our $InitAttribute = qr{__(?:mem|cpu|dev|net_|)(?:initdata|initconst|init\b)}; + # Notes to $Attribute: # We need \b after 'init' otherwise 'initconst' will cause a false positive in a check our $Attribute = qr{ @@ -262,7 +264,7 @@ our $Attribute = qr{ __deprecated| __read_mostly| __kprobes| - __(?:mem|cpu|dev|)(?:initdata|initconst|init\b)| + $InitAttribute| ____cacheline_aligned| ____cacheline_aligned_in_smp| ____cacheline_internodealigned_in_smp| @@ -292,6 +294,7 @@ our $Operators = qr{ }x; our $NonptrType; +our $NonptrTypeWithAttr; our $Type; our $Declare; @@ -354,6 +357,12 @@ our @typeList = ( qr{${Ident}_handler}, qr{${Ident}_handler_fn}, ); +our @typeListWithAttr = ( + @typeList, + qr{struct\s+$InitAttribute\s+$Ident}, + qr{union\s+$InitAttribute\s+$Ident}, +); + our @modifierList = ( qr{fastcall}, ); @@ -367,6 +376,7 @@ our $allowed_asm_includes = qr{(?x: sub build_types { my $mods = "(?x: \n" . join("|\n ", @modifierList) . "\n)"; my $all = "(?x: \n" . join("|\n ", @typeList) . "\n)"; + my $allWithAttr = "(?x: \n" . join("|\n ", @typeListWithAttr) . "\n)"; $Modifier = qr{(?:$Attribute|$Sparse|$mods)}; $NonptrType = qr{ (?:$Modifier\s+|const\s+)* @@ -377,6 +387,15 @@ sub build_types { ) (?:\s+$Modifier|\s+const)* }x; + $NonptrTypeWithAttr = qr{ + (?:$Modifier\s+|const\s+)* + (?: + (?:typeof|__typeof__)\s*\([^\)]*\)| + (?:$typeTypedefs\b)| + (?:${allWithAttr}\b) + ) + (?:\s+$Modifier|\s+const)* + }x; $Type = qr{ $NonptrType (?:(?:\s|\*|\[\])+\s*const|(?:\s|\*|\[\])+|(?:\s*\[\s*\])+)? @@ -3706,6 +3725,32 @@ sub process { } } +sub string_find_replace { + my ($string, $find, $replace) = @_; + + $string =~ s/$find/$replace/g; + + return $string; +} + +# check for bad placement of section $InitAttribute (e.g.: __initdata) + if ($line =~ /(\b$InitAttribute\b)/) { + my $attr = $1; + if ($line =~ /^\+\s*static\s+(?:const\s+)?(?:$attr\s+)?($NonptrTypeWithAttr)\s+(?:$attr\s+)?($Ident(?:\[[^]]*\])?)\s*[=;]/) { + my $ptr = $1; + my $var = $2; + if ((($ptr =~ /\b(union|struct)\s+$attr\b/ && + ERROR("MISPLACED_INIT", + "$attr should be placed after $var\n" . $herecurr)) || + ($ptr !~ /\b(union|struct)\s+$attr\b/ && + WARN("MISPLACED_INIT", + "$attr should be placed after $var\n" . $herecurr))) && + $fix) { + $fixed[$linenr - 1] =~ s/(\bstatic\s+(?:const\s+)?)(?:$attr\s+)?($NonptrTypeWithAttr)\s+(?:$attr\s+)?($Ident(?:\[[^]]*\])?)\s*([=;])\s*/"$1" . trim(string_find_replace($2, "\\s*$attr\\s*", " ")) . " " . trim(string_find_replace($3, "\\s*$attr\\s*", "")) . " $attr" . ("$4" eq ";" ? ";" : " = ")/e; + } + } + } + # prefer usleep_range over udelay if ($line =~ /\budelay\s*\(\s*(\d+)\s*\)/) { # ignore udelay's < 10, however From 91cf5ab60ff82ecf4550a596867787c1e360dd3f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 11 Sep 2013 14:24:06 -0700 Subject: [PATCH 197/303] epoll: add a reschedule point in ep_free() ep_free() might iterate on a huge set of epitems and hold cpu too long. Add two cond_resched() in order to yield cpu to other tasks. 
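In general form the fix looks like the sketch below; my_table, my_item and the helpers are hypothetical. The essential constraint is that cond_resched() may sleep, so it is legal under a mutex but never under a spinlock or with interrupts disabled.

    static void teardown_all(struct my_table *t)
    {
            struct my_item *it;

            mutex_lock(&t->lock);
            while ((it = first_item(t)) != NULL) {
                    remove_item(t, it);
                    cond_resched();    /* yield to other runnable tasks */
            }
            mutex_unlock(&t->lock);
    }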
This is safe as we only hold mutexes in this function. Signed-off-by: Eric Dumazet Cc: Al Viro Cc: Theodore Ts'o Acked-by: Eric Wong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 293f86741ddb..473e09da7d02 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -740,6 +740,7 @@ static void ep_free(struct eventpoll *ep) epi = rb_entry(rbp, struct epitem, rbn); ep_unregister_pollwait(ep, epi); + cond_resched(); } /* @@ -754,6 +755,7 @@ static void ep_free(struct eventpoll *ep) while ((rbp = rb_first(&ep->rbr)) != NULL) { epi = rb_entry(rbp, struct epitem, rbn); ep_remove(ep, epi); + cond_resched(); } mutex_unlock(&ep->mtx); From 3d267f24d4c7bcc829ce9daa92e41c3f390c95dd Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Wed, 11 Sep 2013 14:24:07 -0700 Subject: [PATCH 198/303] firmware/dmi_scan: drop obsolete comment This comment predates the introduction of early_ioremap. Since then the missing calls to dmi_iounmap have been added by Ingo and Yinghai in commits 0d64484f7ea1 ("x86: fix DMI ioremap leak") and 3212bff370c2 ("x86: left over fix for leak of early_ioremp in dmi_scan") . That was over 5 years ago so it is about time to drop this now misleading comment. Signed-off-by: Jean Delvare Cc: Ingo Molnar Cc: Yinghai Lu Cc: Joe Perches Cc: Ben Hutchings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/firmware/dmi_scan.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c index 232fa8fce26a..9e50cb997a42 100644 --- a/drivers/firmware/dmi_scan.c +++ b/drivers/firmware/dmi_scan.c @@ -504,11 +504,6 @@ void __init dmi_scan_machine(void) } } else { - /* - * no iounmap() for that ioremap(); it would be a no-op, but - * it's so early in setup that sucker gets confused into doing - * what it shouldn't if we actually call it. - */ p = dmi_ioremap(0xF0000, 0x10000); if (p == NULL) goto error; From 02d9c47f1bf2304d6482e1e69e00c06791d86908 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Wed, 11 Sep 2013 14:24:08 -0700 Subject: [PATCH 199/303] firmware/dmi_scan: fix most checkpatch errors and warnings Fix all errors and trivial warnings reported by checkpatch for file drivers/firmware/dmi_scan.c. 
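Much of the churn in the diff that follows is the printk-to-pr_* conversion. The two spellings log at the same severity; pr_err() is shorthand that additionally honors a file-local pr_fmt() prefix:

    /* before */
    printk(KERN_ERR "dmi_string: cannot allocate %Zu bytes.\n", len);

    /* after */
    pr_err("dmi_string: cannot allocate %Zu bytes.\n", len);

    /* pr_err() is defined, approximately, as */
    #define pr_err(fmt, ...) printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)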
Signed-off-by: Jean Delvare Cc: Joe Perches Cc: Ben Hutchings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/firmware/dmi_scan.c | 47 +++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c index 9e50cb997a42..5a5ca664f3e7 100644 --- a/drivers/firmware/dmi_scan.c +++ b/drivers/firmware/dmi_scan.c @@ -63,7 +63,7 @@ static char * __init dmi_string(const struct dmi_header *dm, u8 s) if (str != NULL) strcpy(str, bp); else - printk(KERN_ERR "dmi_string: cannot allocate %Zu bytes.\n", len); + pr_err("dmi_string: cannot allocate %Zu bytes.\n", len); return str; } @@ -140,9 +140,10 @@ int dmi_available; /* * Save a DMI string */ -static void __init dmi_save_ident(const struct dmi_header *dm, int slot, int string) +static void __init dmi_save_ident(const struct dmi_header *dm, int slot, + int string) { - const char *d = (const char*) dm; + const char *d = (const char *) dm; char *p; if (dmi_ident[slot]) @@ -155,9 +156,10 @@ static void __init dmi_save_ident(const struct dmi_header *dm, int slot, int str dmi_ident[slot] = p; } -static void __init dmi_save_uuid(const struct dmi_header *dm, int slot, int index) +static void __init dmi_save_uuid(const struct dmi_header *dm, int slot, + int index) { - const u8 *d = (u8*) dm + index; + const u8 *d = (u8 *) dm + index; char *s; int is_ff = 1, is_00 = 1, i; @@ -188,12 +190,13 @@ static void __init dmi_save_uuid(const struct dmi_header *dm, int slot, int inde else sprintf(s, "%pUB", d); - dmi_ident[slot] = s; + dmi_ident[slot] = s; } -static void __init dmi_save_type(const struct dmi_header *dm, int slot, int index) +static void __init dmi_save_type(const struct dmi_header *dm, int slot, + int index) { - const u8 *d = (u8*) dm + index; + const u8 *d = (u8 *) dm + index; char *s; if (dmi_ident[slot]) @@ -217,7 +220,7 @@ static void __init dmi_save_one_device(int type, const char *name) dev = dmi_alloc(sizeof(*dev) + strlen(name) + 1); if (!dev) { - printk(KERN_ERR "dmi_save_one_device: out of memory.\n"); + pr_err("dmi_save_one_device: out of memory.\n"); return; } @@ -256,8 +259,7 @@ static void __init dmi_save_oem_strings_devices(const struct dmi_header *dm) dev = dmi_alloc(sizeof(*dev)); if (!dev) { - printk(KERN_ERR - "dmi_save_oem_strings_devices: out of memory.\n"); + pr_err("dmi_save_oem_strings_devices: out of memory.\n"); break; } @@ -272,11 +274,11 @@ static void __init dmi_save_oem_strings_devices(const struct dmi_header *dm) static void __init dmi_save_ipmi_device(const struct dmi_header *dm) { struct dmi_device *dev; - void * data; + void *data; data = dmi_alloc(dm->length); if (data == NULL) { - printk(KERN_ERR "dmi_save_ipmi_device: out of memory.\n"); + pr_err("dmi_save_ipmi_device: out of memory.\n"); return; } @@ -284,7 +286,7 @@ static void __init dmi_save_ipmi_device(const struct dmi_header *dm) dev = dmi_alloc(sizeof(*dev)); if (!dev) { - printk(KERN_ERR "dmi_save_ipmi_device: out of memory.\n"); + pr_err("dmi_save_ipmi_device: out of memory.\n"); return; } @@ -302,7 +304,7 @@ static void __init dmi_save_dev_onboard(int instance, int segment, int bus, onboard_dev = dmi_alloc(sizeof(*onboard_dev) + strlen(name) + 1); if (!onboard_dev) { - printk(KERN_ERR "dmi_save_dev_onboard: out of memory.\n"); + pr_err("dmi_save_dev_onboard: out of memory.\n"); return; } onboard_dev->instance = instance; @@ -320,7 +322,7 @@ static void __init dmi_save_dev_onboard(int instance, int segment, int bus, static void __init 
dmi_save_extended_devices(const struct dmi_header *dm) { - const u8 *d = (u8*) dm + 5; + const u8 *d = (u8 *) dm + 5; /* Skip disabled device */ if ((*d & 0x80) == 0) @@ -338,7 +340,7 @@ static void __init dmi_save_extended_devices(const struct dmi_header *dm) */ static void __init dmi_decode(const struct dmi_header *dm, void *dummy) { - switch(dm->type) { + switch (dm->type) { case 0: /* BIOS Information */ dmi_save_ident(dm, DMI_BIOS_VENDOR, 4); dmi_save_ident(dm, DMI_BIOS_VERSION, 5); @@ -502,8 +504,7 @@ void __init dmi_scan_machine(void) dmi_available = 1; goto out; } - } - else { + } else { p = dmi_ioremap(0xF0000, 0x10000); if (p == NULL) goto error; @@ -528,7 +529,7 @@ void __init dmi_scan_machine(void) dmi_iounmap(p, 0x10000); } error: - printk(KERN_INFO "DMI not present or invalid.\n"); + pr_info("DMI not present or invalid.\n"); out: dmi_initialized = 1; } @@ -664,7 +665,7 @@ int dmi_name_in_serial(const char *str) /** * dmi_name_in_vendors - Check if string is in the DMI system or board vendor name - * @str: Case sensitive Name + * @str: Case sensitive Name */ int dmi_name_in_vendors(const char *str) { @@ -691,13 +692,13 @@ EXPORT_SYMBOL(dmi_name_in_vendors); * A new search is initiated by passing %NULL as the @from argument. * If @from is not %NULL, searches continue from next device. */ -const struct dmi_device * dmi_find_device(int type, const char *name, +const struct dmi_device *dmi_find_device(int type, const char *name, const struct dmi_device *from) { const struct list_head *head = from ? &from->list : &dmi_devices; struct list_head *d; - for(d = head->next; d != &dmi_devices; d = d->next) { + for (d = head->next; d != &dmi_devices; d = d->next) { const struct dmi_device *dev = list_entry(d, struct dmi_device, list); From ffbbb96dd7570b9aafd426cd77a7ee03d224cabf Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Wed, 11 Sep 2013 14:24:09 -0700 Subject: [PATCH 200/303] firmware/dmi_scan: constify strings Add const to all DMI string pointers where this is possible. This fixes a checkpatch warning. Signed-off-by: Jean Delvare Cc: Joe Perches Cc: Ben Hutchings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/firmware/dmi_scan.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c index 5a5ca664f3e7..9a094bb44e3d 100644 --- a/drivers/firmware/dmi_scan.c +++ b/drivers/firmware/dmi_scan.c @@ -14,7 +14,7 @@ * of and an antecedent to, SMBIOS, which stands for System * Management BIOS. 
See further: http://www.dmtf.org/standards */ -static char dmi_empty_string[] = " "; +static const char dmi_empty_string[] = " "; static u16 __initdata dmi_ver; /* @@ -49,7 +49,7 @@ static const char * __init dmi_string_nosave(const struct dmi_header *dm, u8 s) return ""; } -static char * __init dmi_string(const struct dmi_header *dm, u8 s) +static const char * __init dmi_string(const struct dmi_header *dm, u8 s) { const char *bp = dmi_string_nosave(dm, s); char *str; @@ -133,7 +133,7 @@ static int __init dmi_checksum(const u8 *buf, u8 len) return sum == 0; } -static char *dmi_ident[DMI_STRING_MAX]; +static const char *dmi_ident[DMI_STRING_MAX]; static LIST_HEAD(dmi_devices); int dmi_available; @@ -144,7 +144,7 @@ static void __init dmi_save_ident(const struct dmi_header *dm, int slot, int string) { const char *d = (const char *) dm; - char *p; + const char *p; if (dmi_ident[slot]) return; @@ -252,7 +252,7 @@ static void __init dmi_save_oem_strings_devices(const struct dmi_header *dm) struct dmi_device *dev; for (i = 1; i <= count; i++) { - char *devname = dmi_string(dm, i); + const char *devname = dmi_string(dm, i); if (devname == dmi_empty_string) continue; From ae79744975cb0b3b9c469fe1a05db37d2943c863 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Wed, 11 Sep 2013 14:24:10 -0700 Subject: [PATCH 201/303] firmware/dmi_scan: drop OOM messages As reported by Joe Perches: OOM messages generally aren't useful. dmi_alloc is either a trivial front-end to kzalloc, and kzalloc already does a dump_stack() when OOM, or for x86, dmi_alloc uses extend_brk which BUGs when unsuccessful. So we can remove all 6 such log messages in the dmi_scan driver, to shrink the binary size (by 528 bytes on x86_64.) Signed-off-by: Jean Delvare Reported-by: Joe Perches Cc: Ben Hutchings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/firmware/dmi_scan.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c index 9a094bb44e3d..fa0affb699b4 100644 --- a/drivers/firmware/dmi_scan.c +++ b/drivers/firmware/dmi_scan.c @@ -62,8 +62,6 @@ static const char * __init dmi_string(const struct dmi_header *dm, u8 s) str = dmi_alloc(len); if (str != NULL) strcpy(str, bp); - else - pr_err("dmi_string: cannot allocate %Zu bytes.\n", len); return str; } @@ -219,10 +217,8 @@ static void __init dmi_save_one_device(int type, const char *name) return; dev = dmi_alloc(sizeof(*dev) + strlen(name) + 1); - if (!dev) { - pr_err("dmi_save_one_device: out of memory.\n"); + if (!dev) return; - } dev->type = type; strcpy((char *)(dev + 1), name); @@ -258,10 +254,8 @@ static void __init dmi_save_oem_strings_devices(const struct dmi_header *dm) continue; dev = dmi_alloc(sizeof(*dev)); - if (!dev) { - pr_err("dmi_save_oem_strings_devices: out of memory.\n"); + if (!dev) break; - } dev->type = DMI_DEV_TYPE_OEM_STRING; dev->name = devname; @@ -277,18 +271,14 @@ static void __init dmi_save_ipmi_device(const struct dmi_header *dm) void *data; data = dmi_alloc(dm->length); - if (data == NULL) { - pr_err("dmi_save_ipmi_device: out of memory.\n"); + if (data == NULL) return; - } memcpy(data, dm, dm->length); dev = dmi_alloc(sizeof(*dev)); - if (!dev) { - pr_err("dmi_save_ipmi_device: out of memory.\n"); + if (!dev) return; - } dev->type = DMI_DEV_TYPE_IPMI; dev->name = "IPMI controller"; @@ -303,10 +293,9 @@ static void __init dmi_save_dev_onboard(int instance, int segment, int bus, struct dmi_dev_onboard *onboard_dev; onboard_dev = 
dmi_alloc(sizeof(*onboard_dev) + strlen(name) + 1); - if (!onboard_dev) { - pr_err("dmi_save_dev_onboard: out of memory.\n"); + if (!onboard_dev) return; - } + onboard_dev->instance = instance; onboard_dev->segment = segment; onboard_dev->bus = bus; From c802d64a356b5cf349121ac4c5e005f037ce548d Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 11 Sep 2013 14:24:11 -0700 Subject: [PATCH 202/303] kprobes: unify insn caches The current kprobes insn caches allocate memory areas for insn slots with module_alloc(). The assumption is that the kernel image and module area are both within the same +/- 2GB memory area. This however is not true for s390 where the kernel image resides within the first 2GB (DMA memory area), but the module area is far away in the vmalloc area, usually somewhere close below the 4TB area. For new pc-relative instructions s390 needs insn slots that are within +/- 2GB of each area. That way we can patch displacements of pc-relative instructions within the insn slots just like x86 and powerpc. The module area already works with the normal insn slot allocator, however there is currently no way to get insn slots that are within the first 2GB on s390 (aka the DMA area). Therefore this patch set modifies the kprobes insn slot cache code in order to allow specifying a custom allocator for the insn slot cache pages. In addition, architectures can now have private insn slot caches without the need to modify common code. Patch 1 unifies and simplifies the current insn and optinsn caches implementation. This is a preparation which allows adding more insn caches in a simple way. Patch 2 adds the possibility to specify a custom allocator. Patch 3 makes s390 use the new insn slot mechanisms and adds support for pc-relative instructions with long displacements. This patch (of 3): The two insn caches (insn, and optinsn) each have their own mutex and alloc/free functions (get_[opt]insn_slot() / free_[opt]insn_slot()). Since there is the need for yet another insn cache which satisfies DMA allocations on s390, unify and simplify the current implementation: - Move the per insn cache mutex into struct kprobe_insn_cache. - Move the alloc/free functions to kprobe.h so they are simply wrappers for the generic __get_insn_slot/__free_insn_slot functions. The implementation is done with a DEFINE_INSN_CACHE_OPS() macro which provides the alloc/free functions for each cache if needed. - Move the struct kprobe_insn_cache to kprobe.h, which allows architecture specific insn slot caches to be defined outside of the core kprobes code.
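With this in place, adding a new insn cache becomes declarative; a minimal sketch of what an architecture would write (the name "myinsn" is illustrative; see patch 3 for the real s390 user):

        /* generates get_myinsn_slot() and free_myinsn_slot() wrappers */
        DEFINE_INSN_CACHE_OPS(myinsn);

        struct kprobe_insn_cache kprobe_myinsn_slots = {
                .mutex = __MUTEX_INITIALIZER(kprobe_myinsn_slots.mutex),
                .pages = LIST_HEAD_INIT(kprobe_myinsn_slots.pages),
                .insn_size = MAX_INSN_SIZE,
                .nr_garbage = 0,
        };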
Signed-off-by: Heiko Carstens Cc: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Ingo Molnar Cc: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kprobes.h | 32 +++++++++++++++--- kernel/kprobes.c | 75 ++++++++++++----------------------------- 2 files changed, 49 insertions(+), 58 deletions(-) diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index ca1d27a0d6a6..077f65321b5e 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -264,10 +264,34 @@ extern void arch_arm_kprobe(struct kprobe *p); extern void arch_disarm_kprobe(struct kprobe *p); extern int arch_init_kprobes(void); extern void show_registers(struct pt_regs *regs); -extern kprobe_opcode_t *get_insn_slot(void); -extern void free_insn_slot(kprobe_opcode_t *slot, int dirty); extern void kprobes_inc_nmissed_count(struct kprobe *p); +struct kprobe_insn_cache { + struct mutex mutex; + struct list_head pages; /* list of kprobe_insn_page */ + size_t insn_size; /* size of instruction slot */ + int nr_garbage; +}; + +extern kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c); +extern void __free_insn_slot(struct kprobe_insn_cache *c, + kprobe_opcode_t *slot, int dirty); + +#define DEFINE_INSN_CACHE_OPS(__name) \ +extern struct kprobe_insn_cache kprobe_##__name##_slots; \ + \ +static inline kprobe_opcode_t *get_##__name##_slot(void) \ +{ \ + return __get_insn_slot(&kprobe_##__name##_slots); \ +} \ + \ +static inline void free_##__name##_slot(kprobe_opcode_t *slot, int dirty)\ +{ \ + __free_insn_slot(&kprobe_##__name##_slots, slot, dirty); \ +} \ + +DEFINE_INSN_CACHE_OPS(insn); + #ifdef CONFIG_OPTPROBES /* * Internal structure for direct jump optimized probe @@ -287,13 +311,13 @@ extern void arch_optimize_kprobes(struct list_head *oplist); extern void arch_unoptimize_kprobes(struct list_head *oplist, struct list_head *done_list); extern void arch_unoptimize_kprobe(struct optimized_kprobe *op); -extern kprobe_opcode_t *get_optinsn_slot(void); -extern void free_optinsn_slot(kprobe_opcode_t *slot, int dirty); extern int arch_within_optimized_kprobe(struct optimized_kprobe *op, unsigned long addr); extern void opt_pre_handler(struct kprobe *p, struct pt_regs *regs); +DEFINE_INSN_CACHE_OPS(optinsn); + #ifdef CONFIG_SYSCTL extern int sysctl_kprobes_optimization; extern int proc_kprobes_optimization_handler(struct ctl_table *table, diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 6e33498d665c..9e4912dc5559 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -121,12 +121,6 @@ struct kprobe_insn_page { (offsetof(struct kprobe_insn_page, slot_used) + \ (sizeof(char) * (slots))) -struct kprobe_insn_cache { - struct list_head pages; /* list of kprobe_insn_page */ - size_t insn_size; /* size of instruction slot */ - int nr_garbage; -}; - static int slots_per_page(struct kprobe_insn_cache *c) { return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t)); @@ -138,8 +132,8 @@ enum kprobe_slot_state { SLOT_USED = 2, }; -static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */ -static struct kprobe_insn_cache kprobe_insn_slots = { +struct kprobe_insn_cache kprobe_insn_slots = { + .mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex), .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages), .insn_size = MAX_INSN_SIZE, .nr_garbage = 0, @@ -150,10 +144,12 @@ static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c); * __get_insn_slot() - Find a slot on an executable page for an instruction. 
* We allocate an executable page if there's no room on existing ones. */ -static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) +kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) { struct kprobe_insn_page *kip; + kprobe_opcode_t *slot = NULL; + mutex_lock(&c->mutex); retry: list_for_each_entry(kip, &c->pages, list) { if (kip->nused < slots_per_page(c)) { @@ -162,7 +158,8 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) if (kip->slot_used[i] == SLOT_CLEAN) { kip->slot_used[i] = SLOT_USED; kip->nused++; - return kip->insns + (i * c->insn_size); + slot = kip->insns + (i * c->insn_size); + goto out; } } /* kip->nused is broken. Fix it. */ @@ -178,7 +175,7 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) /* All out of space. Need to allocate a new page. */ kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL); if (!kip) - return NULL; + goto out; /* * Use module_alloc so this page is within +/- 2GB of where the @@ -188,7 +185,7 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) kip->insns = module_alloc(PAGE_SIZE); if (!kip->insns) { kfree(kip); - return NULL; + goto out; } INIT_LIST_HEAD(&kip->list); memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c)); @@ -196,19 +193,10 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) kip->nused = 1; kip->ngarbage = 0; list_add(&kip->list, &c->pages); - return kip->insns; -} - - -kprobe_opcode_t __kprobes *get_insn_slot(void) -{ - kprobe_opcode_t *ret = NULL; - - mutex_lock(&kprobe_insn_mutex); - ret = __get_insn_slot(&kprobe_insn_slots); - mutex_unlock(&kprobe_insn_mutex); - - return ret; + slot = kip->insns; +out: + mutex_unlock(&c->mutex); + return slot; } /* Return 1 if all garbages are collected, otherwise 0. */ @@ -255,11 +243,12 @@ static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c) return 0; } -static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, - kprobe_opcode_t *slot, int dirty) +void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, + kprobe_opcode_t *slot, int dirty) { struct kprobe_insn_page *kip; + mutex_lock(&c->mutex); list_for_each_entry(kip, &c->pages, list) { long idx = ((long)slot - (long)kip->insns) / (c->insn_size * sizeof(kprobe_opcode_t)); @@ -272,45 +261,23 @@ static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, collect_garbage_slots(c); } else collect_one_slot(kip, idx); - return; + goto out; } } /* Could not free this slot. 
*/ WARN_ON(1); +out: + mutex_unlock(&c->mutex); } -void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) -{ - mutex_lock(&kprobe_insn_mutex); - __free_insn_slot(&kprobe_insn_slots, slot, dirty); - mutex_unlock(&kprobe_insn_mutex); -} #ifdef CONFIG_OPTPROBES /* For optimized_kprobe buffer */ -static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */ -static struct kprobe_insn_cache kprobe_optinsn_slots = { +struct kprobe_insn_cache kprobe_optinsn_slots = { + .mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex), .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages), /* .insn_size is initialized later */ .nr_garbage = 0, }; -/* Get a slot for optimized_kprobe buffer */ -kprobe_opcode_t __kprobes *get_optinsn_slot(void) -{ - kprobe_opcode_t *ret = NULL; - - mutex_lock(&kprobe_optinsn_mutex); - ret = __get_insn_slot(&kprobe_optinsn_slots); - mutex_unlock(&kprobe_optinsn_mutex); - - return ret; -} - -void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty) -{ - mutex_lock(&kprobe_optinsn_mutex); - __free_insn_slot(&kprobe_optinsn_slots, slot, dirty); - mutex_unlock(&kprobe_optinsn_mutex); -} #endif #endif From af96397de8600232effbff43dc8b4ca20ddc02b1 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 11 Sep 2013 14:24:13 -0700 Subject: [PATCH 203/303] kprobes: allow to specify custom allocator for insn caches The current two insn slot caches both use module_alloc/module_free to allocate and free insn slot cache pages. For s390 this is not sufficient since it needs to allocate insn slots that are either within the vmalloc module area or within DMA memory. Therefore add a mechanism which allows specifying a custom allocator per insn slot cache. Signed-off-by: Heiko Carstens Acked-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Ingo Molnar Cc: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kprobes.h | 2 ++ kernel/kprobes.c | 20 ++++++++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 077f65321b5e..925eaf28fca9 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -268,6 +268,8 @@ extern void kprobes_inc_nmissed_count(struct kprobe *p); struct kprobe_insn_cache { struct mutex mutex; + void *(*alloc)(void); /* allocate insn page */ + void (*free)(void *); /* free insn page */ struct list_head pages; /* list of kprobe_insn_page */ size_t insn_size; /* size of instruction slot */ int nr_garbage; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9e4912dc5559..a0d367a49122 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -112,6 +112,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = { struct kprobe_insn_page { struct list_head list; kprobe_opcode_t *insns; /* Page of instruction slots */ + struct kprobe_insn_cache *cache; int nused; int ngarbage; char slot_used[]; @@ -132,8 +133,20 @@ enum kprobe_slot_state { SLOT_USED = 2, }; +static void *alloc_insn_page(void) +{ + return module_alloc(PAGE_SIZE); +} + +static void free_insn_page(void *page) +{ + module_free(NULL, page); +} + struct kprobe_insn_cache kprobe_insn_slots = { .mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex), + .alloc = alloc_insn_page, + .free = free_insn_page, .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages), .insn_size = MAX_INSN_SIZE, .nr_garbage = 0, @@ -182,7 +195,7 @@ kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) * kernel image and loaded module images reside.
This is required * so x86_64 can correctly handle the %rip-relative fixups. */ - kip->insns = module_alloc(PAGE_SIZE); + kip->insns = c->alloc(); if (!kip->insns) { kfree(kip); goto out; @@ -192,6 +205,7 @@ kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) kip->slot_used[0] = SLOT_USED; kip->nused = 1; kip->ngarbage = 0; + kip->cache = c; list_add(&kip->list, &c->pages); slot = kip->insns; out: @@ -213,7 +227,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) */ if (!list_is_singular(&kip->list)) { list_del(&kip->list); - module_free(NULL, kip->insns); + kip->cache->free(kip->insns); kfree(kip); } return 1; @@ -274,6 +288,8 @@ out: /* For optimized_kprobe buffer */ struct kprobe_insn_cache kprobe_optinsn_slots = { .mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex), + .alloc = alloc_insn_page, + .free = free_insn_page, .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages), /* .insn_size is initialized later */ .nr_garbage = 0, From 63c40436a1afc837f3ace6b5a39c547bc91c20bc Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 11 Sep 2013 14:24:14 -0700 Subject: [PATCH 204/303] s390/kprobes: add support for pc-relative long displacement instructions With the general-instruction extension facility (z10) a couple of instructions with a pc-relative long displacement were introduced. The kprobes support for these instructions however was never implemented. As a result, if anybody ever put a probe on any of these instructions the behaviour would have been random after the instruction got executed within the insn slot. So let's add the missing handling for these instructions. Since all of the new instructions have a 32-bit signed displacement, the easiest solution is to allocate an insn slot that is within the same 2GB area as the original instruction and patch the displacement field.
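The patching itself is plain pointer arithmetic in halfword units; a worked sketch (the addresses are made up for illustration; the real code is copy_instruction() in the diff below):

        /* RI2 displacement is a signed 32-bit halfword count */
        s64 disp = *(s32 *)&p->ainsn.insn[1];
        u64 addr = (u64)(unsigned long)p->addr;           /* original insn */
        u64 new_addr = (u64)(unsigned long)p->ainsn.insn; /* slot copy */
        s64 new_disp = ((addr + (disp * 2)) - new_addr) / 2;

        /* e.g. a larl at 0x1000 with disp 0x100 targets 0x1200; if the
         * slot copy lives at 0x2000 the patched disp becomes
         * (0x1200 - 0x2000) / 2 = -0x700, which still reaches 0x1200. */
        *(s32 *)&p->ainsn.insn[1] = new_disp;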
Signed-off-by: Heiko Carstens Reviewed-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Ingo Molnar Cc: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/include/asm/kprobes.h | 4 +- arch/s390/kernel/kprobes.c | 144 ++++++++++++++++++++++++++++++-- 2 files changed, 140 insertions(+), 8 deletions(-) diff --git a/arch/s390/include/asm/kprobes.h b/arch/s390/include/asm/kprobes.h index dcf6948a875c..4176dfe0fba1 100644 --- a/arch/s390/include/asm/kprobes.h +++ b/arch/s390/include/asm/kprobes.h @@ -31,6 +31,8 @@ #include #include +#define __ARCH_WANT_KPROBES_INSN_SLOT + struct pt_regs; struct kprobe; @@ -57,7 +59,7 @@ typedef u16 kprobe_opcode_t; /* Architecture specific copy of original instruction */ struct arch_specific_insn { /* copy of original instruction */ - kprobe_opcode_t insn[MAX_INSN_SIZE]; + kprobe_opcode_t *insn; }; struct prev_kprobe { diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index adbbe7f1cb0d..0ce9fb245034 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -37,6 +37,26 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); struct kretprobe_blackpoint kretprobe_blacklist[] = { }; +DEFINE_INSN_CACHE_OPS(dmainsn); + +static void *alloc_dmainsn_page(void) +{ + return (void *)__get_free_page(GFP_KERNEL | GFP_DMA); +} + +static void free_dmainsn_page(void *page) +{ + free_page((unsigned long)page); +} + +struct kprobe_insn_cache kprobe_dmainsn_slots = { + .mutex = __MUTEX_INITIALIZER(kprobe_dmainsn_slots.mutex), + .alloc = alloc_dmainsn_page, + .free = free_dmainsn_page, + .pages = LIST_HEAD_INIT(kprobe_dmainsn_slots.pages), + .insn_size = MAX_INSN_SIZE, +}; + static int __kprobes is_prohibited_opcode(kprobe_opcode_t *insn) { switch (insn[0] >> 8) { @@ -100,9 +120,8 @@ static int __kprobes get_fixup_type(kprobe_opcode_t *insn) fixup |= FIXUP_RETURN_REGISTER; break; case 0xc0: - if ((insn[0] & 0x0f) == 0x00 || /* larl */ - (insn[0] & 0x0f) == 0x05) /* brasl */ - fixup |= FIXUP_RETURN_REGISTER; + if ((insn[0] & 0x0f) == 0x05) /* brasl */ + fixup |= FIXUP_RETURN_REGISTER; break; case 0xeb: switch (insn[2] & 0xff) { @@ -134,18 +153,128 @@ static int __kprobes get_fixup_type(kprobe_opcode_t *insn) return fixup; } +static int __kprobes is_insn_relative_long(kprobe_opcode_t *insn) +{ + /* Check if we have a RIL-b or RIL-c format instruction which + * we need to modify in order to avoid instruction emulation. 
*/ + switch (insn[0] >> 8) { + case 0xc0: + if ((insn[0] & 0x0f) == 0x00) /* larl */ + return true; + break; + case 0xc4: + switch (insn[0] & 0x0f) { + case 0x02: /* llhrl */ + case 0x04: /* lghrl */ + case 0x05: /* lhrl */ + case 0x06: /* llghrl */ + case 0x07: /* sthrl */ + case 0x08: /* lgrl */ + case 0x0b: /* stgrl */ + case 0x0c: /* lgfrl */ + case 0x0d: /* lrl */ + case 0x0e: /* llgfrl */ + case 0x0f: /* strl */ + return true; + } + break; + case 0xc6: + switch (insn[0] & 0x0f) { + case 0x00: /* exrl */ + case 0x02: /* pfdrl */ + case 0x04: /* cghrl */ + case 0x05: /* chrl */ + case 0x06: /* clghrl */ + case 0x07: /* clhrl */ + case 0x08: /* cgrl */ + case 0x0a: /* clgrl */ + case 0x0c: /* cgfrl */ + case 0x0d: /* crl */ + case 0x0e: /* clgfrl */ + case 0x0f: /* clrl */ + return true; + } + break; + } + return false; +} + +static void __kprobes copy_instruction(struct kprobe *p) +{ + s64 disp, new_disp; + u64 addr, new_addr; + + memcpy(p->ainsn.insn, p->addr, ((p->opcode >> 14) + 3) & -2); + if (!is_insn_relative_long(p->ainsn.insn)) + return; + /* + * For pc-relative instructions in RIL-b or RIL-c format patch the + * RI2 displacement field. We have already made sure that the insn + * slot for the patched instruction is within the same 2GB area + * as the original instruction (either kernel image or module area). + * Therefore the new displacement will always fit. + */ + disp = *(s32 *)&p->ainsn.insn[1]; + addr = (u64)(unsigned long)p->addr; + new_addr = (u64)(unsigned long)p->ainsn.insn; + new_disp = ((addr + (disp * 2)) - new_addr) / 2; + *(s32 *)&p->ainsn.insn[1] = new_disp; +} + +static inline int is_kernel_addr(void *addr) +{ + return addr < (void *)_end; +} + +static inline int is_module_addr(void *addr) +{ +#ifdef CONFIG_64BIT + BUILD_BUG_ON(MODULES_LEN > (1UL << 31)); + if (addr < (void *)MODULES_VADDR) + return 0; + if (addr > (void *)MODULES_END) + return 0; +#endif + return 1; +} + +static int __kprobes s390_get_insn_slot(struct kprobe *p) +{ + /* + * Get an insn slot that is within the same 2GB area like the original + * instruction. That way instructions with a 32bit signed displacement + * field can be patched and executed within the insn slot. + */ + p->ainsn.insn = NULL; + if (is_kernel_addr(p->addr)) + p->ainsn.insn = get_dmainsn_slot(); + if (is_module_addr(p->addr)) + p->ainsn.insn = get_insn_slot(); + return p->ainsn.insn ? 
0 : -ENOMEM; +} + +static void __kprobes s390_free_insn_slot(struct kprobe *p) +{ + if (!p->ainsn.insn) + return; + if (is_kernel_addr(p->addr)) + free_dmainsn_slot(p->ainsn.insn, 0); + else + free_insn_slot(p->ainsn.insn, 0); + p->ainsn.insn = NULL; +} + int __kprobes arch_prepare_kprobe(struct kprobe *p) { if ((unsigned long) p->addr & 0x01) return -EINVAL; - /* Make sure the probe isn't going on a difficult instruction */ if (is_prohibited_opcode(p->addr)) return -EINVAL; - + if (s390_get_insn_slot(p)) + return -ENOMEM; p->opcode = *p->addr; - memcpy(p->ainsn.insn, p->addr, ((p->opcode >> 14) + 3) & -2); - + copy_instruction(p); return 0; } @@ -186,6 +315,7 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p) void __kprobes arch_remove_kprobe(struct kprobe *p) { + s390_free_insn_slot(p); } static void __kprobes enable_singlestep(struct kprobe_ctlblk *kcb, From ae259925bc8e4880e861b29d166000240dde93de Mon Sep 17 00:00:00 2001 From: Alexander Holler Date: Wed, 11 Sep 2013 14:24:15 -0700 Subject: [PATCH 205/303] drivers/rtc/rtc-hid-sensor-time.c: add module alias to let the module load automatically In order to get the module automatically loaded by hotplug mechanisms a MODULE_DEVICE_TABLE is needed. Therefore add one. This makes it also possible to use a module name other than HID-SENSOR-2000a0 which isn't very descriptive in kernel messages. Signed-off-by: Alexander Holler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-hid-sensor-time.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/rtc/rtc-hid-sensor-time.c b/drivers/rtc/rtc-hid-sensor-time.c index 7273b0139e5c..b5a2874b15ef 100644 --- a/drivers/rtc/rtc-hid-sensor-time.c +++ b/drivers/rtc/rtc-hid-sensor-time.c @@ -23,10 +23,6 @@ #include #include -/* Format: HID-SENSOR-usage_id_in_hex */ -/* Usage ID from spec for Time: 0x2000A0 */ -#define DRIVER_NAME "HID-SENSOR-2000a0" /* must be lowercase */ - enum hid_time_channel { CHANNEL_SCAN_INDEX_YEAR, CHANNEL_SCAN_INDEX_MONTH, @@ -300,9 +296,19 @@ static int hid_time_remove(struct platform_device *pdev) return 0; } +static struct platform_device_id hid_time_ids[] = { + { + /* Format: HID-SENSOR-usage_id_in_hex_lowercase */ + .name = "HID-SENSOR-2000a0", + }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(platform, hid_time_ids); + static struct platform_driver hid_time_platform_driver = { + .id_table = hid_time_ids, .driver = { - .name = DRIVER_NAME, + .name = KBUILD_MODNAME, .owner = THIS_MODULE, }, .probe = hid_time_probe, From 4540bae9cd8c03ec4a26eacad468d2f2f2e85cbb Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Wed, 11 Sep 2013 14:24:16 -0700 Subject: [PATCH 206/303] drivers/rtc/rtc-pcf2127.c: remove empty function The 'remove' function is empty and does not do anything. Delete it. 
Signed-off-by: Sachin Kamat Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-pcf2127.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/rtc/rtc-pcf2127.c b/drivers/rtc/rtc-pcf2127.c index 205b9f7da1b8..1ee514a3972c 100644 --- a/drivers/rtc/rtc-pcf2127.c +++ b/drivers/rtc/rtc-pcf2127.c @@ -203,11 +203,6 @@ static int pcf2127_probe(struct i2c_client *client, return 0; } -static int pcf2127_remove(struct i2c_client *client) -{ - return 0; -} - static const struct i2c_device_id pcf2127_id[] = { { "pcf2127", 0 }, { } @@ -229,7 +224,6 @@ static struct i2c_driver pcf2127_driver = { .of_match_table = of_match_ptr(pcf2127_of_match), }, .probe = pcf2127_probe, - .remove = pcf2127_remove, .id_table = pcf2127_id, }; From 453b4c6db59f7f6411a0b5eb58389a1fa129cc9a Mon Sep 17 00:00:00 2001 From: Jonas Jensen Date: Wed, 11 Sep 2013 14:24:17 -0700 Subject: [PATCH 207/303] rtc: add MOXA ART RTC driver Add RTC driver for MOXA ART SoCs. Signed-off-by: Jonas Jensen Reviewed-by: Mark Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../bindings/rtc/moxa,moxart-rtc.txt | 17 + drivers/rtc/Kconfig | 9 + drivers/rtc/Makefile | 1 + drivers/rtc/rtc-moxart.c | 330 ++++++++++++++++++ 4 files changed, 357 insertions(+) create mode 100644 Documentation/devicetree/bindings/rtc/moxa,moxart-rtc.txt create mode 100644 drivers/rtc/rtc-moxart.c diff --git a/Documentation/devicetree/bindings/rtc/moxa,moxart-rtc.txt b/Documentation/devicetree/bindings/rtc/moxa,moxart-rtc.txt new file mode 100644 index 000000000000..c9d3ac1477fe --- /dev/null +++ b/Documentation/devicetree/bindings/rtc/moxa,moxart-rtc.txt @@ -0,0 +1,17 @@ +MOXA ART real-time clock + +Required properties: + +- compatible : Should be "moxa,moxart-rtc" +- gpio-rtc-sclk : RTC sclk gpio, with zero flags +- gpio-rtc-data : RTC data gpio, with zero flags +- gpio-rtc-reset : RTC reset gpio, with zero flags + +Example: + + rtc: rtc { + compatible = "moxa,moxart-rtc"; + gpio-rtc-sclk = <&gpio 5 0>; + gpio-rtc-data = <&gpio 6 0>; + gpio-rtc-reset = <&gpio 7 0>; + }; diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index 9e3498bf302b..9654aa3c05cb 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -1249,6 +1249,15 @@ config RTC_DRV_SIRFSOC Say "yes" here to support the real time clock on SiRF SOC chips. This driver can also be built as a module called rtc-sirfsoc. +config RTC_DRV_MOXART + tristate "MOXA ART RTC" + help + If you say yes here you get support for the MOXA ART + RTC module. + + This driver can also be built as a module. If so, the module + will be called rtc-moxart + comment "HID Sensor RTC drivers" config RTC_DRV_HID_SENSOR_TIME diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile index d3b4488f48f2..2dff3d2009b5 100644 --- a/drivers/rtc/Makefile +++ b/drivers/rtc/Makefile @@ -130,3 +130,4 @@ obj-$(CONFIG_RTC_DRV_WM831X) += rtc-wm831x.o obj-$(CONFIG_RTC_DRV_WM8350) += rtc-wm8350.o obj-$(CONFIG_RTC_DRV_X1205) += rtc-x1205.o obj-$(CONFIG_RTC_DRV_SIRFSOC) += rtc-sirfsoc.o +obj-$(CONFIG_RTC_DRV_MOXART) += rtc-moxart.o diff --git a/drivers/rtc/rtc-moxart.c b/drivers/rtc/rtc-moxart.c new file mode 100644 index 000000000000..c29dee0946e6 --- /dev/null +++ b/drivers/rtc/rtc-moxart.c @@ -0,0 +1,330 @@ +/* + * MOXA ART RTC driver. + * + * Copyright (C) 2013 Jonas Jensen + * + * Jonas Jensen + * + * Based on code from + * Moxa Technology Co., Ltd. + * + * This file is licensed under the terms of the GNU General Public + * License version 2. 
This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define GPIO_RTC_RESERVED 0x0C +#define GPIO_RTC_DATA_SET 0x10 +#define GPIO_RTC_DATA_CLEAR 0x14 +#define GPIO_RTC_PIN_PULL_ENABLE 0x18 +#define GPIO_RTC_PIN_PULL_TYPE 0x1C +#define GPIO_RTC_INT_ENABLE 0x20 +#define GPIO_RTC_INT_RAW_STATE 0x24 +#define GPIO_RTC_INT_MASKED_STATE 0x28 +#define GPIO_RTC_INT_MASK 0x2C +#define GPIO_RTC_INT_CLEAR 0x30 +#define GPIO_RTC_INT_TRIGGER 0x34 +#define GPIO_RTC_INT_BOTH 0x38 +#define GPIO_RTC_INT_RISE_NEG 0x3C +#define GPIO_RTC_BOUNCE_ENABLE 0x40 +#define GPIO_RTC_BOUNCE_PRE_SCALE 0x44 +#define GPIO_RTC_PROTECT_W 0x8E +#define GPIO_RTC_PROTECT_R 0x8F +#define GPIO_RTC_YEAR_W 0x8C +#define GPIO_RTC_YEAR_R 0x8D +#define GPIO_RTC_DAY_W 0x8A +#define GPIO_RTC_DAY_R 0x8B +#define GPIO_RTC_MONTH_W 0x88 +#define GPIO_RTC_MONTH_R 0x89 +#define GPIO_RTC_DATE_W 0x86 +#define GPIO_RTC_DATE_R 0x87 +#define GPIO_RTC_HOURS_W 0x84 +#define GPIO_RTC_HOURS_R 0x85 +#define GPIO_RTC_MINUTES_W 0x82 +#define GPIO_RTC_MINUTES_R 0x83 +#define GPIO_RTC_SECONDS_W 0x80 +#define GPIO_RTC_SECONDS_R 0x81 +#define GPIO_RTC_DELAY_TIME 8 + +struct moxart_rtc { + struct rtc_device *rtc; + spinlock_t rtc_lock; + int gpio_data, gpio_sclk, gpio_reset; +}; + +static int day_of_year[12] = { 0, 31, 59, 90, 120, 151, 181, + 212, 243, 273, 304, 334 }; + +static void moxart_rtc_write_byte(struct device *dev, u8 data) +{ + struct moxart_rtc *moxart_rtc = dev_get_drvdata(dev); + int i; + + for (i = 0; i < 8; i++, data >>= 1) { + gpio_set_value(moxart_rtc->gpio_sclk, 0); + gpio_set_value(moxart_rtc->gpio_data, ((data & 1) == 1)); + udelay(GPIO_RTC_DELAY_TIME); + gpio_set_value(moxart_rtc->gpio_sclk, 1); + udelay(GPIO_RTC_DELAY_TIME); + } +} + +static u8 moxart_rtc_read_byte(struct device *dev) +{ + struct moxart_rtc *moxart_rtc = dev_get_drvdata(dev); + int i; + u8 data = 0; + + for (i = 0; i < 8; i++) { + gpio_set_value(moxart_rtc->gpio_sclk, 0); + udelay(GPIO_RTC_DELAY_TIME); + gpio_set_value(moxart_rtc->gpio_sclk, 1); + udelay(GPIO_RTC_DELAY_TIME); + if (gpio_get_value(moxart_rtc->gpio_data)) + data |= (1 << i); + udelay(GPIO_RTC_DELAY_TIME); + } + return data; +} + +static u8 moxart_rtc_read_register(struct device *dev, u8 cmd) +{ + struct moxart_rtc *moxart_rtc = dev_get_drvdata(dev); + u8 data; + unsigned long flags; + + local_irq_save(flags); + + gpio_direction_output(moxart_rtc->gpio_data, 0); + gpio_set_value(moxart_rtc->gpio_reset, 1); + udelay(GPIO_RTC_DELAY_TIME); + moxart_rtc_write_byte(dev, cmd); + gpio_direction_input(moxart_rtc->gpio_data); + udelay(GPIO_RTC_DELAY_TIME); + data = moxart_rtc_read_byte(dev); + gpio_set_value(moxart_rtc->gpio_sclk, 0); + gpio_set_value(moxart_rtc->gpio_reset, 0); + udelay(GPIO_RTC_DELAY_TIME); + + local_irq_restore(flags); + + return data; +} + +static void moxart_rtc_write_register(struct device *dev, u8 cmd, u8 data) +{ + struct moxart_rtc *moxart_rtc = dev_get_drvdata(dev); + unsigned long flags; + + local_irq_save(flags); + + gpio_direction_output(moxart_rtc->gpio_data, 0); + gpio_set_value(moxart_rtc->gpio_reset, 1); + udelay(GPIO_RTC_DELAY_TIME); + moxart_rtc_write_byte(dev, cmd); + moxart_rtc_write_byte(dev, data); + gpio_set_value(moxart_rtc->gpio_sclk, 0); + gpio_set_value(moxart_rtc->gpio_reset, 0); + udelay(GPIO_RTC_DELAY_TIME); + + local_irq_restore(flags); +} + +static int moxart_rtc_set_time(struct device *dev, struct rtc_time *tm) +{ + struct 
moxart_rtc *moxart_rtc = dev_get_drvdata(dev); + + spin_lock_irq(&moxart_rtc->rtc_lock); + + moxart_rtc_write_register(dev, GPIO_RTC_PROTECT_W, 0); + moxart_rtc_write_register(dev, GPIO_RTC_YEAR_W, + (((tm->tm_year - 100) / 10) << 4) | + ((tm->tm_year - 100) % 10)); + + moxart_rtc_write_register(dev, GPIO_RTC_MONTH_W, + (((tm->tm_mon + 1) / 10) << 4) | + ((tm->tm_mon + 1) % 10)); + + moxart_rtc_write_register(dev, GPIO_RTC_DATE_W, + ((tm->tm_mday / 10) << 4) | + (tm->tm_mday % 10)); + + moxart_rtc_write_register(dev, GPIO_RTC_HOURS_W, + ((tm->tm_hour / 10) << 4) | + (tm->tm_hour % 10)); + + moxart_rtc_write_register(dev, GPIO_RTC_MINUTES_W, + ((tm->tm_min / 10) << 4) | + (tm->tm_min % 10)); + + moxart_rtc_write_register(dev, GPIO_RTC_SECONDS_W, + ((tm->tm_sec / 10) << 4) | + (tm->tm_sec % 10)); + + moxart_rtc_write_register(dev, GPIO_RTC_PROTECT_W, 0x80); + + spin_unlock_irq(&moxart_rtc->rtc_lock); + + dev_dbg(dev, "%s: success tm_year=%d tm_mon=%d\n" + "tm_mday=%d tm_hour=%d tm_min=%d tm_sec=%d\n", + __func__, tm->tm_year, tm->tm_mon, tm->tm_mday, + tm->tm_hour, tm->tm_min, tm->tm_sec); + + return 0; +} + +static int moxart_rtc_read_time(struct device *dev, struct rtc_time *tm) +{ + struct moxart_rtc *moxart_rtc = dev_get_drvdata(dev); + unsigned char v; + + spin_lock_irq(&moxart_rtc->rtc_lock); + + v = moxart_rtc_read_register(dev, GPIO_RTC_SECONDS_R); + tm->tm_sec = (((v & 0x70) >> 4) * 10) + (v & 0x0F); + + v = moxart_rtc_read_register(dev, GPIO_RTC_MINUTES_R); + tm->tm_min = (((v & 0x70) >> 4) * 10) + (v & 0x0F); + + v = moxart_rtc_read_register(dev, GPIO_RTC_HOURS_R); + if (v & 0x80) { /* 12-hour mode */ + tm->tm_hour = (((v & 0x10) >> 4) * 10) + (v & 0x0F); + if (v & 0x20) { /* PM mode */ + tm->tm_hour += 12; + if (tm->tm_hour >= 24) + tm->tm_hour = 0; + } + } else { /* 24-hour mode */ + tm->tm_hour = (((v & 0x30) >> 4) * 10) + (v & 0x0F); + } + + v = moxart_rtc_read_register(dev, GPIO_RTC_DATE_R); + tm->tm_mday = (((v & 0x30) >> 4) * 10) + (v & 0x0F); + + v = moxart_rtc_read_register(dev, GPIO_RTC_MONTH_R); + tm->tm_mon = (((v & 0x10) >> 4) * 10) + (v & 0x0F); + tm->tm_mon--; + + v = moxart_rtc_read_register(dev, GPIO_RTC_YEAR_R); + tm->tm_year = (((v & 0xF0) >> 4) * 10) + (v & 0x0F); + tm->tm_year += 100; + if (tm->tm_year <= 69) + tm->tm_year += 100; + + v = moxart_rtc_read_register(dev, GPIO_RTC_DAY_R); + tm->tm_wday = (v & 0x0f) - 1; + tm->tm_yday = day_of_year[tm->tm_mon]; + tm->tm_yday += (tm->tm_mday - 1); + if (tm->tm_mon >= 2) { + if (!(tm->tm_year % 4) && (tm->tm_year % 100)) + tm->tm_yday++; + } + + tm->tm_isdst = 0; + + spin_unlock_irq(&moxart_rtc->rtc_lock); + + return 0; +} + +static const struct rtc_class_ops moxart_rtc_ops = { + .read_time = moxart_rtc_read_time, + .set_time = moxart_rtc_set_time, +}; + +static int moxart_rtc_probe(struct platform_device *pdev) +{ + struct moxart_rtc *moxart_rtc; + int ret = 0; + + moxart_rtc = devm_kzalloc(&pdev->dev, sizeof(*moxart_rtc), GFP_KERNEL); + if (!moxart_rtc) { + dev_err(&pdev->dev, "devm_kzalloc failed\n"); + return -ENOMEM; + } + + moxart_rtc->gpio_data = of_get_named_gpio(pdev->dev.of_node, + "gpio-rtc-data", 0); + if (!gpio_is_valid(moxart_rtc->gpio_data)) { + dev_err(&pdev->dev, "invalid gpio (data): %d\n", + moxart_rtc->gpio_data); + return moxart_rtc->gpio_data; + } + + moxart_rtc->gpio_sclk = of_get_named_gpio(pdev->dev.of_node, + "gpio-rtc-sclk", 0); + if (!gpio_is_valid(moxart_rtc->gpio_sclk)) { + dev_err(&pdev->dev, "invalid gpio (sclk): %d\n", + moxart_rtc->gpio_sclk); + return moxart_rtc->gpio_sclk; + } + + 
moxart_rtc->gpio_reset = of_get_named_gpio(pdev->dev.of_node, + "gpio-rtc-reset", 0); + if (!gpio_is_valid(moxart_rtc->gpio_reset)) { + dev_err(&pdev->dev, "invalid gpio (reset): %d\n", + moxart_rtc->gpio_reset); + return moxart_rtc->gpio_reset; + } + + spin_lock_init(&moxart_rtc->rtc_lock); + platform_set_drvdata(pdev, moxart_rtc); + + ret = devm_gpio_request(&pdev->dev, moxart_rtc->gpio_data, "rtc_data"); + if (ret) { + dev_err(&pdev->dev, "can't get rtc_data gpio\n"); + return ret; + } + + ret = devm_gpio_request_one(&pdev->dev, moxart_rtc->gpio_sclk, + GPIOF_DIR_OUT, "rtc_sclk"); + if (ret) { + dev_err(&pdev->dev, "can't get rtc_sclk gpio\n"); + return ret; + } + + ret = devm_gpio_request_one(&pdev->dev, moxart_rtc->gpio_reset, + GPIOF_DIR_OUT, "rtc_reset"); + if (ret) { + dev_err(&pdev->dev, "can't get rtc_reset gpio\n"); + return ret; + } + + moxart_rtc->rtc = devm_rtc_device_register(&pdev->dev, pdev->name, + &moxart_rtc_ops, + THIS_MODULE); + if (IS_ERR(moxart_rtc->rtc)) { + dev_err(&pdev->dev, "devm_rtc_device_register failed\n"); + return PTR_ERR(moxart_rtc->rtc); + } + + return 0; +} + +static const struct of_device_id moxart_rtc_match[] = { + { .compatible = "moxa,moxart-rtc" }, + { }, +}; + +static struct platform_driver moxart_rtc_driver = { + .probe = moxart_rtc_probe, + .driver = { + .name = "moxart-rtc", + .owner = THIS_MODULE, + .of_match_table = moxart_rtc_match, + }, +}; +module_platform_driver(moxart_rtc_driver); + +MODULE_DESCRIPTION("MOXART RTC driver"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jonas Jensen "); From 8af750e3f5ca21eaa5595a96a4cf5eaa996deed4 Mon Sep 17 00:00:00 2001 From: Hebbar Gururaja Date: Wed, 11 Sep 2013 14:24:18 -0700 Subject: [PATCH 208/303] drivers/rtc/rtc-omap.c: add rtc wakeup support to alarm events On some platforms (like AM33xx), a special register (RTC_IRQWAKEEN) is available to enable Alarm Wakeup feature. This register needs to be properly handled for the rtcwake to work properly. Platforms using such IP should set "ti,am3352-rtc" in rtc device dt compatibility node. Signed-off-by: Hebbar Gururaja Acked-by: Kevin Hilman Acked-by: Sekhar Nori Cc: Grant Likely Cc: Rob Herring Cc: Rob Landley Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../devicetree/bindings/rtc/rtc-omap.txt | 6 +- drivers/rtc/rtc-omap.c | 60 ++++++++++++++++--- 2 files changed, 57 insertions(+), 9 deletions(-) diff --git a/Documentation/devicetree/bindings/rtc/rtc-omap.txt b/Documentation/devicetree/bindings/rtc/rtc-omap.txt index b47aa415c820..5a0f02d34d95 100644 --- a/Documentation/devicetree/bindings/rtc/rtc-omap.txt +++ b/Documentation/devicetree/bindings/rtc/rtc-omap.txt @@ -1,7 +1,11 @@ TI Real Time Clock Required properties: -- compatible: "ti,da830-rtc" +- compatible: + - "ti,da830-rtc" - for RTC IP used similar to that on DA8xx SoC family. + - "ti,am3352-rtc" - for RTC IP used similar to that on AM335x SoC family. + This RTC IP has special WAKE-EN Register to enable + Wakeup generation for event Alarm. 
- reg: Address range of rtc register set - interrupts: rtc timer, alarm interrupts in order - interrupt-parent: phandle for the interrupt controller diff --git a/drivers/rtc/rtc-omap.c b/drivers/rtc/rtc-omap.c index c6ffbaec32a4..c7d97ee59327 100644 --- a/drivers/rtc/rtc-omap.c +++ b/drivers/rtc/rtc-omap.c @@ -70,6 +70,8 @@ #define OMAP_RTC_KICK0_REG 0x6c #define OMAP_RTC_KICK1_REG 0x70 +#define OMAP_RTC_IRQWAKEEN 0x7c + /* OMAP_RTC_CTRL_REG bit fields: */ #define OMAP_RTC_CTRL_SPLIT (1<<7) #define OMAP_RTC_CTRL_DISABLE (1<<6) @@ -94,12 +96,21 @@ #define OMAP_RTC_INTERRUPTS_IT_ALARM (1<<3) #define OMAP_RTC_INTERRUPTS_IT_TIMER (1<<2) +/* OMAP_RTC_IRQWAKEEN bit fields: */ +#define OMAP_RTC_IRQWAKEEN_ALARM_WAKEEN (1<<1) + /* OMAP_RTC_KICKER values */ #define KICK0_VALUE 0x83e70b13 #define KICK1_VALUE 0x95a4f1e0 #define OMAP_RTC_HAS_KICKER 0x1 +/* + * Few RTC IP revisions has special WAKE-EN Register to enable Wakeup + * generation for event Alarm. + */ +#define OMAP_RTC_HAS_IRQWAKEEN 0x2 + static void __iomem *rtc_base; #define rtc_read(addr) readb(rtc_base + (addr)) @@ -299,12 +310,18 @@ static struct rtc_class_ops omap_rtc_ops = { static int omap_rtc_alarm; static int omap_rtc_timer; -#define OMAP_RTC_DATA_DA830_IDX 1 +#define OMAP_RTC_DATA_AM3352_IDX 1 +#define OMAP_RTC_DATA_DA830_IDX 2 static struct platform_device_id omap_rtc_devtype[] = { { .name = DRIVER_NAME, - }, { + }, + [OMAP_RTC_DATA_AM3352_IDX] = { + .name = "am3352-rtc", + .driver_data = OMAP_RTC_HAS_KICKER | OMAP_RTC_HAS_IRQWAKEEN, + }, + [OMAP_RTC_DATA_DA830_IDX] = { .name = "da830-rtc", .driver_data = OMAP_RTC_HAS_KICKER, }, @@ -316,6 +333,9 @@ static const struct of_device_id omap_rtc_of_match[] = { { .compatible = "ti,da830-rtc", .data = &omap_rtc_devtype[OMAP_RTC_DATA_DA830_IDX], }, + { .compatible = "ti,am3352-rtc", + .data = &omap_rtc_devtype[OMAP_RTC_DATA_AM3352_IDX], + }, {}, }; MODULE_DEVICE_TABLE(of, omap_rtc_of_match); @@ -464,16 +484,28 @@ static u8 irqstat; static int omap_rtc_suspend(struct device *dev) { + u8 irqwake_stat; + struct platform_device *pdev = to_platform_device(dev); + const struct platform_device_id *id_entry = + platform_get_device_id(pdev); + irqstat = rtc_read(OMAP_RTC_INTERRUPTS_REG); /* FIXME the RTC alarm is not currently acting as a wakeup event - * source, and in fact this enable() call is just saving a flag - * that's never used... + * source on some platforms, and in fact this enable() call is just + * saving a flag that's never used... 
*/ - if (device_may_wakeup(dev)) + if (device_may_wakeup(dev)) { enable_irq_wake(omap_rtc_alarm); - else + + if (id_entry->driver_data & OMAP_RTC_HAS_IRQWAKEEN) { + irqwake_stat = rtc_read(OMAP_RTC_IRQWAKEEN); + irqwake_stat |= OMAP_RTC_IRQWAKEEN_ALARM_WAKEEN; + rtc_write(irqwake_stat, OMAP_RTC_IRQWAKEEN); + } + } else { rtc_write(0, OMAP_RTC_INTERRUPTS_REG); + } /* Disable the clock/module */ pm_runtime_put_sync(dev); @@ -483,13 +515,25 @@ static int omap_rtc_suspend(struct device *dev) static int omap_rtc_resume(struct device *dev) { + u8 irqwake_stat; + struct platform_device *pdev = to_platform_device(dev); + const struct platform_device_id *id_entry = + platform_get_device_id(pdev); + /* Enable the clock/module so that we can access the registers */ pm_runtime_get_sync(dev); - if (device_may_wakeup(dev)) + if (device_may_wakeup(dev)) { disable_irq_wake(omap_rtc_alarm); - else + + if (id_entry->driver_data & OMAP_RTC_HAS_IRQWAKEEN) { + irqwake_stat = rtc_read(OMAP_RTC_IRQWAKEEN); + irqwake_stat &= ~OMAP_RTC_IRQWAKEEN_ALARM_WAKEEN; + rtc_write(irqwake_stat, OMAP_RTC_IRQWAKEEN); + } + } else { rtc_write(irqstat, OMAP_RTC_INTERRUPTS_REG); + } return 0; } #endif From 666a584d3a765a914642f80deef7a33fb309df5d Mon Sep 17 00:00:00 2001 From: Laxman Dewangan Date: Wed, 11 Sep 2013 14:24:19 -0700 Subject: [PATCH 209/303] drivers/rtc/rtc-palmas.c: support for backup battery charging Palmas series device like TPS65913, TPS80036 supports the backup battery for powering the RTC when no other energy source is available. The backup battery is optional, connected to the VBACKUP pin, and can be nonrechargeable or rechargeable. The rechargeable battery can be charged from the system supply using the backup battery charger. Add support for enabling charging of this backup battery. Also add the DT binding document and the new properties to have this support. Signed-off-by: Laxman Dewangan Reviewed-by: Felipe Balbi Acked-by: Kumar Gala Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../devicetree/bindings/rtc/rtc-palmas.txt | 33 +++++++++++++++++ drivers/rtc/rtc-palmas.c | 35 +++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 Documentation/devicetree/bindings/rtc/rtc-palmas.txt diff --git a/Documentation/devicetree/bindings/rtc/rtc-palmas.txt b/Documentation/devicetree/bindings/rtc/rtc-palmas.txt new file mode 100644 index 000000000000..adbccc0a51e1 --- /dev/null +++ b/Documentation/devicetree/bindings/rtc/rtc-palmas.txt @@ -0,0 +1,33 @@ +Palmas RTC controller bindings + +Required properties: +- compatible: + - "ti,palmas-rtc" for palma series of the RTC controller +- interrupt-parent: Parent interrupt device, must be handle of palmas node. +- interrupts: Interrupt number of RTC submodule on device. + +Optional properties: + +- ti,backup-battery-chargeable: The Palmas series device like TPS65913 or + TPS80036 supports the backup battery for powering the RTC when main + battery is removed or in very low power state. The backup battery + can be chargeable or non-chargeable. This flag will tells whether + battery is chargeable or not. If charging battery then driver can + enable the charging. +- ti,backup-battery-charge-high-current: Enable high current charging in + backup battery. Device supports the < 100mA and > 100mA charging. + The high current will be > 100mA. Absence of this property will + charge battery to lower current i.e. < 100mA. + +Example: + palmas: tps65913@58 { + ... 
+ palmas_rtc: rtc { + compatible = "ti,palmas-rtc"; + interrupt-parent = <&palmas>; + interrupts = <8 0>; + ti,backup-battery-chargeable; + ti,backup-battery-charge-high-current; + }; + ... + }; diff --git a/drivers/rtc/rtc-palmas.c b/drivers/rtc/rtc-palmas.c index a1fecc8d97fc..fffb7d3449d7 100644 --- a/drivers/rtc/rtc-palmas.c +++ b/drivers/rtc/rtc-palmas.c @@ -238,6 +238,15 @@ static int palmas_rtc_probe(struct platform_device *pdev) struct palmas *palmas = dev_get_drvdata(pdev->dev.parent); struct palmas_rtc *palmas_rtc = NULL; int ret; + bool enable_bb_charging = false; + bool high_bb_charging; + + if (pdev->dev.of_node) { + enable_bb_charging = of_property_read_bool(pdev->dev.of_node, + "ti,backup-battery-chargeable"); + high_bb_charging = of_property_read_bool(pdev->dev.of_node, + "ti,backup-battery-charge-high-current"); + } palmas_rtc = devm_kzalloc(&pdev->dev, sizeof(struct palmas_rtc), GFP_KERNEL); @@ -254,6 +263,32 @@ static int palmas_rtc_probe(struct platform_device *pdev) palmas_rtc->dev = &pdev->dev; platform_set_drvdata(pdev, palmas_rtc); + if (enable_bb_charging) { + unsigned reg = PALMAS_BACKUP_BATTERY_CTRL_BBS_BBC_LOW_ICHRG; + + if (high_bb_charging) + reg = 0; + + ret = palmas_update_bits(palmas, PALMAS_PMU_CONTROL_BASE, + PALMAS_BACKUP_BATTERY_CTRL, + PALMAS_BACKUP_BATTERY_CTRL_BBS_BBC_LOW_ICHRG, reg); + if (ret < 0) { + dev_err(&pdev->dev, + "BACKUP_BATTERY_CTRL update failed, %d\n", ret); + return ret; + } + + ret = palmas_update_bits(palmas, PALMAS_PMU_CONTROL_BASE, + PALMAS_BACKUP_BATTERY_CTRL, + PALMAS_BACKUP_BATTERY_CTRL_BB_CHG_EN, + PALMAS_BACKUP_BATTERY_CTRL_BB_CHG_EN); + if (ret < 0) { + dev_err(&pdev->dev, + "BACKUP_BATTERY_CTRL update failed, %d\n", ret); + return ret; + } + } + /* Start RTC */ ret = palmas_update_bits(palmas, PALMAS_RTC_BASE, PALMAS_RTC_CTRL_REG, PALMAS_RTC_CTRL_REG_STOP_RTC, From 7707bda3ee4aa4415090755c22e5a9c03fc530dc Mon Sep 17 00:00:00 2001 From: Alexander Holler Date: Wed, 11 Sep 2013 14:24:20 -0700 Subject: [PATCH 210/303] drivers/rtc/rtc-hid-sensor-time.c: improve error handling when rtc register fails Stop processing hid input when registering the RTC fails and handle a NULL returned from devm_rtc_device_register() as a failure too. Signed-off-by: Alexander Holler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-hid-sensor-time.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/rtc/rtc-hid-sensor-time.c b/drivers/rtc/rtc-hid-sensor-time.c index b5a2874b15ef..4e2a81854f51 100644 --- a/drivers/rtc/rtc-hid-sensor-time.c +++ b/drivers/rtc/rtc-hid-sensor-time.c @@ -279,9 +279,11 @@ static int hid_time_probe(struct platform_device *pdev) "hid-sensor-time", &hid_time_rtc_ops, THIS_MODULE); - if (IS_ERR(time_state->rtc)) { + if (IS_ERR_OR_NULL(time_state->rtc)) { + ret = time_state->rtc ? PTR_ERR(time_state->rtc) : -ENODEV; + time_state->rtc = NULL; + sensor_hub_remove_callback(hsdev, HID_USAGE_SENSOR_TIME); dev_err(&pdev->dev, "rtc device register failed!\n"); - return PTR_ERR(time_state->rtc); } return ret; From 1748cbf7f7c464593232cde914f5a103181a83b5 Mon Sep 17 00:00:00 2001 From: Sangjung Woo Date: Wed, 11 Sep 2013 14:24:21 -0700 Subject: [PATCH 211/303] drivers/rtc/rtc-max77686.c: Fix wrong register Fix a read of the wrong register when checking whether the RTC timer has reached the alarm time. 
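After the fix, the pending check reads the register that actually latches the alarm event; in outline (a sketch; the exact alarm bit position in STATUS2 is an assumption for illustration):

        ret = regmap_read(info->max77686->regmap, MAX77686_REG_STATUS2, &val);
        if (ret < 0)
                goto out;
        if (val & (1 << 4))     /* assumed RTCA1: alarm 1 asserted */
                alrm->pending = 1;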
Signed-off-by: Sangjung Woo Signed-off-by: Myugnjoo Ham Reviewed-by: Jonghwa Lee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-max77686.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/rtc/rtc-max77686.c b/drivers/rtc/rtc-max77686.c index 9915cb96014b..9efe118a28ba 100644 --- a/drivers/rtc/rtc-max77686.c +++ b/drivers/rtc/rtc-max77686.c @@ -240,9 +240,9 @@ static int max77686_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) } alrm->pending = 0; - ret = regmap_read(info->max77686->regmap, MAX77686_REG_STATUS1, &val); + ret = regmap_read(info->max77686->regmap, MAX77686_REG_STATUS2, &val); if (ret < 0) { - dev_err(info->dev, "%s:%d fail to read status1 reg(%d)\n", + dev_err(info->dev, "%s:%d fail to read status2 reg(%d)\n", __func__, __LINE__, ret); goto out; } From 0ebbf4397664d66f3e35503f2f3778a5e6377cbf Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Wed, 11 Sep 2013 14:24:22 -0700 Subject: [PATCH 212/303] drivers/rtc/rtc-nuc900.c: use NULL instead of 0 check_rtc_access_enable() returns a pointer, so NULL should be used instead of 0 in order to fix the following sparse warning: drivers/rtc/rtc-nuc900.c:102:16: warning: Using plain integer as NULL pointer Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-nuc900.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-nuc900.c b/drivers/rtc/rtc-nuc900.c index 22861c5e0c59..248653c74b80 100644 --- a/drivers/rtc/rtc-nuc900.c +++ b/drivers/rtc/rtc-nuc900.c @@ -99,7 +99,7 @@ static int *check_rtc_access_enable(struct nuc900_rtc *nuc900_rtc) if (!timeout) return ERR_PTR(-EPERM); - return 0; + return NULL; } static int nuc900_rtc_bcd2bin(unsigned int timereg, From 28984c7d94c27b993d09d4f2a1a2c36bfd26fd23 Mon Sep 17 00:00:00 2001 From: Xianglong Du Date: Wed, 11 Sep 2013 14:24:23 -0700 Subject: [PATCH 213/303] drivers/rtc/rtc-sirfsoc.c: fix kernel warning during wakeup enable_irq_wake() might fail; if it does, we will see a kernel warning in the resume path because disable_irq_wake() is always called. WARNING: at kernel/irq/manage.c:529 irq_set_irq_wake+0xc4/0xf0() Unbalanced IRQ 52 wake disable Modules linked in: ipv6 libcomposite configfs CPU: 0 PID: 1591 Comm: ash Tainted: G W 3.10.0-00854-gdbd86d4-dirty #100 (unwind_backtrace+0x0/0xf8) from (show_stack+0x10/0x14) (show_stack+0x10/0x14) from (warn_slowpath_common+0x54/0x68) (warn_slowpath_common+0x54/0x68) from (warn_slowpath_fmt+0x30/0x40) (warn_slowpath_fmt+0x30/0x40) from (irq_set_irq_wake+0xc4/0xf0) (irq_set_irq_wake+0xc4/0xf0) from (sirfsoc_rtc_restore+0x30/0x38) (sirfsoc_rtc_restore+0x30/0x38) from (platform_pm_restore+0x2c/0x50) (platform_pm_restore+0x2c/0x50) from (dpm_run_callback.clone.6+0x30/0xb0) (dpm_run_callback.clone.6+0x30/0xb0) from (device_resume+0x88/0x134) (device_resume+0x88/0x134) from (dpm_resume+0x114/0x230) (dpm_resume+0x114/0x230) from (hibernation_snapshot+0x178/0x1d0) (hibernation_snapshot+0x178/0x1d0) from (hibernate+0x130/0x1dc) (hibernate+0x130/0x1dc) from (state_store+0xb4/0xc0) (state_store+0xb4/0xc0) from (kobj_attr_store+0x14/0x20) (kobj_attr_store+0x14/0x20) from (sysfs_write_file+0xfc/0x17c) (sysfs_write_file+0xfc/0x17c) from (vfs_write+0xc8/0x194) (vfs_write+0xc8/0x194) from (SyS_write+0x40/0x6c) (SyS_write+0x40/0x6c) from (ret_fast_syscall+0x0/0x30) To avoid an unbalanced "IRQ wake disable", ensure that disable_irq_wake() is called only when enable_irq_wake() has succeeded.
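Condensed from the diff below, the pattern is: arm the wake source only if enable_irq_wake() succeeds, remember that fact, and disarm only what was actually armed:

        /* suspend path */
        if (device_may_wakeup(&pdev->dev) && !enable_irq_wake(rtcdrv->irq))
                rtcdrv->irq_wake = 1;

        /* resume/restore paths */
        if (device_may_wakeup(&pdev->dev) && rtcdrv->irq_wake) {
                disable_irq_wake(rtcdrv->irq);
                rtcdrv->irq_wake = 0;
        }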
Signed-off-by: Xianglong Du Signed-off-by: Barry Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-sirfsoc.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/rtc/rtc-sirfsoc.c b/drivers/rtc/rtc-sirfsoc.c index aa7ed4b5f7f0..63460cf80f1b 100644 --- a/drivers/rtc/rtc-sirfsoc.c +++ b/drivers/rtc/rtc-sirfsoc.c @@ -44,6 +44,7 @@ struct sirfsoc_rtc_drv { struct rtc_device *rtc; u32 rtc_base; u32 irq; + unsigned irq_wake; /* Overflow for every 8 years extra time */ u32 overflow_rtc; #ifdef CONFIG_PM @@ -355,8 +356,8 @@ static int sirfsoc_rtc_suspend(struct device *dev) rtcdrv->saved_counter = sirfsoc_rtc_iobrg_readl(rtcdrv->rtc_base + RTC_CN); rtcdrv->saved_overflow_rtc = rtcdrv->overflow_rtc; - if (device_may_wakeup(&pdev->dev)) - enable_irq_wake(rtcdrv->irq); + if (device_may_wakeup(&pdev->dev) && !enable_irq_wake(rtcdrv->irq)) + rtcdrv->irq_wake = 1; return 0; } @@ -423,8 +424,10 @@ static int sirfsoc_rtc_resume(struct device *dev) struct platform_device *pdev = to_platform_device(dev); struct sirfsoc_rtc_drv *rtcdrv = platform_get_drvdata(pdev); sirfsoc_rtc_thaw(dev); - if (device_may_wakeup(&pdev->dev)) + if (device_may_wakeup(&pdev->dev) && rtcdrv->irq_wake) { disable_irq_wake(rtcdrv->irq); + rtcdrv->irq_wake = 0; + } return 0; } @@ -434,8 +437,10 @@ static int sirfsoc_rtc_restore(struct device *dev) struct platform_device *pdev = to_platform_device(dev); struct sirfsoc_rtc_drv *rtcdrv = platform_get_drvdata(pdev); - if (device_may_wakeup(&pdev->dev)) + if (device_may_wakeup(&pdev->dev) && rtcdrv->irq_wake) { disable_irq_wake(rtcdrv->irq); + rtcdrv->irq_wake = 0; + } return 0; } From 25e2818e385cf3e3198599307f08e044a7c1be97 Mon Sep 17 00:00:00 2001 From: Alexander Shiyan Date: Wed, 11 Sep 2013 14:24:24 -0700 Subject: [PATCH 214/303] drivers/rtc/rtc-ds1742.c: use devm_ioremap_resource() Replace devm_request_mem_region() and devm_ioremap() with devm_ioremap_resource(). 
Signed-off-by: Alexander Shiyan Cc: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-ds1742.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/drivers/rtc/rtc-ds1742.c b/drivers/rtc/rtc-ds1742.c index eccdc62ae1c0..9fba0ae2e768 100644 --- a/drivers/rtc/rtc-ds1742.c +++ b/drivers/rtc/rtc-ds1742.c @@ -56,7 +56,6 @@ struct rtc_plat_data { void __iomem *ioaddr_nvram; void __iomem *ioaddr_rtc; size_t size_nvram; - size_t size; unsigned long last_jiffies; struct bin_attribute nvram_attr; }; @@ -168,22 +167,17 @@ static int ds1742_rtc_probe(struct platform_device *pdev) void __iomem *ioaddr; int ret = 0; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENODEV; pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); if (!pdata) return -ENOMEM; - pdata->size = resource_size(res); - if (!devm_request_mem_region(&pdev->dev, res->start, pdata->size, - pdev->name)) - return -EBUSY; - ioaddr = devm_ioremap(&pdev->dev, res->start, pdata->size); - if (!ioaddr) - return -ENOMEM; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + ioaddr = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(ioaddr)) + return PTR_ERR(ioaddr); pdata->ioaddr_nvram = ioaddr; - pdata->size_nvram = pdata->size - RTC_SIZE; + pdata->size_nvram = resource_size(res) - RTC_SIZE; pdata->ioaddr_rtc = ioaddr + pdata->size_nvram; sysfs_bin_attr_init(&pdata->nvram_attr); From 2cbc21877adf6adc69ca1fbc5fd1edbd7e1e3c59 Mon Sep 17 00:00:00 2001 From: Alexander Shiyan Date: Wed, 11 Sep 2013 14:24:25 -0700 Subject: [PATCH 215/303] drivers/rtc/rtc-ds1742.c: remove unused field "rtc" from private structure Private field "rtc" is not used outside "probe", so there is no reason to keep it. Signed-off-by: Alexander Shiyan Cc: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-ds1742.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/rtc/rtc-ds1742.c b/drivers/rtc/rtc-ds1742.c index 9fba0ae2e768..139934ff14fc 100644 --- a/drivers/rtc/rtc-ds1742.c +++ b/drivers/rtc/rtc-ds1742.c @@ -52,7 +52,6 @@ #define RTC_BATT_FLAG 0x80 struct rtc_plat_data { - struct rtc_device *rtc; void __iomem *ioaddr_nvram; void __iomem *ioaddr_rtc; size_t size_nvram; @@ -206,7 +205,6 @@ static int ds1742_rtc_probe(struct platform_device *pdev) &ds1742_rtc_ops, THIS_MODULE); if (IS_ERR(rtc)) return PTR_ERR(rtc); - pdata->rtc = rtc; ret = sysfs_create_bin_file(&pdev->dev.kobj, &pdata->nvram_attr); From 1735be4b822e8e3808f461372ff3942824790172 Mon Sep 17 00:00:00 2001 From: Alexander Shiyan Date: Wed, 11 Sep 2013 14:24:26 -0700 Subject: [PATCH 216/303] drivers/rtc/rtc-ds1742.c: report to RTC core if retrieved time is invalid Let RTC core decide if the retrieved time is invalid, instead of processing errors in the driver. 
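With this change the read_time callback simply ends with (sketch):

	/* 0 if the date/time is sane, -EINVAL otherwise; the core handles it */
	return rtc_valid_tm(tm);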
Signed-off-by: Alexander Shiyan Cc: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-ds1742.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/rtc/rtc-ds1742.c b/drivers/rtc/rtc-ds1742.c index 139934ff14fc..17b73fdc3b6e 100644 --- a/drivers/rtc/rtc-ds1742.c +++ b/drivers/rtc/rtc-ds1742.c @@ -115,11 +115,7 @@ static int ds1742_rtc_read_time(struct device *dev, struct rtc_time *tm) /* year is 1900 + tm->tm_year */ tm->tm_year = bcd2bin(year) + bcd2bin(century) * 100 - 1900; - if (rtc_valid_tm(tm) < 0) { - dev_err(dev, "retrieved date/time is not valid.\n"); - rtc_time_to_tm(0, tm); - } - return 0; + return rtc_valid_tm(tm); } static const struct rtc_class_ops ds1742_rtc_ops = { From 7c1d69ee11b8986c40a53d8e2238204fc86f5b33 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Wed, 11 Sep 2013 14:24:27 -0700 Subject: [PATCH 217/303] rtc: simplify devm_request_mem_region/devm_ioremap Convert the composition of devm_request_mem_region and devm_ioremap to a single call to devm_ioremap_resource. The associated call to platform_get_resource is also simplified and moved next to the new call to devm_ioremap_resource. This was done using a combination of the semantic patches devm_ioremap_resource.cocci and devm_request_and_ioremap.cocci, found in the scripts/coccinelle/api directory. In rtc-lpc32xx.c and rtc-mv.c, the local variable size is no longer needed. In rtc-ds1511.c the size field of the local structure is not useful any more, and is deleted. Signed-off-by: Julia Lawall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-ds1511.c | 17 +++++------------ drivers/rtc/rtc-ds1553.c | 13 ++++--------- drivers/rtc/rtc-ep93xx.c | 14 +++----------- drivers/rtc/rtc-imxdi.c | 16 ++++------------ drivers/rtc/rtc-lpc32xx.c | 24 ++++-------------------- drivers/rtc/rtc-mv.c | 17 ++++------------- drivers/rtc/rtc-mxc.c | 14 ++++---------- drivers/rtc/rtc-stk17ta8.c | 15 +++++---------- drivers/rtc/rtc-tx4939.c | 14 ++++---------- 9 files changed, 37 insertions(+), 107 deletions(-) diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index 308a8fefe76f..bc7b4fcf603c 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -89,7 +89,6 @@ enum ds1511reg { struct rtc_plat_data { struct rtc_device *rtc; void __iomem *ioaddr; /* virtual base address */ - int size; /* amount of memory mapped */ int irq; unsigned int irqen; int alrm_sec; @@ -479,20 +478,14 @@ static int ds1511_rtc_probe(struct platform_device *pdev) struct rtc_plat_data *pdata; int ret = 0; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENODEV; - pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); if (!pdata) return -ENOMEM; - pdata->size = resource_size(res); - if (!devm_request_mem_region(&pdev->dev, res->start, pdata->size, - pdev->name)) - return -EBUSY; - ds1511_base = devm_ioremap(&pdev->dev, res->start, pdata->size); - if (!ds1511_base) - return -ENOMEM; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + ds1511_base = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(ds1511_base)) + return PTR_ERR(ds1511_base); pdata->ioaddr = ds1511_base; pdata->irq = platform_get_irq(pdev, 0); diff --git a/drivers/rtc/rtc-ds1553.c b/drivers/rtc/rtc-ds1553.c index 8c6c952e90b1..fd31571941f5 100644 --- a/drivers/rtc/rtc-ds1553.c +++ b/drivers/rtc/rtc-ds1553.c @@ -285,19 +285,14 @@ static int ds1553_rtc_probe(struct platform_device *pdev) void __iomem *ioaddr; int ret = 0; - res = 
platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENODEV; pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); if (!pdata) return -ENOMEM; - if (!devm_request_mem_region(&pdev->dev, res->start, RTC_REG_SIZE, - pdev->name)) - return -EBUSY; - ioaddr = devm_ioremap(&pdev->dev, res->start, RTC_REG_SIZE); - if (!ioaddr) - return -ENOMEM; + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + ioaddr = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(ioaddr)) + return PTR_ERR(ioaddr); pdata->ioaddr = ioaddr; pdata->irq = platform_get_irq(pdev, 0); diff --git a/drivers/rtc/rtc-ep93xx.c b/drivers/rtc/rtc-ep93xx.c index 549b3c3792d2..580e7b56bde8 100644 --- a/drivers/rtc/rtc-ep93xx.c +++ b/drivers/rtc/rtc-ep93xx.c @@ -138,17 +138,9 @@ static int ep93xx_rtc_probe(struct platform_device *pdev) return -ENOMEM; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENXIO; - - if (!devm_request_mem_region(&pdev->dev, res->start, - resource_size(res), pdev->name)) - return -EBUSY; - - ep93xx_rtc->mmio_base = devm_ioremap(&pdev->dev, res->start, - resource_size(res)); - if (!ep93xx_rtc->mmio_base) - return -ENXIO; + ep93xx_rtc->mmio_base = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(ep93xx_rtc->mmio_base)) + return PTR_ERR(ep93xx_rtc->mmio_base); pdev->dev.platform_data = ep93xx_rtc; platform_set_drvdata(pdev, ep93xx_rtc); diff --git a/drivers/rtc/rtc-imxdi.c b/drivers/rtc/rtc-imxdi.c index d3a8c8e255de..abd7f9091f34 100644 --- a/drivers/rtc/rtc-imxdi.c +++ b/drivers/rtc/rtc-imxdi.c @@ -375,24 +375,16 @@ static int __init dryice_rtc_probe(struct platform_device *pdev) struct imxdi_dev *imxdi; int rc; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENODEV; - imxdi = devm_kzalloc(&pdev->dev, sizeof(*imxdi), GFP_KERNEL); if (!imxdi) return -ENOMEM; imxdi->pdev = pdev; - if (!devm_request_mem_region(&pdev->dev, res->start, resource_size(res), - pdev->name)) - return -EBUSY; - - imxdi->ioaddr = devm_ioremap(&pdev->dev, res->start, - resource_size(res)); - if (imxdi->ioaddr == NULL) - return -ENOMEM; + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + imxdi->ioaddr = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(imxdi->ioaddr)) + return PTR_ERR(imxdi->ioaddr); spin_lock_init(&imxdi->irq_lock); diff --git a/drivers/rtc/rtc-lpc32xx.c b/drivers/rtc/rtc-lpc32xx.c index 8276ae94a2a9..bfdbcb82d069 100644 --- a/drivers/rtc/rtc-lpc32xx.c +++ b/drivers/rtc/rtc-lpc32xx.c @@ -201,16 +201,9 @@ static int lpc32xx_rtc_probe(struct platform_device *pdev) { struct resource *res; struct lpc32xx_rtc *rtc; - resource_size_t size; int rtcirq; u32 tmp; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) { - dev_err(&pdev->dev, "Can't get memory resource\n"); - return -ENOENT; - } - rtcirq = platform_get_irq(pdev, 0); if (rtcirq < 0 || rtcirq >= NR_IRQS) { dev_warn(&pdev->dev, "Can't get interrupt resource\n"); @@ -224,19 +217,10 @@ static int lpc32xx_rtc_probe(struct platform_device *pdev) } rtc->irq = rtcirq; - size = resource_size(res); - - if (!devm_request_mem_region(&pdev->dev, res->start, size, - pdev->name)) { - dev_err(&pdev->dev, "RTC registers are not free\n"); - return -EBUSY; - } - - rtc->rtc_base = devm_ioremap(&pdev->dev, res->start, size); - if (!rtc->rtc_base) { - dev_err(&pdev->dev, "Can't map memory\n"); - return -ENOMEM; - } + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + rtc->rtc_base = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(rtc->rtc_base)) + return 
PTR_ERR(rtc->rtc_base); spin_lock_init(&rtc->lock); diff --git a/drivers/rtc/rtc-mv.c b/drivers/rtc/rtc-mv.c index baab802f2153..d536c5962c99 100644 --- a/drivers/rtc/rtc-mv.c +++ b/drivers/rtc/rtc-mv.c @@ -221,26 +221,17 @@ static int __init mv_rtc_probe(struct platform_device *pdev) { struct resource *res; struct rtc_plat_data *pdata; - resource_size_t size; u32 rtc_time; int ret = 0; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENODEV; - pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); if (!pdata) return -ENOMEM; - size = resource_size(res); - if (!devm_request_mem_region(&pdev->dev, res->start, size, - pdev->name)) - return -EBUSY; - - pdata->ioaddr = devm_ioremap(&pdev->dev, res->start, size); - if (!pdata->ioaddr) - return -ENOMEM; + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + pdata->ioaddr = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(pdata->ioaddr)) + return PTR_ERR(pdata->ioaddr); pdata->clk = devm_clk_get(&pdev->dev, NULL); /* Not all SoCs require a clock.*/ diff --git a/drivers/rtc/rtc-mxc.c b/drivers/rtc/rtc-mxc.c index ab87bacb8f88..50c572645546 100644 --- a/drivers/rtc/rtc-mxc.c +++ b/drivers/rtc/rtc-mxc.c @@ -377,22 +377,16 @@ static int mxc_rtc_probe(struct platform_device *pdev) unsigned long rate; int ret; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENODEV; - pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); if (!pdata) return -ENOMEM; pdata->devtype = pdev->id_entry->driver_data; - if (!devm_request_mem_region(&pdev->dev, res->start, - resource_size(res), pdev->name)) - return -EBUSY; - - pdata->ioaddr = devm_ioremap(&pdev->dev, res->start, - resource_size(res)); + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + pdata->ioaddr = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(pdata->ioaddr)) + return PTR_ERR(pdata->ioaddr); pdata->clk = devm_clk_get(&pdev->dev, NULL); if (IS_ERR(pdata->clk)) { diff --git a/drivers/rtc/rtc-stk17ta8.c b/drivers/rtc/rtc-stk17ta8.c index af5e97e3f272..a176ba614683 100644 --- a/drivers/rtc/rtc-stk17ta8.c +++ b/drivers/rtc/rtc-stk17ta8.c @@ -294,19 +294,14 @@ static int stk17ta8_rtc_probe(struct platform_device *pdev) void __iomem *ioaddr; int ret = 0; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENODEV; - pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); if (!pdata) return -ENOMEM; - if (!devm_request_mem_region(&pdev->dev, res->start, RTC_REG_SIZE, - pdev->name)) - return -EBUSY; - ioaddr = devm_ioremap(&pdev->dev, res->start, RTC_REG_SIZE); - if (!ioaddr) - return -ENOMEM; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + ioaddr = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(ioaddr)) + return PTR_ERR(ioaddr); pdata->ioaddr = ioaddr; pdata->irq = platform_get_irq(pdev, 0); diff --git a/drivers/rtc/rtc-tx4939.c b/drivers/rtc/rtc-tx4939.c index f9a0677e4e3b..4f87234e0dee 100644 --- a/drivers/rtc/rtc-tx4939.c +++ b/drivers/rtc/rtc-tx4939.c @@ -244,9 +244,6 @@ static int __init tx4939_rtc_probe(struct platform_device *pdev) struct resource *res; int irq, ret; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENODEV; irq = platform_get_irq(pdev, 0); if (irq < 0) return -ENODEV; @@ -255,13 +252,10 @@ static int __init tx4939_rtc_probe(struct platform_device *pdev) return -ENOMEM; platform_set_drvdata(pdev, pdata); - if (!devm_request_mem_region(&pdev->dev, res->start, - resource_size(res), pdev->name)) - return -EBUSY; - pdata->rtcreg 
= devm_ioremap(&pdev->dev, res->start, - resource_size(res)); - if (!pdata->rtcreg) - return -EBUSY; + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + pdata->rtcreg = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(pdata->rtcreg)) + return PTR_ERR(pdata->rtcreg); spin_lock_init(&pdata->lock); tx4939_rtc_cmd(pdata->rtcreg, TX4939_RTCCTL_COMMAND_NOP); From 2c92057e45c2d60f859ca3606cfd402c48785d82 Mon Sep 17 00:00:00 2001 From: Vyacheslav Dubeyko Date: Wed, 11 Sep 2013 14:24:28 -0700 Subject: [PATCH 218/303] hfsplus: add necessary declarations for POSIX ACLs support This patchset implements POSIX ACLs support in hfsplus driver. Mac OS X beginning with version 10.4 ("Tiger") support NFSv4 ACLs, which are part of the NFSv4 standard. HFS+ stores ACLs in the form of specially named extended attributes (com.apple.system.Security). But this patchset doesn't use "com.apple.system.Security" extended attributes. It implements support of POSIX ACLs in the form of extended attributes with names "system.posix_acl_access" and "system.posix_acl_default". These xattrs are treated only under Linux. POSIX ACLs doesn't mean something under Mac OS X. Thereby, this patch set provides opportunity to use POSIX ACLs under Linux on HFS+ filesystem. This patch: Add CONFIG_HFSPLUS_FS_POSIX_ACL kernel configuration option, DBG_ACL_MOD debugging flag and acl.h file with declaration of essential functions for support POSIX ACLs in hfsplus driver. Signed-off-by: Vyacheslav Dubeyko Cc: Al Viro Cc: Christoph Hellwig Cc: Hin-Tak Leung Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfsplus/Kconfig | 18 ++++++++++++++++++ fs/hfsplus/acl.h | 30 ++++++++++++++++++++++++++++++ fs/hfsplus/hfsplus_fs.h | 1 + 3 files changed, 49 insertions(+) create mode 100644 fs/hfsplus/acl.h diff --git a/fs/hfsplus/Kconfig b/fs/hfsplus/Kconfig index a63371815aab..24bc20fd42f7 100644 --- a/fs/hfsplus/Kconfig +++ b/fs/hfsplus/Kconfig @@ -11,3 +11,21 @@ config HFSPLUS_FS MacOS 8. It includes all Mac specific filesystem data such as data forks and creator codes, but it also has several UNIX style features such as file ownership and permissions. + +config HFSPLUS_FS_POSIX_ACL + bool "HFS+ POSIX Access Control Lists" + depends on HFSPLUS_FS + select FS_POSIX_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website . + + It needs to understand that POSIX ACLs are treated only under + Linux. POSIX ACLs doesn't mean something under Mac OS X. + Mac OS X beginning with version 10.4 ("Tiger") support NFSv4 ACLs, + which are part of the NFSv4 standard. + + If you don't know what Access Control Lists are, say N diff --git a/fs/hfsplus/acl.h b/fs/hfsplus/acl.h new file mode 100644 index 000000000000..07c0d4947527 --- /dev/null +++ b/fs/hfsplus/acl.h @@ -0,0 +1,30 @@ +/* + * linux/fs/hfsplus/acl.h + * + * Vyacheslav Dubeyko + * + * Handler for Posix Access Control Lists (ACLs) support. 
+ */ + +#include + +#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL + +/* posix_acl.c */ +struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type); +extern int hfsplus_posix_acl_chmod(struct inode *); +extern int hfsplus_init_posix_acl(struct inode *, struct inode *); + +#else /* CONFIG_HFSPLUS_FS_POSIX_ACL */ +#define hfsplus_get_posix_acl NULL + +static inline int hfsplus_posix_acl_chmod(struct inode *inode) +{ + return 0; +} + +static inline int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir) +{ + return 0; +} +#endif /* CONFIG_HFSPLUS_FS_POSIX_ACL */ diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index ede79317cfb8..2b9cd01696e2 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -30,6 +30,7 @@ #define DBG_EXTENT 0x00000020 #define DBG_BITMAP 0x00000040 #define DBG_ATTR_MOD 0x00000080 +#define DBG_ACL_MOD 0x00000100 #if 0 #define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD) From eef80d4ad1399067f26538a7dd56ff3df71e9278 Mon Sep 17 00:00:00 2001 From: Vyacheslav Dubeyko Date: Wed, 11 Sep 2013 14:24:29 -0700 Subject: [PATCH 219/303] hfsplus: implement POSIX ACLs support Implement POSIX ACLs support in hfsplus driver. Signed-off-by: Vyacheslav Dubeyko Cc: Al Viro Cc: Christoph Hellwig Cc: Hin-Tak Leung Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfsplus/posix_acl.c | 274 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 274 insertions(+) create mode 100644 fs/hfsplus/posix_acl.c diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c new file mode 100644 index 000000000000..b609cc14c72e --- /dev/null +++ b/fs/hfsplus/posix_acl.c @@ -0,0 +1,274 @@ +/* + * linux/fs/hfsplus/posix_acl.c + * + * Vyacheslav Dubeyko + * + * Handler for Posix Access Control Lists (ACLs) support. + */ + +#include "hfsplus_fs.h" +#include "xattr.h" +#include "acl.h" + +struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type) +{ + struct posix_acl *acl; + char *xattr_name; + char *value = NULL; + ssize_t size; + + acl = get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + + switch (type) { + case ACL_TYPE_ACCESS: + xattr_name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + xattr_name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + return ERR_PTR(-EINVAL); + } + + size = __hfsplus_getxattr(inode, xattr_name, NULL, 0); + + if (size > 0) { + value = (char *)hfsplus_alloc_attr_entry(); + if (unlikely(!value)) + return ERR_PTR(-ENOMEM); + size = __hfsplus_getxattr(inode, xattr_name, value, size); + } + + if (size > 0) + acl = posix_acl_from_xattr(&init_user_ns, value, size); + else if (size == -ENODATA) + acl = NULL; + else + acl = ERR_PTR(size); + + hfsplus_destroy_attr_entry((hfsplus_attr_entry *)value); + + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); + + return acl; +} + +static int hfsplus_set_posix_acl(struct inode *inode, + int type, + struct posix_acl *acl) +{ + int err; + char *xattr_name; + size_t size = 0; + char *value = NULL; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + switch (type) { + case ACL_TYPE_ACCESS: + xattr_name = POSIX_ACL_XATTR_ACCESS; + if (acl) { + err = posix_acl_equiv_mode(acl, &inode->i_mode); + if (err < 0) + return err; + } + err = 0; + break; + + case ACL_TYPE_DEFAULT: + xattr_name = POSIX_ACL_XATTR_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? 
-EACCES : 0; + break; + + default: + return -EINVAL; + } + + if (acl) { + size = posix_acl_xattr_size(acl->a_count); + if (unlikely(size > HFSPLUS_MAX_INLINE_DATA_SIZE)) + return -ENOMEM; + value = (char *)hfsplus_alloc_attr_entry(); + if (unlikely(!value)) + return -ENOMEM; + err = posix_acl_to_xattr(&init_user_ns, acl, value, size); + if (unlikely(err < 0)) + goto end_set_acl; + } + + err = __hfsplus_setxattr(inode, xattr_name, value, size, 0); + +end_set_acl: + hfsplus_destroy_attr_entry((hfsplus_attr_entry *)value); + + if (!err) + set_cached_acl(inode, type, acl); + + return err; +} + +int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir) +{ + int err = 0; + struct posix_acl *acl = NULL; + + hfs_dbg(ACL_MOD, + "[%s]: ino %lu, dir->ino %lu\n", + __func__, inode->i_ino, dir->i_ino); + + if (S_ISLNK(inode->i_mode)) + return 0; + + acl = hfsplus_get_posix_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); + + if (acl) { + if (S_ISDIR(inode->i_mode)) { + err = hfsplus_set_posix_acl(inode, + ACL_TYPE_DEFAULT, + acl); + if (unlikely(err)) + goto init_acl_cleanup; + } + + err = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); + if (unlikely(err < 0)) + return err; + + if (err > 0) + err = hfsplus_set_posix_acl(inode, + ACL_TYPE_ACCESS, + acl); + } else + inode->i_mode &= ~current_umask(); + +init_acl_cleanup: + posix_acl_release(acl); + return err; +} + +int hfsplus_posix_acl_chmod(struct inode *inode) +{ + int err; + struct posix_acl *acl; + + hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino); + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + acl = hfsplus_get_posix_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl) || !acl) + return PTR_ERR(acl); + + err = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + if (unlikely(err)) + return err; + + err = hfsplus_set_posix_acl(inode, ACL_TYPE_ACCESS, acl); + posix_acl_release(acl); + return err; +} + +static int hfsplus_xattr_get_posix_acl(struct dentry *dentry, + const char *name, + void *buffer, + size_t size, + int type) +{ + int err = 0; + struct posix_acl *acl; + + hfs_dbg(ACL_MOD, + "[%s]: ino %lu, buffer %p, size %zu, type %#x\n", + __func__, dentry->d_inode->i_ino, buffer, size, type); + + if (strcmp(name, "") != 0) + return -EINVAL; + + acl = hfsplus_get_posix_acl(dentry->d_inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + + err = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); + posix_acl_release(acl); + + return err; +} + +static int hfsplus_xattr_set_posix_acl(struct dentry *dentry, + const char *name, + const void *value, + size_t size, + int flags, + int type) +{ + int err = 0; + struct inode *inode = dentry->d_inode; + struct posix_acl *acl = NULL; + + hfs_dbg(ACL_MOD, + "[%s]: ino %lu, value %p, size %zu, flags %#x, type %#x\n", + __func__, inode->i_ino, value, size, flags, type); + + if (strcmp(name, "") != 0) + return -EINVAL; + + if (!inode_owner_or_capable(inode)) + return -EPERM; + + if (value) { + acl = posix_acl_from_xattr(&init_user_ns, value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + else if (acl) { + err = posix_acl_valid(acl); + if (err) + goto end_xattr_set_acl; + } + } + + err = hfsplus_set_posix_acl(inode, type, acl); + +end_xattr_set_acl: + posix_acl_release(acl); + return err; +} + +static size_t hfsplus_xattr_list_posix_acl(struct dentry *dentry, + char *list, + size_t list_size, + const char *name, + size_t name_len, + int type) +{ + /* + * This method is not used. 
+ * It is used hfsplus_listxattr() instead of generic_listxattr(). + */ + return -EOPNOTSUPP; +} + +const struct xattr_handler hfsplus_xattr_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .list = hfsplus_xattr_list_posix_acl, + .get = hfsplus_xattr_get_posix_acl, + .set = hfsplus_xattr_set_posix_acl, +}; + +const struct xattr_handler hfsplus_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .list = hfsplus_xattr_list_posix_acl, + .get = hfsplus_xattr_get_posix_acl, + .set = hfsplus_xattr_set_posix_acl, +}; From b4c1107cc962613ea3572e5abba861a35d494b98 Mon Sep 17 00:00:00 2001 From: Vyacheslav Dubeyko Date: Wed, 11 Sep 2013 14:24:30 -0700 Subject: [PATCH 220/303] hfsplus: integrate POSIX ACLs support into driver Integrate implemented POSIX ACLs support into hfsplus driver. Signed-off-by: Vyacheslav Dubeyko Cc: Al Viro Cc: Christoph Hellwig Cc: Hin-Tak Leung Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfsplus/Makefile | 2 ++ fs/hfsplus/dir.c | 4 +++ fs/hfsplus/inode.c | 11 +++++++ fs/hfsplus/xattr.c | 62 +++++++++++++++++++++++++++++++++---- fs/hfsplus/xattr.h | 33 ++++++++------------ fs/hfsplus/xattr_security.c | 13 ++++++++ 6 files changed, 99 insertions(+), 26 deletions(-) diff --git a/fs/hfsplus/Makefile b/fs/hfsplus/Makefile index 09d278bb7b91..683fca2e5e65 100644 --- a/fs/hfsplus/Makefile +++ b/fs/hfsplus/Makefile @@ -7,3 +7,5 @@ obj-$(CONFIG_HFSPLUS_FS) += hfsplus.o hfsplus-objs := super.o options.o inode.o ioctl.o extents.o catalog.o dir.o btree.o \ bnode.o brec.o bfind.o tables.o unicode.o wrapper.o bitmap.o part_tbl.o \ attributes.o xattr.o xattr_user.o xattr_security.o xattr_trusted.o + +hfsplus-$(CONFIG_HFSPLUS_FS_POSIX_ACL) += posix_acl.o diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index d8ce4bd17fc5..4a4fea002673 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -16,6 +16,7 @@ #include "hfsplus_fs.h" #include "hfsplus_raw.h" #include "xattr.h" +#include "acl.h" static inline void hfsplus_instantiate(struct dentry *dentry, struct inode *inode, u32 cnid) @@ -529,6 +530,9 @@ const struct inode_operations hfsplus_dir_inode_operations = { .getxattr = generic_getxattr, .listxattr = hfsplus_listxattr, .removexattr = hfsplus_removexattr, +#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL + .get_acl = hfsplus_get_posix_acl, +#endif }; const struct file_operations hfsplus_dir_operations = { diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index f833d35630ab..4d2edaea891c 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -19,6 +19,7 @@ #include "hfsplus_fs.h" #include "hfsplus_raw.h" #include "xattr.h" +#include "acl.h" static int hfsplus_readpage(struct file *file, struct page *page) { @@ -316,6 +317,13 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr) setattr_copy(inode, attr); mark_inode_dirty(inode); + + if (attr->ia_valid & ATTR_MODE) { + error = hfsplus_posix_acl_chmod(inode); + if (unlikely(error)) + return error; + } + return 0; } @@ -383,6 +391,9 @@ static const struct inode_operations hfsplus_file_inode_operations = { .getxattr = generic_getxattr, .listxattr = hfsplus_listxattr, .removexattr = hfsplus_removexattr, +#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL + .get_acl = hfsplus_get_posix_acl, +#endif }; static const struct file_operations hfsplus_file_operations = { diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index f66346155df5..bd8471fb9a6a 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -8,11 +8,16 @@ #include 
"hfsplus_fs.h" #include "xattr.h" +#include "acl.h" const struct xattr_handler *hfsplus_xattr_handlers[] = { &hfsplus_xattr_osx_handler, &hfsplus_xattr_user_handler, &hfsplus_xattr_trusted_handler, +#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL + &hfsplus_xattr_acl_access_handler, + &hfsplus_xattr_acl_default_handler, +#endif &hfsplus_xattr_security_handler, NULL }; @@ -46,11 +51,58 @@ static inline int is_known_namespace(const char *name) return true; } +static int can_set_system_xattr(struct inode *inode, const char *name, + const void *value, size_t size) +{ +#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL + struct posix_acl *acl; + int err; + + if (!inode_owner_or_capable(inode)) + return -EPERM; + + /* + * POSIX_ACL_XATTR_ACCESS is tied to i_mode + */ + if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) { + acl = posix_acl_from_xattr(&init_user_ns, value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + err = posix_acl_equiv_mode(acl, &inode->i_mode); + posix_acl_release(acl); + if (err < 0) + return err; + mark_inode_dirty(inode); + } + /* + * We're changing the ACL. Get rid of the cached one + */ + forget_cached_acl(inode, ACL_TYPE_ACCESS); + + return 0; + } else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) { + acl = posix_acl_from_xattr(&init_user_ns, value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + posix_acl_release(acl); + + /* + * We're changing the default ACL. Get rid of the cached one + */ + forget_cached_acl(inode, ACL_TYPE_DEFAULT); + + return 0; + } +#endif /* CONFIG_HFSPLUS_FS_POSIX_ACL */ + return -EOPNOTSUPP; +} + static int can_set_xattr(struct inode *inode, const char *name, const void *value, size_t value_len) { if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return -EOPNOTSUPP; /* TODO: implement ACL support */ + return can_set_system_xattr(inode, name, value, value_len); if (!strncmp(name, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN)) { /* @@ -253,11 +305,10 @@ static int copy_name(char *buffer, const char *xattr_name, int name_len) return len; } -static ssize_t hfsplus_getxattr_finder_info(struct dentry *dentry, +static ssize_t hfsplus_getxattr_finder_info(struct inode *inode, void *value, size_t size) { ssize_t res = 0; - struct inode *inode = dentry->d_inode; struct hfs_find_data fd; u16 entry_type; u16 folder_rec_len = sizeof(struct DInfo) + sizeof(struct DXInfo); @@ -304,10 +355,9 @@ end_getxattr_finder_info: return res; } -ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, +ssize_t __hfsplus_getxattr(struct inode *inode, const char *name, void *value, size_t size) { - struct inode *inode = dentry->d_inode; struct hfs_find_data fd; hfsplus_attr_entry *entry; __be32 xattr_record_type; @@ -333,7 +383,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, } if (!strcmp_xattr_finder_info(name)) - return hfsplus_getxattr_finder_info(dentry, value, size); + return hfsplus_getxattr_finder_info(inode, value, size); if (!HFSPLUS_SB(inode->i_sb)->attr_tree) return -EOPNOTSUPP; diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h index 847b695b984d..841b5698c0fc 100644 --- a/fs/hfsplus/xattr.h +++ b/fs/hfsplus/xattr.h @@ -14,8 +14,8 @@ extern const struct xattr_handler hfsplus_xattr_osx_handler; extern const struct xattr_handler hfsplus_xattr_user_handler; extern const struct xattr_handler hfsplus_xattr_trusted_handler; -/*extern const struct xattr_handler hfsplus_xattr_acl_access_handler;*/ -/*extern const struct xattr_handler hfsplus_xattr_acl_default_handler;*/ +extern const struct xattr_handler 
hfsplus_xattr_acl_access_handler; +extern const struct xattr_handler hfsplus_xattr_acl_default_handler; extern const struct xattr_handler hfsplus_xattr_security_handler; extern const struct xattr_handler *hfsplus_xattr_handlers[]; @@ -29,9 +29,17 @@ static inline int hfsplus_setxattr(struct dentry *dentry, const char *name, return __hfsplus_setxattr(dentry->d_inode, name, value, size, flags); } -ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, +ssize_t __hfsplus_getxattr(struct inode *inode, const char *name, void *value, size_t size); +static inline ssize_t hfsplus_getxattr(struct dentry *dentry, + const char *name, + void *value, + size_t size) +{ + return __hfsplus_getxattr(dentry->d_inode, name, value, size); +} + ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size); int hfsplus_removexattr(struct dentry *dentry, const char *name); @@ -39,22 +47,7 @@ int hfsplus_removexattr(struct dentry *dentry, const char *name); int hfsplus_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr); -static inline int hfsplus_init_acl(struct inode *inode, struct inode *dir) -{ - /*TODO: implement*/ - return 0; -} - -static inline int hfsplus_init_inode_security(struct inode *inode, - struct inode *dir, - const struct qstr *qstr) -{ - int err; - - err = hfsplus_init_acl(inode, dir); - if (!err) - err = hfsplus_init_security(inode, dir, qstr); - return err; -} +int hfsplus_init_inode_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr); #endif diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c index 83b842f113c5..00722765ea79 100644 --- a/fs/hfsplus/xattr_security.c +++ b/fs/hfsplus/xattr_security.c @@ -9,6 +9,7 @@ #include #include "hfsplus_fs.h" #include "xattr.h" +#include "acl.h" static int hfsplus_security_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size, int type) @@ -96,6 +97,18 @@ int hfsplus_init_security(struct inode *inode, struct inode *dir, &hfsplus_initxattrs, NULL); } +int hfsplus_init_inode_security(struct inode *inode, + struct inode *dir, + const struct qstr *qstr) +{ + int err; + + err = hfsplus_init_posix_acl(inode, dir); + if (!err) + err = hfsplus_init_security(inode, dir, qstr); + return err; +} + const struct xattr_handler hfsplus_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = hfsplus_security_listxattr, From 73af963f9f3036dffed55c3a2898598186db1045 Mon Sep 17 00:00:00 2001 From: Mark Grondona Date: Wed, 11 Sep 2013 14:24:31 -0700 Subject: [PATCH 221/303] __ptrace_may_access() should not deny sub-threads __ptrace_may_access() checks get_dumpable/ptrace_has_cap/etc if task != current, this can can lead to surprising results. For example, a sub-thread can't readlink("/proc/self/exe") if the executable is not readable. setup_new_exec()->would_dump() notices that inode_permission(MAY_READ) fails and then it does set_dumpable(suid_dumpable). After that get_dumpable() fails. (It is not clear why proc_pid_readlink() checks get_dumpable(), perhaps we could add PTRACE_MODE_NODUMPABLE) Change __ptrace_may_access() to use same_thread_group() instead of "task == current". Any security check is pointless when the tasks share the same ->mm. 
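An illustrative reproducer for the readlink() case above (assumes the executable itself is not readable and suid_dumpable = 0; the assert in the sub-thread fails without this change):

	#include <assert.h>
	#include <pthread.h>
	#include <unistd.h>

	static void *tfunc(void *arg)
	{
		char buf[64];
		/* fails from a sub-thread before this change */
		assert(readlink("/proc/self/exe", buf, sizeof(buf)) >= 0);
		return NULL;
	}

	int main(void)
	{
		pthread_t t;
		pthread_create(&t, NULL, tfunc, NULL);
		pthread_join(t, NULL);
		return 0;
	}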
Signed-off-by: Mark Grondona Signed-off-by: Ben Woodard Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index a146ee327f6a..dd562e9aa2c8 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -236,7 +236,7 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) */ int dumpable = 0; /* Don't let security modules deny introspection */ - if (task == current) + if (same_thread_group(task, current)) return 0; rcu_read_lock(); tcred = __task_cred(task); From 65aafb1e7484b7434a0c1d4c593191ebe5776a2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Graber?= Date: Wed, 11 Sep 2013 14:24:32 -0700 Subject: [PATCH 222/303] coredump: add new %P variable in core_pattern MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new %P variable to be used in core_pattern. This variable contains the global PID (PID in the init namespace) as %p contains the PID in the current namespace which isn't always what we want. The main use for this is to make it easier to handle crashes that happened within a container. With that new variables it's possible to have the crashes dumped into the container or forwarded to the host with the right PID (from the host's point of view). Signed-off-by: Stéphane Graber Reported-by: Hans Feldt Cc: Alexander Viro Cc: Eric W. Biederman Cc: Andy Whitcroft Acked-by: Serge E. Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/kernel.txt | 1 + fs/coredump.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index ab7d16efa96b..9d4c1d18ad44 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -182,6 +182,7 @@ core_pattern is used to specify a core dumpfile pattern name. % '%' is dropped %% output one '%' %p pid + %P global pid (init PID namespace) %u uid %g gid %d dump mode, matches PR_SET_DUMPABLE and diff --git a/fs/coredump.c b/fs/coredump.c index 72f816d6cad9..9bdeca12ae0e 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -190,6 +190,11 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm) err = cn_printf(cn, "%d", task_tgid_vnr(current)); break; + /* global pid */ + case 'P': + err = cn_printf(cn, "%d", + task_tgid_nr(current)); + break; /* uid */ case 'u': err = cn_printf(cn, "%d", cred->uid); From be49b30a98fe7e20f898fcfe7b6c082700fb96e8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 11 Sep 2013 14:24:34 -0700 Subject: [PATCH 223/303] fs/file_table.c:fput(): make comment more truthful Cc: "Eric W. Biederman" Cc: Andrey Vagin Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/file_table.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/file_table.c b/fs/file_table.c index 322cd37626cb..abdd15ad13c9 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -311,8 +311,7 @@ void fput(struct file *file) return; /* * After this task has run exit_task_work(), - * task_work_add() will fail. free_ipc_ns()-> - * shm_destroy() can do this. Fall through to delayed + * task_work_add() will fail. Fall through to delayed * fput to avoid leaking *file. 
 */
 }

From 4649602265495a3bb776d777c91dba569f4afb5d Mon Sep 17 00:00:00 2001
From: Minto Joseph
Date: Wed, 11 Sep 2013 14:24:35 -0700
Subject: [PATCH 224/303] Documentation/filesystems/proc.txt: fix mistake in the description of Committed_AS

Fix a mistake in the description of Committed_AS in the kernel documentation.

Signed-off-by: Minto Joseph
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 Documentation/filesystems/proc.txt | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index fcc22c982a25..823c95faebd2 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -854,16 +854,15 @@
 Committed_AS: The amount of memory presently allocated on the system.
              The committed memory is a sum of all of the memory which
              has been allocated by processes, even if it has not been
              "used" by them as of yet. A process which malloc()'s 1G
-             of memory, but only touches 300M of it will only show up
-             as using 300M of memory even if it has the address space
-             allocated for the entire 1G. This 1G is memory which has
-             been "committed" to by the VM and can be used at any time
-             by the allocating application. With strict overcommit
-             enabled on the system (mode 2 in 'vm.overcommit_memory'),
-             allocations which would exceed the CommitLimit (detailed
-             above) will not be permitted. This is useful if one needs
-             to guarantee that processes will not fail due to lack of
-             memory once that memory has been successfully allocated.
+             of memory, but only touches 300M of it will show up as
+             using 1G. This 1G is memory which has been "committed" to
+             by the VM and can be used at any time by the allocating
+             application. With strict overcommit enabled on the system
+             (mode 2 in 'vm.overcommit_memory'),allocations which would
+             exceed the CommitLimit (detailed above) will not be permitted.
+             This is useful if one needs to guarantee that processes will
+             not fail due to lack of memory once that memory has been
+             successfully allocated.
 VmallocTotal: total size of vmalloc memory area
 VmallocUsed: amount of vmalloc area which is used
 VmallocChunk: largest contiguous block of vmalloc area which is free

From a3c039929d01f793c47922017b6c0ae438e11598 Mon Sep 17 00:00:00 2001
From: Chen Gang
Date: Wed, 11 Sep 2013 14:24:35 -0700
Subject: [PATCH 225/303] fs/proc/task_mmu.c: check the return value of mpol_to_str()

mpol_to_str() may fail and leave the buffer unfilled (e.g. it returns -EINVAL), so we need to check its return value; otherwise the buffer may not be NUL-terminated and the subsequent seq_printf() will print garbage. The failure return must come after mpol_cond_put() to match get_vma_policy().
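Condensed, the ordering the fix establishes in show_numa_map() (fragment; m, vma, pol and buffer as in that function):

	n = mpol_to_str(buffer, sizeof(buffer), pol);
	mpol_cond_put(pol);	/* drop the policy reference unconditionally */
	if (n < 0)
		return n;	/* buffer contents are undefined on failure */

	seq_printf(m, "%08lx %s", vma->vm_start, buffer);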
Signed-off-by: Chen Gang Cc: Cyrill Gorcunov Cc: Mel Gorman Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 09228639b83d..7366e9d63cee 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1402,8 +1402,10 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) walk.mm = mm; pol = get_vma_policy(task, vma, vma->vm_start); - mpol_to_str(buffer, sizeof(buffer), pol); + n = mpol_to_str(buffer, sizeof(buffer), pol); mpol_cond_put(pol); + if (n < 0) + return n; seq_printf(m, "%08lx %s", vma->vm_start, buffer); From 96d0df79f2644fc823f26c06491e182d87a90c2a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:24:37 -0700 Subject: [PATCH 226/303] proc: make proc_fd_permission() thread-friendly proc_fd_permission() says "process can still access /proc/self/fd after it has executed a setuid()", but the "task_pid() = proc_pid() check only helps if the task is group leader, /proc/self points to /proc/. Change this check to use task_tgid() so that the whole thread group can access its /proc/self/fd or /proc//fd. Notes: - CLONE_THREAD does not require CLONE_FILES so task->files can differ, but I don't think this can lead to any security problem. And this matches same_thread_group() in __ptrace_may_access(). - /proc/self should probably point to /proc/, but it is too late to change the rules. Perhaps it makes sense to add /proc/thread though. Test-case: void *tfunc(void *arg) { assert(opendir("/proc/self/fd")); return NULL; } int main(void) { pthread_t t; pthread_create(&t, NULL, tfunc, NULL); pthread_join(t, NULL); return 0; } fails if, say, this executable is not readable and suid_dumpable = 0. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/fd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 0ff80f9b930f..985ea881b5bc 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -286,7 +286,7 @@ int proc_fd_permission(struct inode *inode, int mask) int rv = generic_permission(inode, mask); if (rv == 0) return 0; - if (task_pid(current) == proc_pid(inode)) + if (task_tgid(current) == proc_pid(inode)) rv = 0; return rv; } From 5d1baf3b63bfc8c709dc44df85ff1475c7ef489d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:24:38 -0700 Subject: [PATCH 227/303] exec: introduce exec_binprm() for "depth == 0" code task_pid_nr_ns() and trace/ptrace code in the middle of the recursive search_binary_handler() looks confusing and imho annoying. We only need this code if "depth == 0", lets add a simple helper which calls search_binary_handler() and does trace_sched_process_exec() + ptrace_event(). The patch also moves the setting of task->did_exec, we need to do this only once. Note: we can kill either task->did_exec or PF_FORKNOEXEC. 
Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Evgeniy Polyakov Cc: Zach Levis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 2d1e52a58fe9..4d95b4709ea0 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1373,7 +1373,6 @@ int search_binary_handler(struct linux_binprm *bprm) unsigned int depth = bprm->recursion_depth; int try,retval; struct linux_binfmt *fmt; - pid_t old_pid, old_vpid; /* This allows 4 levels of binfmt rewrites before failing hard. */ if (depth > 5) @@ -1387,12 +1386,6 @@ int search_binary_handler(struct linux_binprm *bprm) if (retval) return retval; - /* Need to fetch pid before load_binary changes it */ - old_pid = current->pid; - rcu_read_lock(); - old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent)); - rcu_read_unlock(); - retval = -ENOENT; for (try=0; try<2; try++) { read_lock(&binfmt_lock); @@ -1407,16 +1400,11 @@ int search_binary_handler(struct linux_binprm *bprm) retval = fn(bprm); bprm->recursion_depth = depth; if (retval >= 0) { - if (depth == 0) { - trace_sched_process_exec(current, old_pid, bprm); - ptrace_event(PTRACE_EVENT_EXEC, old_vpid); - } put_binfmt(fmt); allow_write_access(bprm->file); if (bprm->file) fput(bprm->file); bprm->file = NULL; - current->did_exec = 1; proc_exec_connector(current); return retval; } @@ -1450,9 +1438,29 @@ int search_binary_handler(struct linux_binprm *bprm) } return retval; } - EXPORT_SYMBOL(search_binary_handler); +static int exec_binprm(struct linux_binprm *bprm) +{ + pid_t old_pid, old_vpid; + int ret; + + /* Need to fetch pid before load_binary changes it */ + old_pid = current->pid; + rcu_read_lock(); + old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent)); + rcu_read_unlock(); + + ret = search_binary_handler(bprm); + if (ret >= 0) { + trace_sched_process_exec(current, old_pid, bprm); + ptrace_event(PTRACE_EVENT_EXEC, old_vpid); + current->did_exec = 1; + } + + return ret; +} + /* * sys_execve() executes a new program. */ @@ -1541,7 +1549,7 @@ static int do_execve_common(const char *filename, if (retval < 0) goto out; - retval = search_binary_handler(bprm); + retval = exec_binprm(bprm); if (retval < 0) goto out; From 131b2f9f1214f338f0bf7c0d9760019f2b1d0c20 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:24:39 -0700 Subject: [PATCH 228/303] exec: kill "int depth" in search_binary_handler() Nobody except search_binary_handler() should touch ->recursion_depth, "int depth" buys nothing but complicates the code, kill it. Probably we should also kill "fn" and the !NULL check, ->load_binary should be always defined. And it can not go away after read_unlock() or this code is buggy anyway. Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Evgeniy Polyakov Cc: Zach Levis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 9 ++++----- include/linux/binfmts.h | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 4d95b4709ea0..b6e35ec818a2 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1370,12 +1370,11 @@ EXPORT_SYMBOL(remove_arg_zero); */ int search_binary_handler(struct linux_binprm *bprm) { - unsigned int depth = bprm->recursion_depth; - int try,retval; + int try, retval; struct linux_binfmt *fmt; /* This allows 4 levels of binfmt rewrites before failing hard. 
*/ - if (depth > 5) + if (bprm->recursion_depth > 5) return -ELOOP; retval = security_bprm_check(bprm); @@ -1396,9 +1395,9 @@ int search_binary_handler(struct linux_binprm *bprm) if (!try_module_get(fmt->module)) continue; read_unlock(&binfmt_lock); - bprm->recursion_depth = depth + 1; + bprm->recursion_depth++; retval = fn(bprm); - bprm->recursion_depth = depth; + bprm->recursion_depth--; if (retval >= 0) { put_binfmt(fmt); allow_write_access(bprm->file); diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 70cf138690e9..e8112ae50531 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -31,7 +31,7 @@ struct linux_binprm { #ifdef __alpha__ unsigned int taso:1; #endif - unsigned int recursion_depth; + unsigned int recursion_depth; /* only for search_binary_handler() */ struct file * file; struct cred *cred; /* new credentials */ int unsafe; /* how unsafe this exec is (mask of LSM_UNSAFE_*) */ From 9beb266f2d7e5362c5bb9f999255aa1af5318aef Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:24:40 -0700 Subject: [PATCH 229/303] exec: proc_exec_connector() should be called only once A separate one-liner with the minor fix. PROC_EVENT_EXEC reports the "exec" event, but this message is sent at least twice if search_binary_handler() is called by ->load_binary() recursively, say, load_script(). Move it to exec_binprm(), this is "depth == 0" code too. Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Evgeniy Polyakov Cc: Zach Levis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/exec.c b/fs/exec.c index b6e35ec818a2..d51f7172832b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1404,7 +1404,6 @@ int search_binary_handler(struct linux_binprm *bprm) if (bprm->file) fput(bprm->file); bprm->file = NULL; - proc_exec_connector(current); return retval; } read_lock(&binfmt_lock); @@ -1455,6 +1454,7 @@ static int exec_binprm(struct linux_binprm *bprm) trace_sched_process_exec(current, old_pid, bprm); ptrace_event(PTRACE_EVENT_EXEC, old_vpid); current->did_exec = 1; + proc_exec_connector(current); } return ret; From 52f14282bb0c3d3e5ba2a9eaacb12ff37a033e7e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:24:41 -0700 Subject: [PATCH 230/303] exec: move allow_write_access/fput to exec_binprm() When search_binary_handler() succeeds it does allow_write_access() and fput(), then it clears bprm->file to ensure the caller will not do the same. We can simply move this code to exec_binprm() which is called only once. In fact we could move this to free_bprm() and remove the same code in do_execve_common's error path. 
Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Evgeniy Polyakov Cc: Zach Levis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index d51f7172832b..a4cfd1d725e0 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1400,10 +1400,6 @@ int search_binary_handler(struct linux_binprm *bprm) bprm->recursion_depth--; if (retval >= 0) { put_binfmt(fmt); - allow_write_access(bprm->file); - if (bprm->file) - fput(bprm->file); - bprm->file = NULL; return retval; } read_lock(&binfmt_lock); @@ -1455,6 +1451,12 @@ static int exec_binprm(struct linux_binprm *bprm) ptrace_event(PTRACE_EVENT_EXEC, old_vpid); current->did_exec = 1; proc_exec_connector(current); + + if (bprm->file) { + allow_write_access(bprm->file); + fput(bprm->file); + bprm->file = NULL; /* to catch use-after-free */ + } } return ret; From 92eaa565add62d56b90987f58ea9feafc5a7c183 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:24:42 -0700 Subject: [PATCH 231/303] exec: kill ->load_binary != NULL check in search_binary_handler() search_binary_handler() checks ->load_binary != NULL for no reason, this method should be always defined. Turn this check into WARN_ON() and move it into __register_binfmt(). Also, kill the function pointer. The current code looks confusing, as if ->load_binary can go away after read_unlock(&binfmt_lock). But we rely on module_get(fmt->module), this fmt can't be changed or unregistered, otherwise this code is buggy anyway. Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Evgeniy Polyakov Cc: Zach Levis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index a4cfd1d725e0..7b92fbfa63aa 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -74,6 +74,8 @@ static DEFINE_RWLOCK(binfmt_lock); void __register_binfmt(struct linux_binfmt * fmt, int insert) { BUG_ON(!fmt); + if (WARN_ON(!fmt->load_binary)) + return; write_lock(&binfmt_lock); insert ? list_add(&fmt->lh, &formats) : list_add_tail(&fmt->lh, &formats); @@ -1389,14 +1391,11 @@ int search_binary_handler(struct linux_binprm *bprm) for (try=0; try<2; try++) { read_lock(&binfmt_lock); list_for_each_entry(fmt, &formats, lh) { - int (*fn)(struct linux_binprm *) = fmt->load_binary; - if (!fn) - continue; if (!try_module_get(fmt->module)) continue; read_unlock(&binfmt_lock); bprm->recursion_depth++; - retval = fn(bprm); + retval = fmt->load_binary(bprm); bprm->recursion_depth--; if (retval >= 0) { put_binfmt(fmt); From cb7b6b1cbc20a970c7124efae1c2478155604b54 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:24:44 -0700 Subject: [PATCH 232/303] exec: cleanup the CONFIG_MODULES logic search_binary_handler() uses "for (try=0; try<2; try++)" to avoid "goto" but the code looks too complicated and horrible imho. We still need to check "try == 0" before request_module() and add the additional "break" for !CONFIG_MODULES case. Kill this loop and use a simple "bool need_retry" + "goto retry". The code looks much simpler and we do not even need ifdef's, gcc can optimize out the "if (need_retry)" block if !IS_ENABLED(). 
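Reduced to its skeleton, the resulting control flow looks like this (illustrative; try_binfmt_handlers() and request_binfmt_module() are hypothetical stand-ins for the real loop and the request_module() call in the diff below):

	bool need_retry = IS_ENABLED(CONFIG_MODULES);
	int retval;
retry:
	retval = try_binfmt_handlers(bprm);	/* hypothetical */
	if (need_retry && retval == -ENOEXEC) {
		request_binfmt_module(bprm);	/* hypothetical */
		need_retry = false;
		goto retry;
	}
	return retval;

With CONFIG_MODULES=n, IS_ENABLED(CONFIG_MODULES) is a compile-time 0, need_retry is always false, and gcc drops the whole retry block without any #ifdef.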
Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Evgeniy Polyakov Cc: Zach Levis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 72 +++++++++++++++++++++++++------------------------------ 1 file changed, 33 insertions(+), 39 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 7b92fbfa63aa..ba357e6aea98 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1367,13 +1367,15 @@ out: } EXPORT_SYMBOL(remove_arg_zero); +#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) /* * cycle the list of binary formats handler, until one recognizes the image */ int search_binary_handler(struct linux_binprm *bprm) { - int try, retval; + bool need_retry = IS_ENABLED(CONFIG_MODULES); struct linux_binfmt *fmt; + int retval; /* This allows 4 levels of binfmt rewrites before failing hard. */ if (bprm->recursion_depth > 5) @@ -1388,47 +1390,39 @@ int search_binary_handler(struct linux_binprm *bprm) return retval; retval = -ENOENT; - for (try=0; try<2; try++) { - read_lock(&binfmt_lock); - list_for_each_entry(fmt, &formats, lh) { - if (!try_module_get(fmt->module)) - continue; - read_unlock(&binfmt_lock); - bprm->recursion_depth++; - retval = fmt->load_binary(bprm); - bprm->recursion_depth--; - if (retval >= 0) { - put_binfmt(fmt); - return retval; - } - read_lock(&binfmt_lock); - put_binfmt(fmt); - if (retval != -ENOEXEC || bprm->mm == NULL) - break; - if (!bprm->file) { - read_unlock(&binfmt_lock); - return retval; - } - } + retry: + read_lock(&binfmt_lock); + list_for_each_entry(fmt, &formats, lh) { + if (!try_module_get(fmt->module)) + continue; read_unlock(&binfmt_lock); -#ifdef CONFIG_MODULES - if (retval != -ENOEXEC || bprm->mm == NULL) { - break; - } else { -#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) - if (printable(bprm->buf[0]) && - printable(bprm->buf[1]) && - printable(bprm->buf[2]) && - printable(bprm->buf[3])) - break; /* -ENOEXEC */ - if (try) - break; /* -ENOEXEC */ - request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2])); + bprm->recursion_depth++; + retval = fmt->load_binary(bprm); + bprm->recursion_depth--; + if (retval >= 0) { + put_binfmt(fmt); + return retval; + } + read_lock(&binfmt_lock); + put_binfmt(fmt); + if (retval != -ENOEXEC || bprm->mm == NULL) + break; + if (!bprm->file) { + read_unlock(&binfmt_lock); + return retval; } -#else - break; -#endif } + read_unlock(&binfmt_lock); + + if (need_retry && retval == -ENOEXEC && bprm->mm) { + if (printable(bprm->buf[0]) && printable(bprm->buf[1]) && + printable(bprm->buf[2]) && printable(bprm->buf[3])) + return retval; + request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)); + need_retry = false; + goto retry; + } + return retval; } EXPORT_SYMBOL(search_binary_handler); From 4e0621a07ea58a0dc15859be3b743bdeb194a51b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:24:45 -0700 Subject: [PATCH 233/303] exec: don't retry if request_module() fails A separate one-liner for better documentation. It doesn't make sense to retry if request_module() fails to exec /sbin/modprobe, add the additional "request_module() < 0" check. However, this logic still doesn't look exactly right: 1. It would be better to check "request_module() != 0", the user space modprobe process should report the correct exit code. But I didn't dare to add the user-visible change. 2. The whole ENOEXEC logic looks suboptimal. Suppose that we try to exec a "#!path-to-unsupported-binary" script. 
In this case request_module() + "retry" will be done twice: first by the "depth == 1" code, and then again by the "depth == 0" caller, which doesn't make sense. 3. And note that in the case above bprm->buf was already changed by load_script()->prepare_binprm(), so this looks even more ugly. Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Evgeniy Polyakov Cc: Zach Levis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/exec.c b/fs/exec.c index ba357e6aea98..635b586de336 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1418,7 +1418,8 @@ int search_binary_handler(struct linux_binprm *bprm) if (printable(bprm->buf[0]) && printable(bprm->buf[1]) && printable(bprm->buf[2]) && printable(bprm->buf[3])) return retval; - request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)); + if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0) + return retval; need_retry = false; goto retry; } From 6b3c538f5b2cfc53cb6803ec5001bbcf8f18a98e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:24:46 -0700 Subject: [PATCH 234/303] exec: cleanup the error handling in search_binary_handler() The error handling and ret-from-loop look confusing and inconsistent. - "retval >= 0" simply returns - "!bprm->file" returns too, but with read_unlock() because binfmt_lock was already re-acquired - "retval != -ENOEXEC || bprm->mm == NULL" does "break" and relies on the same check after the main loop Consolidate these checks into a single if/return statement. need_retry still checks "retval == -ENOEXEC", but this and the -ENOENT initialization before the main loop are not strictly needed; they only matter for the pathological and impossible list_empty(&formats) case. It is not clear why we check "bprm->mm == NULL"; probably this should be removed. Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Evgeniy Polyakov Cc: Zach Levis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 635b586de336..8875dd10ae7a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1399,22 +1399,17 @@ int search_binary_handler(struct linux_binprm *bprm) bprm->recursion_depth++; retval = fmt->load_binary(bprm); bprm->recursion_depth--; - if (retval >= 0) { + if (retval >= 0 || retval != -ENOEXEC || + bprm->mm == NULL || bprm->file == NULL) { put_binfmt(fmt); return retval; } read_lock(&binfmt_lock); put_binfmt(fmt); - if (retval != -ENOEXEC || bprm->mm == NULL) - break; - if (!bprm->file) { - read_unlock(&binfmt_lock); - return retval; - } } read_unlock(&binfmt_lock); - if (need_retry && retval == -ENOEXEC && bprm->mm) { + if (need_retry && retval == -ENOEXEC) { if (printable(bprm->buf[0]) && printable(bprm->buf[1]) && printable(bprm->buf[2]) && printable(bprm->buf[3])) return retval; From 80c74f6a40284c5c5d49f3b3289172bbce0b30b8 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Wed, 11 Sep 2013 14:24:47 -0700 Subject: [PATCH 235/303] kexec: remove unnecessary return Code can never run past the if/else (both branches return), so remove the unnecessary return.
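The pattern is easy to see in isolation (a simplified sketch with stub parsers, not the kexec code itself): when both branches of an if/else return, a trailing return can never execute:

	static int parse_mem(const char *s)    { return 1; }	/* stub */
	static int parse_simple(const char *s) { return 2; }	/* stub */

	static int parse(const char *s, int has_colon)
	{
		if (has_colon)
			return parse_mem(s);
		else
			return parse_simple(s);
		return 0;	/* unreachable - the kind of return this patch deletes */
	}

Dropping the else keyword and returning from the second call directly leaves a single, obvious fall-through exit.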
Signed-off-by: Xishi Qiu Suggested-by: Zhang Yanfei Reviewed-by: Simon Horman Reviewed-by: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/kernel/kexec.c b/kernel/kexec.c index 59f7b55ba745..2a74f307c5ec 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1474,11 +1474,8 @@ static int __init __parse_crashkernel(char *cmdline, if (first_colon && (!first_space || first_colon < first_space)) return parse_crashkernel_mem(ck_cmdline, system_ram, crash_size, crash_base); - else - return parse_crashkernel_simple(ck_cmdline, crash_size, - crash_base); - return 0; + return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); } /* From be8a8d069e508d4408125e2b1471f549e7813d25 Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Wed, 11 Sep 2013 14:24:49 -0700 Subject: [PATCH 236/303] vmcore: introduce ELF header in new memory feature For s390 we want to use /proc/vmcore for our SCSI stand-alone dump (zfcpdump). We have support where the first HSA_SIZE bytes are saved into a hypervisor-owned memory area (HSA) before the kdump kernel is booted. When the kdump kernel starts, it is restricted to use only HSA_SIZE bytes. The advantages of this mechanism are: * No crashkernel memory has to be defined in the old kernel. * Early boot problems (before kexec_load has been done) can be dumped. * Non-Linux systems can be dumped. We modify the s390 copy_oldmem_page() function to read from the HSA memory if memory below HSA_SIZE bytes is requested. Since we cannot use the kexec tool to load the kernel in this scenario, we have to build the ELF header in the 2nd (kdump/new) kernel. So with the following patch set we would like to introduce a new mechanism that allows the ELF header for /proc/vmcore to be created in 2nd kernel memory. The following steps are done during zfcpdump execution: 1. Production system crashes 2. User boots a SCSI disk that has been prepared with the zfcpdump tool 3. Hypervisor saves CPU state of boot CPU and HSA_SIZE bytes of memory into HSA 4. Boot loader loads kernel into low memory area 5. Kernel boots and uses only HSA_SIZE bytes of memory 6. Kernel saves registers of non-boot CPUs 7. Kernel does memory detection for dump memory map 8. Kernel creates ELF header for /proc/vmcore 9. /proc/vmcore uses this header for initialization 10. The zfcpdump user space reads /proc/vmcore to write dump to SCSI disk - copy_oldmem_page() copies from HSA for memory below HSA_SIZE - copy_oldmem_page() copies from real memory for memory above HSA_SIZE Currently for s390 we create the ELF core header in the 2nd kernel with a small trick. We relocate the addresses in the ELF header in a way that, to the /proc/vmcore code, the header seems to be in the 1st kernel (old) memory, and read_from_oldmem() returns the correct data. This allows the /proc/vmcore code to use the ELF header in the 2nd kernel. This patch: Exchange the old mechanism with the new and much cleaner function call override feature that now officially allows creating the ELF core header in the 2nd kernel.
To use the new feature, the following functions have to be defined by the architecture backend code to read from new memory: * elfcorehdr_alloc: Allocate ELF header * elfcorehdr_free: Free the memory of the ELF header * elfcorehdr_read: Read from ELF header * elfcorehdr_read_notes: Read from ELF notes Signed-off-by: Michael Holzheu Acked-by: Vivek Goyal Cc: HATAYAMA Daisuke Cc: Jan Willeke Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/vmcore.c | 61 +++++++++++++++++++++++++++++++------- include/linux/crash_dump.h | 6 ++++ 2 files changed, 57 insertions(+), 10 deletions(-) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index a1a16eb97c7b..02cb3ff108bc 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -123,6 +123,36 @@ static ssize_t read_from_oldmem(char *buf, size_t count, return read; } +/* + * Architectures may override this function to allocate ELF header in 2nd kernel + */ +int __weak elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size) +{ + return 0; +} + +/* + * Architectures may override this function to free header + */ +void __weak elfcorehdr_free(unsigned long long addr) +{} + +/* + * Architectures may override this function to read from ELF header + */ +ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) +{ + return read_from_oldmem(buf, count, ppos, 0); +} + +/* + * Architectures may override this function to read from notes sections + */ +ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos) +{ + return read_from_oldmem(buf, count, ppos, 0); +} + /* Read from the ELF header and then the crash dump. On error, negative value is * returned otherwise number of bytes read are returned. */ @@ -357,7 +387,7 @@ static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr) notes_section = kmalloc(max_sz, GFP_KERNEL); if (!notes_section) return -ENOMEM; - rc = read_from_oldmem(notes_section, max_sz, &offset, 0); + rc = elfcorehdr_read_notes(notes_section, max_sz, &offset); if (rc < 0) { kfree(notes_section); return rc; } @@ -444,7 +474,8 @@ static int __init copy_notes_elf64(const Elf64_Ehdr *ehdr_ptr, char *notes_buf) if (phdr_ptr->p_type != PT_NOTE) continue; offset = phdr_ptr->p_offset; - rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, &offset, 0); + rc = elfcorehdr_read_notes(notes_buf, phdr_ptr->p_memsz, + &offset); if (rc < 0) return rc; notes_buf += phdr_ptr->p_memsz; @@ -536,7 +567,7 @@ static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr) notes_section = kmalloc(max_sz, GFP_KERNEL); if (!notes_section) return -ENOMEM; - rc = read_from_oldmem(notes_section, max_sz, &offset, 0); + rc = elfcorehdr_read_notes(notes_section, max_sz, &offset); if (rc < 0) { kfree(notes_section); return rc; } @@ -623,7 +654,8 @@ static int __init copy_notes_elf32(const Elf32_Ehdr *ehdr_ptr, char *notes_buf) if (phdr_ptr->p_type != PT_NOTE) continue; offset = phdr_ptr->p_offset; - rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, &offset, 0); + rc = elfcorehdr_read_notes(notes_buf, phdr_ptr->p_memsz, + &offset); if (rc < 0) return rc; notes_buf += phdr_ptr->p_memsz; @@ -810,7 +842,7 @@ static int __init parse_crash_elf64_headers(void) addr = elfcorehdr_addr; /* Read Elf header */ - rc = read_from_oldmem((char*)&ehdr, sizeof(Elf64_Ehdr), &addr, 0); + rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf64_Ehdr), &addr); if (rc < 0) return rc; @@ -837,7 +869,7 @@ static int __init parse_crash_elf64_headers(void) if (!elfcorebuf) return -ENOMEM; addr = elfcorehdr_addr; - rc =
read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, &addr, 0); + rc = elfcorehdr_read(elfcorebuf, elfcorebuf_sz_orig, &addr); if (rc < 0) goto fail; @@ -866,7 +898,7 @@ static int __init parse_crash_elf32_headers(void) addr = elfcorehdr_addr; /* Read Elf header */ - rc = read_from_oldmem((char*)&ehdr, sizeof(Elf32_Ehdr), &addr, 0); + rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf32_Ehdr), &addr); if (rc < 0) return rc; @@ -892,7 +924,7 @@ static int __init parse_crash_elf32_headers(void) if (!elfcorebuf) return -ENOMEM; addr = elfcorehdr_addr; - rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, &addr, 0); + rc = elfcorehdr_read(elfcorebuf, elfcorebuf_sz_orig, &addr); if (rc < 0) goto fail; @@ -919,7 +951,7 @@ static int __init parse_crash_elf_headers(void) int rc=0; addr = elfcorehdr_addr; - rc = read_from_oldmem(e_ident, EI_NIDENT, &addr, 0); + rc = elfcorehdr_read(e_ident, EI_NIDENT, &addr); if (rc < 0) return rc; if (memcmp(e_ident, ELFMAG, SELFMAG) != 0) { @@ -952,7 +984,14 @@ static int __init vmcore_init(void) { int rc = 0; - /* If elfcorehdr= has been passed in cmdline, then capture the dump.*/ + /* Allow architectures to allocate ELF header in 2nd kernel */ + rc = elfcorehdr_alloc(&elfcorehdr_addr, &elfcorehdr_size); + if (rc) + return rc; + /* + * If elfcorehdr= has been passed in cmdline or created in 2nd kernel, + * then capture the dump. + */ if (!(is_vmcore_usable())) return rc; rc = parse_crash_elf_headers(); @@ -960,6 +999,8 @@ static int __init vmcore_init(void) pr_warn("Kdump: vmcore not initialized\n"); return rc; } + elfcorehdr_free(elfcorehdr_addr); + elfcorehdr_addr = ELFCORE_ADDR_ERR; proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations); if (proc_vmcore) diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 37e4f8da7cdf..6571f828e313 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -12,6 +12,12 @@ extern unsigned long long elfcorehdr_addr; extern unsigned long long elfcorehdr_size; +extern int __weak elfcorehdr_alloc(unsigned long long *addr, + unsigned long long *size); +extern void __weak elfcorehdr_free(unsigned long long addr); +extern ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos); +extern ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos); + extern ssize_t copy_oldmem_page(unsigned long, char *, size_t, unsigned long, int); From 97b0f6f9cd73ff8285835c5e295d3c4b0e2dbf78 Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Wed, 11 Sep 2013 14:24:50 -0700 Subject: [PATCH 237/303] s390/vmcore: use ELF header in new memory feature Exchange the old relocate mechanism with the new arch function call override mechanism that allows the ELF core header to be created in the 2nd kernel.
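The override mechanism is plain weak linkage: fs/proc/vmcore.c provides __weak defaults, and an architecture that defines a strong symbol of the same name wins at link time. Roughly, in two separate translation units (a hedged sketch with kernel-style types; arch_hdr_buf is a hypothetical buffer, not the actual s390 implementation):

	/* generic code: weak fallback, reads via the old-memory interface */
	ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
	{
		return read_from_oldmem(buf, count, ppos, 0);
	}

	/* arch code: strong definition replaces the fallback at link time */
	ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos)
	{
		memcpy(buf, arch_hdr_buf + *ppos, count); /* header in new memory */
		*ppos += count;
		return count;
	}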
Signed-off-by: Michael Holzheu Cc: HATAYAMA Daisuke Cc: Jan Willeke Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/kernel/crash_dump.c | 89 +++++++++++++++++++++++------------ 1 file changed, 58 insertions(+), 31 deletions(-) diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index d8f355657171..0c9a897a1fb5 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -63,6 +63,11 @@ static ssize_t copy_page_real(void *buf, void *src, size_t csize) } } +/* + * Pointer to ELF header in new kernel + */ +static void *elfcorehdr_newmem; + /* * Copy one page from "oldmem" * @@ -367,14 +372,6 @@ static int get_mem_chunk_cnt(void) return cnt; } -/* - * Relocate pointer in order to allow vmcore code access the data - */ -static inline unsigned long relocate(unsigned long addr) -{ - return OLDMEM_BASE + addr; -} - /* * Initialize ELF loads (new kernel) */ @@ -426,7 +423,7 @@ static void *notes_init(Elf64_Phdr *phdr, void *ptr, u64 notes_offset) ptr = nt_vmcoreinfo(ptr); memset(phdr, 0, sizeof(*phdr)); phdr->p_type = PT_NOTE; - phdr->p_offset = relocate(notes_offset); + phdr->p_offset = notes_offset; phdr->p_filesz = (unsigned long) PTR_SUB(ptr, ptr_start); phdr->p_memsz = phdr->p_filesz; return ptr; @@ -435,7 +432,7 @@ static void *notes_init(Elf64_Phdr *phdr, void *ptr, u64 notes_offset) /* * Create ELF core header (new kernel) */ -static void s390_elf_corehdr_create(char **elfcorebuf, size_t *elfcorebuf_sz) +int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size) { Elf64_Phdr *phdr_notes, *phdr_loads; int mem_chunk_cnt; @@ -443,6 +440,11 @@ static void s390_elf_corehdr_create(char **elfcorebuf, size_t *elfcorebuf_sz) u32 alloc_size; u64 hdr_off; + if (!OLDMEM_BASE) + return 0; + /* If elfcorehdr= has been passed via cmdline, we use that one */ + if (elfcorehdr_addr != ELFCORE_ADDR_MAX) + return 0; mem_chunk_cnt = get_mem_chunk_cnt(); alloc_size = 0x1000 + get_cpu_cnt() * 0x300 + @@ -460,27 +462,52 @@ static void s390_elf_corehdr_create(char **elfcorebuf, size_t *elfcorebuf_sz) ptr = notes_init(phdr_notes, ptr, ((unsigned long) hdr) + hdr_off); /* Init loads */ hdr_off = PTR_DIFF(ptr, hdr); - loads_init(phdr_loads, ((unsigned long) hdr) + hdr_off); - *elfcorebuf_sz = hdr_off; - *elfcorebuf = (void *) relocate((unsigned long) hdr); - BUG_ON(*elfcorebuf_sz > alloc_size); -} - -/* - * Create kdump ELF core header in new kernel, if it has not been passed via - * the "elfcorehdr" kernel parameter - */ -static int setup_kdump_elfcorehdr(void) -{ - size_t elfcorebuf_sz; - char *elfcorebuf; - - if (!OLDMEM_BASE || is_kdump_kernel()) - return -EINVAL; - s390_elf_corehdr_create(&elfcorebuf, &elfcorebuf_sz); - elfcorehdr_addr = (unsigned long long) elfcorebuf; - elfcorehdr_size = elfcorebuf_sz; + loads_init(phdr_loads, hdr_off); + *addr = (unsigned long long) hdr; + elfcorehdr_newmem = hdr; + *size = (unsigned long long) hdr_off; + BUG_ON(elfcorehdr_size > alloc_size); return 0; } -subsys_initcall(setup_kdump_elfcorehdr); +/* + * Free ELF core header (new kernel) + */ +void elfcorehdr_free(unsigned long long addr) +{ + if (!elfcorehdr_newmem) + return; + kfree((void *)(unsigned long)addr); +} + +/* + * Read from ELF header + */ +ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos) +{ + void *src = (void *)(unsigned long)*ppos; + + src = elfcorehdr_newmem ? 
src : src - OLDMEM_BASE; + memcpy(buf, src, count); + *ppos += count; + return count; +} + +/* + * Read from ELF notes data + */ +ssize_t elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos) +{ + void *src = (void *)(unsigned long)*ppos; + int rc; + + if (elfcorehdr_newmem) { + memcpy(buf, src, count); + } else { + rc = copy_from_oldmem(buf, src, count); + if (rc) + return rc; + } + *ppos += count; + return count; +} From 9cb218131de1c59dca9063b2efe876f053f316af Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Wed, 11 Sep 2013 14:24:51 -0700 Subject: [PATCH 238/303] vmcore: introduce remap_oldmem_pfn_range() For zfcpdump we can't map the HSA storage because it is only available via a read interface. Therefore, for the new vmcore mmap feature we have introduced a new mechanism to create mappings on demand. This patch introduces a new architecture function remap_oldmem_pfn_range() that should be used to create mappings with remap_pfn_range() for oldmem areas that can be directly mapped. For zfcpdump this is everything besides the HSA memory. For the areas that are not mapped by remap_oldmem_pfn_range(), a new generic vmcore fault handler, mmap_vmcore_fault(), is called. This handler works as follows: * Get already available or new page from page cache (find_or_create_page) * Check if /proc/vmcore page is filled with data (PageUptodate) * If yes: Return that page * If no: Fill page using __vmcore_read(), set PageUptodate, and return page Signed-off-by: Michael Holzheu Acked-by: Vivek Goyal Cc: HATAYAMA Daisuke Cc: Jan Willeke Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/vmcore.c | 91 ++++++++++++++++++++++++++++++++++---- include/linux/crash_dump.h | 3 ++ 2 files changed, 86 insertions(+), 8 deletions(-) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 02cb3ff108bc..d07b70a6eed5 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include "internal.h" @@ -153,11 +154,35 @@ ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos) return read_from_oldmem(buf, count, ppos, 0); } +/* + * Architectures may override this function to map oldmem + */ +int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma, + unsigned long from, unsigned long pfn, + unsigned long size, pgprot_t prot) +{ + return remap_pfn_range(vma, from, pfn, size, prot); +} + +/* + * Copy to either kernel or user space + */ +static int copy_to(void *target, void *src, size_t size, int userbuf) +{ + if (userbuf) { + if (copy_to_user((char __user *) target, src, size)) + return -EFAULT; + } else { + memcpy(target, src, size); + } + return 0; +} + /* Read from the ELF header and then the crash dump. On error, negative value is * returned otherwise number of bytes read are returned.
*/ -static ssize_t read_vmcore(struct file *file, char __user *buffer, - size_t buflen, loff_t *fpos) +static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos, + int userbuf) { ssize_t acc = 0, tmp; size_t tsz; @@ -174,7 +199,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, /* Read ELF core header */ if (*fpos < elfcorebuf_sz) { tsz = min(elfcorebuf_sz - (size_t)*fpos, buflen); - if (copy_to_user(buffer, elfcorebuf + *fpos, tsz)) + if (copy_to(buffer, elfcorebuf + *fpos, tsz, userbuf)) return -EFAULT; buflen -= tsz; *fpos += tsz; @@ -192,7 +217,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen); kaddr = elfnotes_buf + *fpos - elfcorebuf_sz; - if (copy_to_user(buffer, kaddr, tsz)) + if (copy_to(buffer, kaddr, tsz, userbuf)) return -EFAULT; buflen -= tsz; *fpos += tsz; @@ -208,7 +233,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, if (*fpos < m->offset + m->size) { tsz = min_t(size_t, m->offset + m->size - *fpos, buflen); start = m->paddr + *fpos - m->offset; - tmp = read_from_oldmem(buffer, tsz, &start, 1); + tmp = read_from_oldmem(buffer, tsz, &start, userbuf); if (tmp < 0) return tmp; buflen -= tsz; @@ -225,6 +250,55 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, return acc; } +static ssize_t read_vmcore(struct file *file, char __user *buffer, + size_t buflen, loff_t *fpos) +{ + return __read_vmcore((__force char *) buffer, buflen, fpos, 1); +} + +/* + * The vmcore fault handler uses the page cache and fills data using the + * standard __vmcore_read() function. + * + * On s390 the fault handler is used for memory regions that can't be mapped + * directly with remap_pfn_range(). + */ +static int mmap_vmcore_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ +#ifdef CONFIG_S390 + struct address_space *mapping = vma->vm_file->f_mapping; + pgoff_t index = vmf->pgoff; + struct page *page; + loff_t offset; + char *buf; + int rc; + + page = find_or_create_page(mapping, index, GFP_KERNEL); + if (!page) + return VM_FAULT_OOM; + if (!PageUptodate(page)) { + offset = (loff_t) index << PAGE_CACHE_SHIFT; + buf = __va((page_to_pfn(page) << PAGE_SHIFT)); + rc = __read_vmcore(buf, PAGE_SIZE, &offset, 0); + if (rc < 0) { + unlock_page(page); + page_cache_release(page); + return (rc == -ENOMEM) ? 
VM_FAULT_OOM : VM_FAULT_SIGBUS; + } + SetPageUptodate(page); + } + unlock_page(page); + vmf->page = page; + return 0; +#else + return VM_FAULT_SIGBUS; +#endif +} + +static const struct vm_operations_struct vmcore_mmap_ops = { + .fault = mmap_vmcore_fault, +}; + /** * alloc_elfnotes_buf - allocate buffer for ELF note segment in * vmalloc memory @@ -271,6 +345,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC); vma->vm_flags |= VM_MIXEDMAP; + vma->vm_ops = &vmcore_mmap_ops; len = 0; @@ -312,9 +387,9 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) tsz = min_t(size_t, m->offset + m->size - start, size); paddr = m->paddr + start - m->offset; - if (remap_pfn_range(vma, vma->vm_start + len, - paddr >> PAGE_SHIFT, tsz, - vma->vm_page_prot)) + if (remap_oldmem_pfn_range(vma, vma->vm_start + len, + paddr >> PAGE_SHIFT, tsz, + vma->vm_page_prot)) goto fail; size -= tsz; start += tsz; diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 6571f828e313..fe68a5a98583 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -17,6 +17,9 @@ extern int __weak elfcorehdr_alloc(unsigned long long *addr, extern void __weak elfcorehdr_free(unsigned long long addr); extern ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos); extern ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos); +extern int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma, + unsigned long from, unsigned long pfn, + unsigned long size, pgprot_t prot); extern ssize_t copy_oldmem_page(unsigned long, char *, size_t, unsigned long, int); From 23df79da8eb97757e39af7625665c1c5cecc610b Mon Sep 17 00:00:00 2001 From: Jan Willeke Date: Wed, 11 Sep 2013 14:24:52 -0700 Subject: [PATCH 239/303] s390/vmcore: implement remap_oldmem_pfn_range for s390 Introduce the s390 specific way to map pages from oldmem. The memory area below OLDMEM_SIZE is mapped with offset OLDMEM_BASE. The other old memory is mapped directly. Signed-off-by: Jan Willeke Signed-off-by: Michael Holzheu Cc: HATAYAMA Daisuke Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/kernel/crash_dump.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index 0c9a897a1fb5..3e776158b330 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -98,6 +98,32 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, return (rc == 0) ? 
csize : rc; } +/* + * Remap "oldmem" + * + * For the kdump reserved memory this functions performs a swap operation: + * [0 - OLDMEM_SIZE] is mapped to [OLDMEM_BASE - OLDMEM_BASE + OLDMEM_SIZE] + */ +int remap_oldmem_pfn_range(struct vm_area_struct *vma, unsigned long from, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + unsigned long size_old; + int rc; + + if (pfn < OLDMEM_SIZE >> PAGE_SHIFT) { + size_old = min(size, OLDMEM_SIZE - (pfn << PAGE_SHIFT)); + rc = remap_pfn_range(vma, from, + pfn + (OLDMEM_BASE >> PAGE_SHIFT), + size_old, prot); + if (rc || size == size_old) + return rc; + size -= size_old; + from += size_old; + pfn += size_old >> PAGE_SHIFT; + } + return remap_pfn_range(vma, from, pfn, size, prot); +} + /* * Copy memory from old kernel */ From 11e376a3f9ffa85bf444b65df5326612b083c501 Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Wed, 11 Sep 2013 14:24:53 -0700 Subject: [PATCH 240/303] vmcore: enable /proc/vmcore mmap for s390 The patch "s390/vmcore: Implement remap_oldmem_pfn_range for s390" now allows mmap to be used on s390 as well. So enable mmap for s390 again. Signed-off-by: Michael Holzheu Cc: HATAYAMA Daisuke Cc: Jan Willeke Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/vmcore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index d07b70a6eed5..9100d6959886 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -327,7 +327,7 @@ static inline char *alloc_elfnotes_buf(size_t notes_sz) * regions in the 1st kernel pointed to by PT_LOAD entries) into * virtually contiguous user-space in ELF layout. */ -#if defined(CONFIG_MMU) && !defined(CONFIG_S390) +#ifdef CONFIG_MMU static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) { size_t size = vma->vm_end - vma->vm_start; From 6f79d33228fa7cf900826738a39f287cae96cd91 Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Wed, 11 Sep 2013 14:24:54 -0700 Subject: [PATCH 241/303] s390/vmcore: use vmcore for zfcpdump Modify the s390 copy_oldmem_page() and remap_oldmem_pfn_range() functions for zfcpdump to read from the HSA memory if memory below HSA_SIZE bytes is requested. Otherwise real memory is used. Signed-off-by: Michael Holzheu Cc: HATAYAMA Daisuke Cc: Jan Willeke Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/Kconfig | 3 +- arch/s390/include/asm/sclp.h | 1 + arch/s390/kernel/crash_dump.c | 122 +++++++++++++++++++++++++++++----- drivers/s390/char/zcore.c | 6 +- 4 files changed, 110 insertions(+), 22 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index fb2723e8ba65..3ec272859e1e 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -526,6 +526,7 @@ config CRASH_DUMP bool "kernel crash dumps" depends on 64BIT && SMP select KEXEC + select ZFCPDUMP help Generate crash dump after being started by kexec. Crash dump kernels are loaded in the main kernel with kexec-tools @@ -536,7 +537,7 @@ config CRASH_DUMP config ZFCPDUMP def_bool n prompt "zfcpdump support" - select SMP + depends on SMP help Select this option if you want to build an zfcpdump enabled kernel. Refer to for more details on this.
diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index 06a136136047..7dc7f9c63b65 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -56,5 +56,6 @@ bool sclp_has_linemode(void); bool sclp_has_vt220(void); int sclp_pci_configure(u32 fid); int sclp_pci_deconfigure(u32 fid); +int memcpy_hsa(void *dest, unsigned long src, size_t count, int mode); #endif /* _ASM_S390_SCLP_H */ diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index 3e776158b330..c84f33d51f7b 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -16,6 +16,7 @@ #include #include #include +#include #define PTR_ADD(x, y) (((char *) (x)) + ((unsigned long) (y))) #define PTR_SUB(x, y) (((char *) (x)) - ((unsigned long) (y))) @@ -69,22 +70,41 @@ static ssize_t copy_page_real(void *buf, void *src, size_t csize) static void *elfcorehdr_newmem; /* - * Copy one page from "oldmem" + * Copy one page from zfcpdump "oldmem" + * + * For pages below ZFCPDUMP_HSA_SIZE memory from the HSA is copied. Otherwise + * real memory copy is used. + */ +static ssize_t copy_oldmem_page_zfcpdump(char *buf, size_t csize, + unsigned long src, int userbuf) +{ + int rc; + + if (src < ZFCPDUMP_HSA_SIZE) { + rc = memcpy_hsa(buf, src, csize, userbuf); + } else { + if (userbuf) + rc = copy_to_user_real((void __force __user *) buf, + (void *) src, csize); + else + rc = memcpy_real(buf, (void *) src, csize); + } + return rc ? rc : csize; +} + +/* + * Copy one page from kdump "oldmem" * * For the kdump reserved memory this functions performs a swap operation: * - [OLDMEM_BASE - OLDMEM_BASE + OLDMEM_SIZE] is mapped to [0 - OLDMEM_SIZE]. * - [0 - OLDMEM_SIZE] is mapped to [OLDMEM_BASE - OLDMEM_BASE + OLDMEM_SIZE] */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, int userbuf) +static ssize_t copy_oldmem_page_kdump(char *buf, size_t csize, + unsigned long src, int userbuf) + { - unsigned long src; int rc; - if (!csize) - return 0; - - src = (pfn << PAGE_SHIFT) + offset; if (src < OLDMEM_SIZE) src += OLDMEM_BASE; else if (src > OLDMEM_BASE && @@ -95,17 +115,35 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, (void *) src, csize); else rc = copy_page_real(buf, (void *) src, csize); - return (rc == 0) ? csize : rc; + return (rc == 0) ? rc : csize; } /* - * Remap "oldmem" + * Copy one page from "oldmem" + */ +ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, + unsigned long offset, int userbuf) +{ + unsigned long src; + + if (!csize) + return 0; + src = (pfn << PAGE_SHIFT) + offset; + if (OLDMEM_BASE) + return copy_oldmem_page_kdump(buf, csize, src, userbuf); + else + return copy_oldmem_page_zfcpdump(buf, csize, src, userbuf); +} + +/* + * Remap "oldmem" for kdump * * For the kdump reserved memory this functions performs a swap operation: * [0 - OLDMEM_SIZE] is mapped to [OLDMEM_BASE - OLDMEM_BASE + OLDMEM_SIZE] */ -int remap_oldmem_pfn_range(struct vm_area_struct *vma, unsigned long from, - unsigned long pfn, unsigned long size, pgprot_t prot) +static int remap_oldmem_pfn_range_kdump(struct vm_area_struct *vma, + unsigned long from, unsigned long pfn, + unsigned long size, pgprot_t prot) { unsigned long size_old; int rc; @@ -124,6 +162,43 @@ int remap_oldmem_pfn_range(struct vm_area_struct *vma, unsigned long from, return remap_pfn_range(vma, from, pfn, size, prot); } +/* + * Remap "oldmem" for zfcpdump + * + * We only map available memory above ZFCPDUMP_HSA_SIZE. 
Memory below + * ZFCPDUMP_HSA_SIZE is read on demand using the copy_oldmem_page() function. + */ +static int remap_oldmem_pfn_range_zfcpdump(struct vm_area_struct *vma, + unsigned long from, + unsigned long pfn, + unsigned long size, pgprot_t prot) +{ + unsigned long size_hsa; + + if (pfn < ZFCPDUMP_HSA_SIZE >> PAGE_SHIFT) { + size_hsa = min(size, ZFCPDUMP_HSA_SIZE - (pfn << PAGE_SHIFT)); + if (size == size_hsa) + return 0; + size -= size_hsa; + from += size_hsa; + pfn += size_hsa >> PAGE_SHIFT; + } + return remap_pfn_range(vma, from, pfn, size, prot); +} + +/* + * Remap "oldmem" for kdump or zfcpdump + */ +int remap_oldmem_pfn_range(struct vm_area_struct *vma, unsigned long from, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + if (OLDMEM_BASE) + return remap_oldmem_pfn_range_kdump(vma, from, pfn, size, prot); + else + return remap_oldmem_pfn_range_zfcpdump(vma, from, pfn, size, + prot); +} + /* * Copy memory from old kernel */ @@ -132,11 +207,21 @@ int copy_from_oldmem(void *dest, void *src, size_t count) unsigned long copied = 0; int rc; - if ((unsigned long) src < OLDMEM_SIZE) { - copied = min(count, OLDMEM_SIZE - (unsigned long) src); - rc = memcpy_real(dest, src + OLDMEM_BASE, copied); - if (rc) - return rc; + if (OLDMEM_BASE) { + if ((unsigned long) src < OLDMEM_SIZE) { + copied = min(count, OLDMEM_SIZE - (unsigned long) src); + rc = memcpy_real(dest, src + OLDMEM_BASE, copied); + if (rc) + return rc; + } + } else { + if ((unsigned long) src < ZFCPDUMP_HSA_SIZE) { + copied = min(count, + ZFCPDUMP_HSA_SIZE - (unsigned long) src); + rc = memcpy_hsa(dest, (unsigned long) src, copied, 0); + if (rc) + return rc; + } } return memcpy_real(dest + copied, src + copied, count - copied); } @@ -466,7 +551,8 @@ int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size) u32 alloc_size; u64 hdr_off; - if (!OLDMEM_BASE) + /* If we are not in kdump or zfcpdump mode return */ + if (!OLDMEM_BASE && ipl_info.type != IPL_TYPE_FCP_DUMP) return 0; /* If elfcorehdr= has been passed via cmdline, we use that one */ if (elfcorehdr_addr != ELFCORE_ADDR_MAX) diff --git a/drivers/s390/char/zcore.c b/drivers/s390/char/zcore.c index 9e5e14686e75..794820a123d0 100644 --- a/drivers/s390/char/zcore.c +++ b/drivers/s390/char/zcore.c @@ -30,8 +30,8 @@ #define TRACE(x...) debug_sprintf_event(zcore_dbf, 1, x) -#define TO_USER 0 -#define TO_KERNEL 1 +#define TO_USER 1 +#define TO_KERNEL 0 #define CHUNK_INFO_SIZE 34 /* 2 16-byte char, each followed by blank */ enum arch_id { @@ -73,7 +73,7 @@ static struct ipl_parameter_block *ipl_block; * @count: Size of buffer, which should be copied * @mode: Either TO_KERNEL or TO_USER */ -static int memcpy_hsa(void *dest, unsigned long src, size_t count, int mode) +int memcpy_hsa(void *dest, unsigned long src, size_t count, int mode) { int offs, blk_num; static char buf[PAGE_SIZE] __attribute__((__aligned__(PAGE_SIZE))); From c2ebdc2439f50c049fd362bb225aaf78fe8e4cb8 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:24:55 -0700 Subject: [PATCH 242/303] partitions/efi: use lba-aware partition records The kernel's GPT implementation currently uses the generic 'struct partition' type for dealing with legacy MBR partition records. While this is useful for disklabels that were designed for CHS addressing, such as msdos, it doesn't adapt well to newer standards that use LBA instead, such as GUID partition tables. Furthermore, these generic partition structures do not have all the required fields to properly follow the UEFI specs.
While a CHS address can be translated to LBA, it's much simpler and cleaner to just replace the partition type. This patch adds a new 'gpt_record' type that is fully compliant with EFI and will allow, in the next patches, to add more checks to properly verify a protective MBR, which is paramount to probing a device that makes use of GPT. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Davidlohr Bueso Reviewed-by: Karel Zak Acked-by: Matt Fleming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/partitions/efi.c | 9 ++++----- block/partitions/efi.h | 16 +++++++++++++++- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/block/partitions/efi.c b/block/partitions/efi.c index c85fc895ecdb..bd8fb22b2109 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -149,12 +149,11 @@ static u64 last_lba(struct block_device *bdev) bdev_logical_block_size(bdev)) - 1ULL; } -static inline int -pmbr_part_valid(struct partition *part) +static inline int pmbr_part_valid(gpt_mbr_record *part) { - if (part->sys_ind == EFI_PMBR_OSTYPE_EFI_GPT && - le32_to_cpu(part->start_sect) == 1UL) - return 1; + if (part->os_type == EFI_PMBR_OSTYPE_EFI_GPT && + le32_to_cpu(part->start_sector) == 1UL) + return 1; return 0; } diff --git a/block/partitions/efi.h b/block/partitions/efi.h index b69ab729558f..e645ecb35bf3 100644 --- a/block/partitions/efi.h +++ b/block/partitions/efi.h @@ -101,11 +101,25 @@ typedef struct _gpt_entry { efi_char16_t partition_name[72 / sizeof (efi_char16_t)]; } __attribute__ ((packed)) gpt_entry; +typedef struct _gpt_mbr_record { + u8 boot_indicator; /* unused by EFI, set to 0x80 for bootable */ + u8 start_head; /* unused by EFI, pt start in CHS */ + u8 start_sector; /* unused by EFI, pt start in CHS */ + u8 start_track; + u8 os_type; /* EFI and legacy non-EFI OS types */ + u8 end_head; /* unused by EFI, pt end in CHS */ + u8 end_sector; /* unused by EFI, pt end in CHS */ + u8 end_track; /* unused by EFI, pt end in CHS */ + __le32 starting_lba; /* used by EFI - start addr of the on disk pt */ + __le32 size_in_lba; /* used by EFI - size of pt in LBA */ +} __packed gpt_mbr_record; + + typedef struct _legacy_mbr { u8 boot_code[440]; __le32 unique_mbr_signature; __le16 unknown; - struct partition partition_record[4]; + gpt_mbr_record partition_record[4]; __le16 signature; } __attribute__ ((packed)) legacy_mbr; From 33afd7a7df1a1f82675857a75572cdf4a8599e9f Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:24:56 -0700 Subject: [PATCH 243/303] partitions/efi: check pmbr record's starting lba Per the UEFI Specs 2.4, June 2013, the starting lba of the partition that has the EFI GPT (0xEE) must be set to 0x00000001 - this is obviously the LBA of the GPT Partition Header. 
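For reference, both conditions can be checked from userspace against a raw first sector; a hedged sketch (offsets follow the classic MBR layout; a little-endian host is assumed):

	#include <stdint.h>
	#include <string.h>

	#define PART_TABLE_OFF	446	/* first partition record in the MBR */
	#define PART_REC_SIZE	16

	/* returns 1 if 'sector' (512 bytes) looks like a protective MBR */
	int pmbr_looks_protective(const uint8_t *sector)
	{
		int i;

		if (sector[510] != 0x55 || sector[511] != 0xAA)
			return 0;			/* no MSDOS signature */
		for (i = 0; i < 4; i++) {
			const uint8_t *p = sector + PART_TABLE_OFF + i * PART_REC_SIZE;
			uint32_t starting_lba;

			memcpy(&starting_lba, p + 8, 4);	/* little-endian field */
			if (p[4] == 0xEE && starting_lba == 1)
				return 1;	/* 0xEE record, GPT header at LBA 1 */
		}
		return 0;
	}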
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Davidlohr Bueso Reviewed-by: Karel Zak Acked-by: Matt Fleming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/partitions/efi.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/block/partitions/efi.c b/block/partitions/efi.c index bd8fb22b2109..7a2b74f0d06f 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -151,10 +151,19 @@ static u64 last_lba(struct block_device *bdev) static inline int pmbr_part_valid(gpt_mbr_record *part) { - if (part->os_type == EFI_PMBR_OSTYPE_EFI_GPT && - le32_to_cpu(part->start_sector) == 1UL) - return 1; - return 0; + if (part->os_type != EFI_PMBR_OSTYPE_EFI_GPT) + goto invalid; + + /* set to 0x00000001 (i.e., the LBA of the GPT Partition Header) */ + if (le32_to_cpu(part->starting_lba) != GPT_PRIMARY_PARTITION_TABLE_LBA) + goto invalid; + + if (le32_to_cpu(part->start_sector) != 1UL) + goto invalid; + + return 1; +invalid: + return 0; } /** From 3e69ac344007bec5e3987ac86619e140fbc79b72 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:24:57 -0700 Subject: [PATCH 244/303] partitions/efi: do not require gpt partition to begin at sector 1 When detecting a valid protective MBR, the Linux kernel isn't picky about the partition (1-4) the 0xEE is at, but, unlike other operating systems, it does require it to begin at the second sector (sector 1). This check, apart from not being enforced by UEFI, can cause Linux to fail to detect otherwise *valid* partitions on the disk, and it presents problems when dealing with hybrid MBRs[1]. For compatibility reasons, if the first partition is hybridized, the 0xEE partition must be small enough to ensure that it only protects the GPT data structures - as opposed to the whole disk in a protective MBR. This problem is very well described by Rod Smith[1]: MBR-only partitioning programs (such as older versions of fdisk) can see some of the disk space as unallocated, thus defeating the purpose of the 0xEE partition's protection of GPT data structures. By dropping this check, this patch enables Linux to be more flexible when probing for GPT disklabels. [1] http://www.rodsbooks.com/gdisk/hybrid.html#reactions Signed-off-by: Davidlohr Bueso Reviewed-by: Karel Zak Acked-by: Matt Fleming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/partitions/efi.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/block/partitions/efi.c b/block/partitions/efi.c index 7a2b74f0d06f..1b499dc8fc78 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -158,9 +158,6 @@ static inline int pmbr_part_valid(gpt_mbr_record *part) if (le32_to_cpu(part->starting_lba) != GPT_PRIMARY_PARTITION_TABLE_LBA) goto invalid; - if (le32_to_cpu(part->start_sector) != 1UL) - goto invalid; - return 1; invalid: return 0; From b05ebbbbeb67a420d06567c6b9618a9e644d6104 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:24:58 -0700 Subject: [PATCH 245/303] partitions/efi: detect hybrid MBRs One of the biggest problems with GPT is compatibility with older, non-GPT systems. The problem is addressed by creating hybrid mbrs, an extension, or variant, of the traditional protective mbr. This contains, apart from the 0xEE partition, up to three additional primary partitions that point to the same space marked by up to three GPT partitions.
The result is that legacy OSs can see the three required MBR partitions and at the same time ignore the GPT-aware partitions that protect the GPT structures. While hybrid MBRs are hacks, workarounds and simply not part of the GPT standard, they do exist and we have no way around them. For instance, by default, OSX creates a hybrid scheme when using multi-OS booting. In order for Linux to properly discover protective MBRs, it must be made aware of devices that have hybrid MBRs. No functionality is changed by this patch, just a debug message informing the user of the MBR scheme that is being used. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Davidlohr Bueso Reviewed-by: Karel Zak Acked-by: Matt Fleming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/partitions/efi.c | 74 ++++++++++++++++++++++++++++++------------ block/partitions/efi.h | 3 ++ 2 files changed, 56 insertions(+), 21 deletions(-) diff --git a/block/partitions/efi.c b/block/partitions/efi.c index 1b499dc8fc78..e3cb4f19cf6d 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -158,7 +158,7 @@ static inline int pmbr_part_valid(gpt_mbr_record *part) if (le32_to_cpu(part->starting_lba) != GPT_PRIMARY_PARTITION_TABLE_LBA) goto invalid; - return 1; + return GPT_MBR_PROTECTIVE; invalid: return 0; } @@ -167,21 +167,48 @@ invalid: * is_pmbr_valid(): test Protective MBR for validity * @mbr: pointer to a legacy mbr structure * - * Description: Returns 1 if PMBR is valid, 0 otherwise. - * Validity depends on two things: + * Description: Checks for a valid protective or hybrid + * master boot record (MBR). The validity of a pMBR depends + * on all of the following properties: * 1) MSDOS signature is in the last two bytes of the MBR * 2) One partition of type 0xEE is found + * + * In addition, a hybrid MBR will have up to three additional + * primary partitions, which point to the same space that's + * marked out by up to three GPT partitions. + * + * Returns 0 upon invalid MBR, or GPT_MBR_PROTECTIVE or + * GPT_MBR_HYBRID depending on the device layout. */ -static int -is_pmbr_valid(legacy_mbr *mbr) +static int is_pmbr_valid(legacy_mbr *mbr) { - int i; + int i, ret = 0; /* invalid by default */ + if (!mbr || le16_to_cpu(mbr->signature) != MSDOS_MBR_SIGNATURE) - return 0; + goto done; + + for (i = 0; i < 4; i++) { + ret = pmbr_part_valid(&mbr->partition_record[i]); + if (ret == GPT_MBR_PROTECTIVE) { + /* + * Ok, we at least know that there's a protective MBR, + * now check if there are other partition types for + * hybrid MBR. + */ + goto check_hybrid; + } + } + + if (ret != GPT_MBR_PROTECTIVE) + goto done; +check_hybrid: for (i = 0; i < 4; i++) - if (pmbr_part_valid(&mbr->partition_record[i])) - return 1; - return 0; + if ((mbr->partition_record[i].os_type != + EFI_PMBR_OSTYPE_EFI_GPT) && + (mbr->partition_record[i].os_type != 0x00)) + ret = GPT_MBR_HYBRID; +done: + return ret; } /** @@ -548,17 +575,22 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, lastlba = last_lba(state->bdev); if (!force_gpt) { - /* This will be added to the EFI Spec. per Intel after v1.02. */ - legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL); - if (legacymbr) { - read_lba(state, 0, (u8 *) legacymbr, - sizeof (*legacymbr)); - good_pmbr = is_pmbr_valid(legacymbr); - kfree(legacymbr); - } - if (!good_pmbr) - goto fail; - } + /* This will be added to the EFI Spec. per Intel after v1.02. 
*/ + legacymbr = kzalloc(sizeof(*legacymbr), GFP_KERNEL); + if (!legacymbr) + goto fail; + + read_lba(state, 0, (u8 *)legacymbr, sizeof(*legacymbr)); + good_pmbr = is_pmbr_valid(legacymbr); + kfree(legacymbr); + + if (!good_pmbr) + goto fail; + + pr_debug("Device has a %s MBR\n", + good_pmbr == GPT_MBR_PROTECTIVE ? + "protective" : "hybrid"); + } good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA, &pgpt, &pptes); diff --git a/block/partitions/efi.h b/block/partitions/efi.h index e645ecb35bf3..7fef625c04de 100644 --- a/block/partitions/efi.h +++ b/block/partitions/efi.h @@ -37,6 +37,9 @@ #define EFI_PMBR_OSTYPE_EFI 0xEF #define EFI_PMBR_OSTYPE_EFI_GPT 0xEE +#define GPT_MBR_PROTECTIVE 1 +#define GPT_MBR_HYBRID 2 + #define GPT_HEADER_SIGNATURE 0x5452415020494645ULL #define GPT_HEADER_REVISION_V1 0x00010000 #define GPT_PRIMARY_PARTITION_TABLE_LBA 1 From 27a7c642174eaec627f6a3a254035bf8abd02c5e Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:25:00 -0700 Subject: [PATCH 246/303] partitions/efi: account for pmbr size in lba The partition that has the 0xEE (GPT protective) OS type must have its size-in-LBA field set to the lesser of the size of the disk minus one, or 0xFFFFFFFF for larger disks. Signed-off-by: Davidlohr Bueso Reviewed-by: Karel Zak Acked-by: Matt Fleming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/partitions/efi.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/block/partitions/efi.c b/block/partitions/efi.c index e3cb4f19cf6d..b028af688361 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -166,6 +166,7 @@ invalid: /** * is_pmbr_valid(): test Protective MBR for validity * @mbr: pointer to a legacy mbr structure + * @total_sectors: amount of sectors in the device * * Description: Checks for a valid protective or hybrid * master boot record (MBR). The validity of a pMBR depends @@ -180,9 +181,9 @@ invalid: * Returns 0 upon invalid MBR, or GPT_MBR_PROTECTIVE or * GPT_MBR_HYBRID depending on the device layout. */ -static int is_pmbr_valid(legacy_mbr *mbr) +static int is_pmbr_valid(legacy_mbr *mbr, sector_t total_sectors) { - int i, ret = 0; /* invalid by default */ + int i, part = 0, ret = 0; /* invalid by default */ if (!mbr || le16_to_cpu(mbr->signature) != MSDOS_MBR_SIGNATURE) goto done; @@ -190,6 +191,7 @@ static int is_pmbr_valid(legacy_mbr *mbr) for (i = 0; i < 4; i++) { ret = pmbr_part_valid(&mbr->partition_record[i]); if (ret == GPT_MBR_PROTECTIVE) { + part = i; /* * Ok, we at least know that there's a protective MBR, * now check if there are other partition types for @@ -207,6 +209,18 @@ check_hybrid: EFI_PMBR_OSTYPE_EFI_GPT) && (mbr->partition_record[i].os_type != 0x00)) ret = GPT_MBR_HYBRID; + + /* + * Protective MBRs take up the lesser of the whole disk + * or 2 TiB (32bit LBA), ignoring the rest of the disk. + * + * Hybrid MBRs do not necessarily comply with this.
+ */ + if (ret == GPT_MBR_PROTECTIVE) { + if (le32_to_cpu(mbr->partition_record[part].size_in_lba) != + min((uint32_t) total_sectors - 1, 0xFFFFFFFF)) + ret = 0; + } done: return ret; } @@ -568,6 +582,7 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, gpt_header *pgpt = NULL, *agpt = NULL; gpt_entry *pptes = NULL, *aptes = NULL; legacy_mbr *legacymbr; + sector_t total_sectors = i_size_read(state->bdev->bd_inode) >> 9; u64 lastlba; if (!ptes) @@ -581,7 +596,7 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, goto fail; read_lba(state, 0, (u8 *)legacymbr, sizeof(*legacymbr)); - good_pmbr = is_pmbr_valid(legacymbr); + good_pmbr = is_pmbr_valid(legacymbr, total_sectors); kfree(legacymbr); if (!good_pmbr) From aa054bc93743ecce3a27f1655d59674dabc71a54 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:25:01 -0700 Subject: [PATCH 247/303] partitions/efi: compare first and last usable LBAs When verifying GPT header integrity, make sure that first usable LBA is smaller than last usable LBA. Signed-off-by: Davidlohr Bueso Reviewed-by: Karel Zak Acked-by: Matt Fleming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/partitions/efi.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/block/partitions/efi.c b/block/partitions/efi.c index b028af688361..de9f9bfa24bc 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -410,7 +410,12 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, (unsigned long long)lastlba); goto fail; } - + if (le64_to_cpu((*gpt)->last_usable_lba) < le64_to_cpu((*gpt)->first_usable_lba)) { + pr_debug("GPT: last_usable_lba incorrect: %lld > %lld\n", + (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba), + (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba)); + goto fail; + } /* Check that sizeof_partition_entry has the correct value */ if (le32_to_cpu((*gpt)->sizeof_partition_entry) != sizeof(gpt_entry)) { pr_debug("GUID Partitition Entry Size check failed.\n"); From 08009b30a71d9a7c252c4bd677dbd496af9dd1a2 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:25:02 -0700 Subject: [PATCH 248/303] partitions/efi: delete annoying emacs style comments I love emacs, but these settings for coding style are annoying when trying to open the efi.h file. More important, we already have checkpatch for that. Signed-off-by: Davidlohr Bueso Reviewed-by: Karel Zak Acked-by: Matt Fleming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/partitions/efi.h | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/block/partitions/efi.h b/block/partitions/efi.h index 7fef625c04de..4efcafba7e64 100644 --- a/block/partitions/efi.h +++ b/block/partitions/efi.h @@ -130,22 +130,3 @@ typedef struct _legacy_mbr { extern int efi_partition(struct parsed_partitions *state); #endif - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. 
- * -------------------------------------------------------------------------- - * Local variables: - * c-indent-level: 4 - * c-brace-imaginary-offset: 0 - * c-brace-offset: -4 - * c-argdecl-indent: 4 - * c-label-offset: -4 - * c-continued-statement-offset: 4 - * c-continued-brace-offset: 0 - * indent-tabs-mode: nil - * tab-width: 8 - * End: - */ From 70f637e90ea96187530365eb1ddff8d483ba460e Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:25:03 -0700 Subject: [PATCH 249/303] partitions/efi: some style cleanups Trivial coding style cleanups - still plenty left. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Davidlohr Bueso Reviewed-by: Karel Zak Acked-by: Matt Fleming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/partitions/efi.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/block/partitions/efi.c b/block/partitions/efi.c index de9f9bfa24bc..0df535fac0aa 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -25,6 +25,9 @@ * TODO: * * Changelog: + * Mon August 5th, 2013 Davidlohr Bueso + * - detect hybrid MBRs, tighter pMBR checking & cleanups. + * * Mon Nov 09 2004 Matt Domsch * - test for valid PMBR and valid PGPT before ever reading * AGPT, allow override with 'gpt' kernel command line option. @@ -289,8 +292,7 @@ static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state, return NULL; if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba), - (u8 *) pte, - count) < count) { + (u8 *) pte, count) < count) { kfree(pte); pte=NULL; return NULL; @@ -633,11 +635,8 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, *ptes = pptes; kfree(agpt); kfree(aptes); - if (!good_agpt) { - printk(KERN_WARNING - "Alternate GPT is invalid, " - "using primary GPT.\n"); - } + if (!good_agpt) + printk(KERN_WARNING "Alternate GPT is invalid, using primary GPT.\n"); return 1; } else if (good_agpt) { @@ -645,8 +644,7 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, *ptes = aptes; kfree(pgpt); kfree(pptes); - printk(KERN_WARNING - "Primary GPT is invalid, using alternate GPT.\n"); + printk(KERN_WARNING "Primary GPT is invalid, using alternate GPT.\n"); return 1; } @@ -708,8 +706,7 @@ int efi_partition(struct parsed_partitions *state) put_partition(state, i+1, start * ssz, size * ssz); /* If this is a RAID volume, tell md */ - if (!efi_guidcmp(ptes[i].partition_type_guid, - PARTITION_LINUX_RAID_GUID)) + if (!efi_guidcmp(ptes[i].partition_type_guid, PARTITION_LINUX_RAID_GUID)) state->parts[i + 1].flags = ADDPART_FLAG_RAID; info = &state->parts[i + 1].info; From b4bc4a18a226f46fec4ef47f2df28ea209db8b5d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 11 Sep 2013 14:25:04 -0700 Subject: [PATCH 250/303] block/partitions/efi.c: consistently use pr_foo() Cc: Davidlohr Bueso Cc: Karel Zak Cc: Matt Fleming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/partitions/efi.c | 45 ++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/block/partitions/efi.c b/block/partitions/efi.c index 0df535fac0aa..1a5ec9a03c00 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -482,44 +482,42 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba) if (!pgpt || !agpt) return; if (le64_to_cpu(pgpt->my_lba) != le64_to_cpu(agpt->alternate_lba)) { - printk(KERN_WARNING - "GPT:Primary header LBA != Alt. 
header alternate_lba\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", + pr_warn("GPT:Primary header LBA != Alt. header alternate_lba\n"); + pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->my_lba), (unsigned long long)le64_to_cpu(agpt->alternate_lba)); error_found++; } if (le64_to_cpu(pgpt->alternate_lba) != le64_to_cpu(agpt->my_lba)) { - printk(KERN_WARNING - "GPT:Primary header alternate_lba != Alt. header my_lba\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", + pr_warn("GPT:Primary header alternate_lba != Alt. header my_lba\n"); + pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->alternate_lba), (unsigned long long)le64_to_cpu(agpt->my_lba)); error_found++; } if (le64_to_cpu(pgpt->first_usable_lba) != le64_to_cpu(agpt->first_usable_lba)) { - printk(KERN_WARNING "GPT:first_usable_lbas don't match.\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", + pr_warn("GPT:first_usable_lbas don't match.\n"); + pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->first_usable_lba), (unsigned long long)le64_to_cpu(agpt->first_usable_lba)); error_found++; } if (le64_to_cpu(pgpt->last_usable_lba) != le64_to_cpu(agpt->last_usable_lba)) { - printk(KERN_WARNING "GPT:last_usable_lbas don't match.\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", + pr_warn("GPT:last_usable_lbas don't match.\n"); + pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->last_usable_lba), (unsigned long long)le64_to_cpu(agpt->last_usable_lba)); error_found++; } if (efi_guidcmp(pgpt->disk_guid, agpt->disk_guid)) { - printk(KERN_WARNING "GPT:disk_guids don't match.\n"); + pr_warn("GPT:disk_guids don't match.\n"); error_found++; } if (le32_to_cpu(pgpt->num_partition_entries) != le32_to_cpu(agpt->num_partition_entries)) { - printk(KERN_WARNING "GPT:num_partition_entries don't match: " + pr_warn("GPT:num_partition_entries don't match: " "0x%x != 0x%x\n", le32_to_cpu(pgpt->num_partition_entries), le32_to_cpu(agpt->num_partition_entries)); @@ -527,8 +525,7 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba) } if (le32_to_cpu(pgpt->sizeof_partition_entry) != le32_to_cpu(agpt->sizeof_partition_entry)) { - printk(KERN_WARNING - "GPT:sizeof_partition_entry values don't match: " + pr_warn("GPT:sizeof_partition_entry values don't match: " "0x%x != 0x%x\n", le32_to_cpu(pgpt->sizeof_partition_entry), le32_to_cpu(agpt->sizeof_partition_entry)); @@ -536,34 +533,30 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba) } if (le32_to_cpu(pgpt->partition_entry_array_crc32) != le32_to_cpu(agpt->partition_entry_array_crc32)) { - printk(KERN_WARNING - "GPT:partition_entry_array_crc32 values don't match: " + pr_warn("GPT:partition_entry_array_crc32 values don't match: " "0x%x != 0x%x\n", le32_to_cpu(pgpt->partition_entry_array_crc32), le32_to_cpu(agpt->partition_entry_array_crc32)); error_found++; } if (le64_to_cpu(pgpt->alternate_lba) != lastlba) { - printk(KERN_WARNING - "GPT:Primary header thinks Alt. header is not at the end of the disk.\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", + pr_warn("GPT:Primary header thinks Alt. 
header is not at the end of the disk.\n"); + pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->alternate_lba), (unsigned long long)lastlba); error_found++; } if (le64_to_cpu(agpt->my_lba) != lastlba) { - printk(KERN_WARNING - "GPT:Alternate GPT header not at the end of the disk.\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", + pr_warn("GPT:Alternate GPT header not at the end of the disk.\n"); + pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(agpt->my_lba), (unsigned long long)lastlba); error_found++; } if (error_found) - printk(KERN_WARNING - "GPT: Use GNU Parted to correct GPT errors.\n"); + pr_warn("GPT: Use GNU Parted to correct GPT errors.\n"); return; } @@ -636,7 +629,7 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, kfree(agpt); kfree(aptes); if (!good_agpt) - printk(KERN_WARNING "Alternate GPT is invalid, using primary GPT.\n"); + pr_warn("Alternate GPT is invalid, using primary GPT.\n"); return 1; } else if (good_agpt) { @@ -644,7 +637,7 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, *ptes = aptes; kfree(pgpt); kfree(pptes); - printk(KERN_WARNING "Primary GPT is invalid, using alternate GPT.\n"); + pr_warn("Primary GPT is invalid, using alternate GPT.\n"); return 1; } From 9dee5c51516d2c3fff22633c1272c5652e68075a Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Wed, 11 Sep 2013 14:25:10 -0700 Subject: [PATCH 251/303] rbtree: add postorder iteration functions Postorder iteration yields all of a node's children prior to yielding the node itself, and this particular implementation also avoids examining the leaf links in a node after that node has been yielded. In what I expect will be its most common usage, postorder iteration allows the deletion of every node in an rbtree without modifying the rbtree nodes (no _requirement_ that they be nulled) while avoiding referencing child nodes after they have been "deleted" (most commonly, freed). I have only updated zswap to use this functionality at this point, but numerous bits of code (most notably in the filesystem drivers) use a hand rolled postorder iteration that NULLs child links as it traverses the tree. Each of those instances could be replaced with this common implementation. 1 & 2 add rbtree postorder iteration functions. 3 adds testing of the iteration to the rbtree runtime tests 4 allows building the rbtree runtime tests as builtins 5 updates zswap. This patch: Add postorder iteration functions for rbtree. These are useful for safely freeing an entire rbtree without modifying the tree at all. 
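For example, tearing down a whole tree might look like this (a sketch; the struct and field names are made up, and a later patch in this series wraps the same pattern in a helper macro):

	struct thing {
		int val;
		struct rb_node node;
	};

	static void free_all(struct rb_root *root)
	{
		struct rb_node *rb = rb_first_postorder(root);

		while (rb) {
			/* fetch the successor before freeing the current node */
			struct rb_node *next = rb_next_postorder(rb);

			kfree(rb_entry(rb, struct thing, node));
			rb = next;
		}
		*root = RB_ROOT;
	}

Since every node is yielded only after both of its children, the iteration never follows a link inside an already-freed node.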
Signed-off-by: Cody P Schafer Reviewed-by: Seth Jennings Cc: David Woodhouse Cc: Rik van Riel Cc: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rbtree.h | 4 ++++ lib/rbtree.c | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 0022c1bb1e26..c467151e9950 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -68,6 +68,10 @@ extern struct rb_node *rb_prev(const struct rb_node *); extern struct rb_node *rb_first(const struct rb_root *); extern struct rb_node *rb_last(const struct rb_root *); +/* Postorder iteration - always visit the parent after its children */ +extern struct rb_node *rb_first_postorder(const struct rb_root *); +extern struct rb_node *rb_next_postorder(const struct rb_node *); + /* Fast replacement of a single node without remove/rebalance/add/rebalance */ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); diff --git a/lib/rbtree.c b/lib/rbtree.c index c0e31fe2fabf..65f4effd117f 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -518,3 +518,43 @@ void rb_replace_node(struct rb_node *victim, struct rb_node *new, *new = *victim; } EXPORT_SYMBOL(rb_replace_node); + +static struct rb_node *rb_left_deepest_node(const struct rb_node *node) +{ + for (;;) { + if (node->rb_left) + node = node->rb_left; + else if (node->rb_right) + node = node->rb_right; + else + return (struct rb_node *)node; + } +} + +struct rb_node *rb_next_postorder(const struct rb_node *node) +{ + const struct rb_node *parent; + if (!node) + return NULL; + parent = rb_parent(node); + + /* If we're sitting on node, we've already seen our children */ + if (parent && node == parent->rb_left && parent->rb_right) { + /* If we are the parent's left node, go to the parent's right + * node then all the way down to the left */ + return rb_left_deepest_node(parent->rb_right); + } else + /* Otherwise we are the parent's right node, and the parent + * should be next */ + return (struct rb_node *)parent; +} +EXPORT_SYMBOL(rb_next_postorder); + +struct rb_node *rb_first_postorder(const struct rb_root *root) +{ + if (!root->rb_node) + return NULL; + + return rb_left_deepest_node(root->rb_node); +} +EXPORT_SYMBOL(rb_first_postorder); From 2b529089257705499207ce7da9d0e3ae26a844ba Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Wed, 11 Sep 2013 14:25:11 -0700 Subject: [PATCH 252/303] rbtree: add rbtree_postorder_for_each_entry_safe() helper Because deletion (of the entire tree) is a relatively common use of the rbtree_postorder iteration, and because doing it safely means fiddling with temporary storage, provide a helper to simplify postorder rbtree iteration. Signed-off-by: Cody P Schafer Reviewed-by: Seth Jennings Cc: David Woodhouse Cc: Rik van Riel Cc: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rbtree.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index c467151e9950..aa870a4ddf54 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -85,4 +85,22 @@ static inline void rb_link_node(struct rb_node * node, struct rb_node * parent, *rb_link = node; } +/** + * rbtree_postorder_for_each_entry_safe - iterate over rb_root in post order of + * given type safe against removal of rb_node entry + * + * @pos: the 'type *' to use as a loop cursor. 
+ * @n: another 'type *' to use as temporary storage + * @root: 'rb_root *' of the rbtree. + * @field: the name of the rb_node field within 'type'. + */ +#define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \ + for (pos = rb_entry(rb_first_postorder(root), typeof(*pos), field),\ + n = rb_entry(rb_next_postorder(&pos->field), \ + typeof(*pos), field); \ + &pos->field; \ + pos = n, \ + n = rb_entry(rb_next_postorder(&pos->field), \ + typeof(*pos), field)) + #endif /* _LINUX_RBTREE_H */ From a791a62fdf288b2658646e2052400d456874790e Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Wed, 11 Sep 2013 14:25:17 -0700 Subject: [PATCH 253/303] rbtree_test: add test for postorder iteration Just check that we examine all nodes in the tree for the postorder iteration. Signed-off-by: Cody P Schafer Reviewed-by: Seth Jennings Cc: David Woodhouse Cc: Rik van Riel Cc: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree_test.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c index 122f02f9941b..31dd4ccd3baa 100644 --- a/lib/rbtree_test.c +++ b/lib/rbtree_test.c @@ -114,6 +114,16 @@ static int black_path_count(struct rb_node *rb) return count; } +static void check_postorder(int nr_nodes) +{ + struct rb_node *rb; + int count = 0; + for (rb = rb_first_postorder(&root); rb; rb = rb_next_postorder(rb)) + count++; + + WARN_ON_ONCE(count != nr_nodes); +} + static void check(int nr_nodes) { struct rb_node *rb; @@ -136,6 +146,8 @@ static void check(int nr_nodes) WARN_ON_ONCE(count != nr_nodes); WARN_ON_ONCE(count < (1 << black_path_count(rb_last(&root))) - 1); + + check_postorder(nr_nodes); } static void check_augmented(int nr_nodes) From 7c993e11aa59d9d1cefbd6acc8d84f2d8d46545a Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Wed, 11 Sep 2013 14:25:19 -0700 Subject: [PATCH 254/303] rbtree: allow tests to run as builtin No reason to require the rbtree test code to be a module; allow it to be builtin (this streamlines my development process). Signed-off-by: Cody P Schafer Reviewed-by: Seth Jennings Cc: David Woodhouse Cc: Rik van Riel Cc: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 652bea9054f0..c9eef36739a9 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1461,7 +1461,7 @@ config BACKTRACE_SELF_TEST config RBTREE_TEST tristate "Red-Black tree test" - depends on m && DEBUG_KERNEL + depends on DEBUG_KERNEL help A benchmark measuring the performance of the rbtree library. Also includes rbtree invariant checks.
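Aside (not part of these patches): with the rbtree_postorder_for_each_entry_safe() helper from patch 252, the open-coded teardown sketched after patch 251 collapses to a single loop (same hypothetical "struct item"):

	static void free_all(struct rb_root *root)
	{
		struct item *pos, *n;

		rbtree_postorder_for_each_entry_safe(pos, n, root, rbnode)
			kfree(pos);
		*root = RB_ROOT;
	}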
From 0bd42136f7ae4ea1375da34c32838fb35eee8c59 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Wed, 11 Sep 2013 14:25:33 -0700 Subject: [PATCH 255/303] mm/zswap: use postorder iteration when destroying rbtree Signed-off-by: Cody P Schafer Reviewed-by: Seth Jennings Cc: David Woodhouse Cc: Rik van Riel Cc: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zswap.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index efed4c8b7f5b..841e35f1db22 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -790,26 +790,14 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset) static void zswap_frontswap_invalidate_area(unsigned type) { struct zswap_tree *tree = zswap_trees[type]; - struct rb_node *node; - struct zswap_entry *entry; + struct zswap_entry *entry, *n; if (!tree) return; /* walk the tree and free everything */ spin_lock(&tree->lock); - /* - * TODO: Even though this code should not be executed because - * the try_to_unuse() in swapoff should have emptied the tree, - * it is very wasteful to rebalance the tree after every - * removal when we are freeing the whole tree. - * - * If post-order traversal code is ever added to the rbtree - * implementation, it should be used here. - */ - while ((node = rb_first(&tree->rbroot))) { - entry = rb_entry(node, struct zswap_entry, rbnode); - rb_erase(&entry->rbnode, &tree->rbroot); + rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) { zbud_free(tree->pool, entry->handle); zswap_entry_cache_free(entry); atomic_dec(&zswap_stored_pages); From 190519cd30884215a63ed875ac074dc97a602522 Mon Sep 17 00:00:00 2001 From: Ed Cashin Date: Wed, 11 Sep 2013 14:25:39 -0700 Subject: [PATCH 256/303] aoe: create and destroy debugfs directory for aoe This series adds the debugging information that the coraid.com-distributed aoe driver exports via sysfs, but instead of sysfs, it uses debugfs. With these patches applied, even without AoE targets on the network, KEDR reports new possible memory leaks, but these are from callers outside the aoe driver that have used aoe_devnode to get the name of the character devices through the aoe_class->devnode callback, and I believe they're responsible for freeing that memory. This patch: Create and destroy the debugfs directory. Signed-off-by: Ed Cashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/aoe/aoeblk.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 916d9ed5c8aa..cb508b754377 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -17,11 +17,13 @@ #include #include #include +#include #include #include "aoe.h" static DEFINE_MUTEX(aoeblk_mutex); static struct kmem_cache *buf_pool_cache; +static struct dentry *aoe_debugfs_dir; /* GPFS needs a larger value than the default. 
*/ static int aoe_maxsectors; @@ -351,6 +353,8 @@ err: void aoeblk_exit(void) { + debugfs_remove_recursive(aoe_debugfs_dir); + aoe_debugfs_dir = NULL; kmem_cache_destroy(buf_pool_cache); } @@ -362,7 +366,11 @@ aoeblk_init(void) 0, 0, NULL); if (buf_pool_cache == NULL) return -ENOMEM; - + aoe_debugfs_dir = debugfs_create_dir("aoe", NULL); + if (IS_ERR_OR_NULL(aoe_debugfs_dir)) { + pr_info("aoe: cannot create debugfs directory\n"); + aoe_debugfs_dir = NULL; + } return 0; } From e8866cf2b90f3a29859d2113c0fd23daf189c282 Mon Sep 17 00:00:00 2001 From: Ed Cashin Date: Wed, 11 Sep 2013 14:25:40 -0700 Subject: [PATCH 257/303] aoe: add AoE-target files to debugfs Signed-off-by: Ed Cashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/aoe/aoe.h | 2 ++ drivers/block/aoe/aoeblk.c | 35 +++++++++++++++++++++++++++++++++++ drivers/block/aoe/aoedev.c | 1 + 3 files changed, 38 insertions(+) diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h index 025c41d3cb33..b1f24c5a6bd1 100644 --- a/drivers/block/aoe/aoe.h +++ b/drivers/block/aoe/aoe.h @@ -169,6 +169,7 @@ struct aoedev { ulong ref; struct work_struct work;/* disk create work struct */ struct gendisk *gd; + struct dentry *debugfs; struct request_queue *blkq; struct hd_geometry geo; sector_t ssize; @@ -206,6 +207,7 @@ struct ktstate { int aoeblk_init(void); void aoeblk_exit(void); void aoeblk_gdalloc(void *); +void aoedisk_rm_debugfs(struct aoedev *d); void aoedisk_rm_sysfs(struct aoedev *d); int aoechr_init(void); diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index cb508b754377..d76c5cb3c708 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -132,6 +132,40 @@ static const struct attribute_group attr_group = { .attrs = aoe_attrs, }; +static const struct file_operations aoe_debugfs_fops; + +static void +aoedisk_add_debugfs(struct aoedev *d) +{ + struct dentry *entry; + char *p; + + if (aoe_debugfs_dir == NULL) + return; + p = strchr(d->gd->disk_name, '/'); + if (p == NULL) + p = d->gd->disk_name; + else + p++; + BUG_ON(*p == '\0'); + entry = debugfs_create_file(p, 0444, aoe_debugfs_dir, d, + &aoe_debugfs_fops); + if (IS_ERR_OR_NULL(entry)) { + pr_info("aoe: cannot create debugfs file for %s\n", + d->gd->disk_name); + return; + } + BUG_ON(d->debugfs); + d->debugfs = entry; +} +void +aoedisk_rm_debugfs(struct aoedev *d) +{ + BUG_ON(d->debugfs == NULL); + debugfs_remove(d->debugfs); + d->debugfs = NULL; +} + static int aoedisk_add_sysfs(struct aoedev *d) { @@ -332,6 +366,7 @@ aoeblk_gdalloc(void *vp) add_disk(gd); aoedisk_add_sysfs(d); + aoedisk_add_debugfs(d); spin_lock_irqsave(&d->lock, flags); WARN_ON(!(d->flags & DEVFL_GD_NOW)); diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c index 784c92e038d1..c9047675dfc9 100644 --- a/drivers/block/aoe/aoedev.c +++ b/drivers/block/aoe/aoedev.c @@ -278,6 +278,7 @@ freedev(struct aoedev *d) del_timer_sync(&d->timer); if (d->gd) { + aoedisk_rm_debugfs(d); aoedisk_rm_sysfs(d); del_gendisk(d->gd); put_disk(d->gd); From 1cf94797c2bbc514c2bd0892e1774a0a8ca9afdb Mon Sep 17 00:00:00 2001 From: Ed Cashin Date: Wed, 11 Sep 2013 14:25:41 -0700 Subject: [PATCH 258/303] aoe: provide file operations for debugfs files The place holder in the file contents is filled out in the following patch. 
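Aside (not part of these patches): patches 256-258 together follow the standard debugfs shape — one directory at module init, one seq_file-backed file per device, recursive removal at exit. A condensed, self-contained sketch of that pattern, with all names hypothetical:

	#include <linux/debugfs.h>
	#include <linux/fs.h>
	#include <linux/init.h>
	#include <linux/seq_file.h>

	static struct dentry *example_dir;	/* plays the role of aoe_debugfs_dir */

	static int example_show(struct seq_file *s, void *unused)
	{
		seq_printf(s, "state: %s\n", "up");	/* per-device state goes here */
		return 0;
	}

	static int example_open(struct inode *inode, struct file *file)
	{
		/* i_private carries the pointer passed to debugfs_create_file() */
		return single_open(file, example_show, inode->i_private);
	}

	static const struct file_operations example_fops = {
		.open		= example_open,
		.read		= seq_read,
		.llseek		= seq_lseek,
		.release	= single_release,
	};

	static int __init example_init(void)
	{
		example_dir = debugfs_create_dir("example", NULL);
		if (!IS_ERR_OR_NULL(example_dir))
			debugfs_create_file("dev0", 0444, example_dir,
					    NULL /* per-device data */,
					    &example_fops);
		return 0;
	}

	static void __exit example_exit(void)
	{
		debugfs_remove_recursive(example_dir);	/* tolerates NULL */
	}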
Signed-off-by: Ed Cashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/aoe/aoeblk.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index d76c5cb3c708..0511d38e412d 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -110,6 +110,24 @@ static ssize_t aoedisk_show_payload(struct device *dev, return snprintf(page, PAGE_SIZE, "%lu\n", d->maxbcnt); } +static int aoedisk_debugfs_show(struct seq_file *s, void *ignored) +{ + struct aoedev *d; + unsigned long flags; + + d = s->private; + spin_lock_irqsave(&d->lock, flags); + seq_printf(s, "%s\n", d->gd->disk_name); /* place holder */ + spin_unlock_irqrestore(&d->lock, flags); + + return 0; +} + +static int aoe_debugfs_open(struct inode *inode, struct file *file) +{ + return single_open(file, aoedisk_debugfs_show, inode->i_private); +} + static DEVICE_ATTR(state, S_IRUGO, aoedisk_show_state, NULL); static DEVICE_ATTR(mac, S_IRUGO, aoedisk_show_mac, NULL); static DEVICE_ATTR(netif, S_IRUGO, aoedisk_show_netif, NULL); @@ -132,7 +150,12 @@ static const struct attribute_group attr_group = { .attrs = aoe_attrs, }; -static const struct file_operations aoe_debugfs_fops; +static const struct file_operations aoe_debugfs_fops = { + .open = aoe_debugfs_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; static void aoedisk_add_debugfs(struct aoedev *d) From 2256c1c51e98d4eb2063a7f84f9ea783fda95f7f Mon Sep 17 00:00:00 2001 From: Ed Cashin Date: Wed, 11 Sep 2013 14:25:42 -0700 Subject: [PATCH 259/303] aoe: fill in per-AoE-target information for debugfs file This information is presented in a compact format that has evolved for easy routine scanning by expert humans, mostly developers and support technicians helping to troubleshoot or test AoE-based systems. Signed-off-by: Ed Cashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/aoe/aoeblk.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 0511d38e412d..b58cbeb43e05 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -113,11 +113,42 @@ static ssize_t aoedisk_show_payload(struct device *dev, static int aoedisk_debugfs_show(struct seq_file *s, void *ignored) { struct aoedev *d; + struct aoetgt **t, **te; + struct aoeif *ifp, *ife; unsigned long flags; + char c; d = s->private; + seq_printf(s, "rttavg: %d rttdev: %d\n", + d->rttavg >> RTTSCALE, + d->rttdev >> RTTDSCALE); + seq_printf(s, "nskbpool: %d\n", skb_queue_len(&d->skbpool)); + seq_printf(s, "kicked: %ld\n", d->kicked); + seq_printf(s, "maxbcnt: %ld\n", d->maxbcnt); + seq_printf(s, "ref: %ld\n", d->ref); + spin_lock_irqsave(&d->lock, flags); - seq_printf(s, "%s\n", d->gd->disk_name); /* place holder */ + t = d->targets; + te = t + d->ntargets; + for (; t < te && *t; t++) { + c = '\t'; + seq_printf(s, "falloc: %ld\n", (*t)->falloc); + seq_printf(s, "ffree: %p\n", + list_empty(&(*t)->ffree) ? 
NULL : (*t)->ffree.next); + seq_printf(s, "%pm:%d:%d:%d\n", (*t)->addr, (*t)->nout, + (*t)->maxout, (*t)->nframes); + seq_printf(s, "\tssthresh:%d\n", (*t)->ssthresh); + seq_printf(s, "\ttaint:%d\n", (*t)->taint); + seq_printf(s, "\tr:%d\n", (*t)->rpkts); + seq_printf(s, "\tw:%d\n", (*t)->wpkts); + ifp = (*t)->ifs; + ife = ifp + ARRAY_SIZE((*t)->ifs); + for (; ifp->nd && ifp < ife; ifp++) { + seq_printf(s, "%c%s", c, ifp->nd->name); + c = ','; + } + seq_puts(s, "\n"); + } spin_unlock_irqrestore(&d->lock, flags); return 0; From ec345120c571847dea3d3bef76dd9b7978fa794e Mon Sep 17 00:00:00 2001 From: Ed Cashin Date: Wed, 11 Sep 2013 14:25:43 -0700 Subject: [PATCH 260/303] aoe: update copyright date Signed-off-by: Ed Cashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/aoe/aoeblk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index b58cbeb43e05..d63dcf0f2266 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. */ +/* Copyright (c) 2013 Coraid, Inc. See COPYING for GPL terms. */ /* * aoeblk.c * block device routines From 896dcd9a64a86d8792302615ee8ab118dc8afd9c Mon Sep 17 00:00:00 2001 From: Ed Cashin Date: Wed, 11 Sep 2013 14:25:44 -0700 Subject: [PATCH 261/303] aoe: update internal version number to 85 Signed-off-by: Ed Cashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/aoe/aoe.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h index b1f24c5a6bd1..14a9d1912318 100644 --- a/drivers/block/aoe/aoe.h +++ b/drivers/block/aoe/aoe.h @@ -1,5 +1,5 @@ /* Copyright (c) 2013 Coraid, Inc. See COPYING for GPL terms. */ -#define VERSION "83" +#define VERSION "85" #define AOE_MAJOR 152 #define DEVICE_NAME "aoe" From a88c1f0caccaa335690d53ea03b12de31c357263 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 11 Sep 2013 14:25:44 -0700 Subject: [PATCH 262/303] aoe: remove custom implementation of kbasename() In the kernel we already have a helper for this: kbasename(). This patch replaces the custom implementation with a call to it.
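For reference, kbasename() (declared in <linux/string.h>) is roughly the following — i.e. exactly the strrchr() dance the driver had been open-coding:

	static inline const char *kbasename(const char *path)
	{
		const char *tail = strrchr(path, '/');

		/* the component after the last '/', or the whole string */
		return tail ? tail + 1 : path;
	}

So kbasename("etherd/e0.0") yields "e0.0", and kbasename("sda") yields "sda".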
Signed-off-by: Andy Shevchenko Cc: Ed Cashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/aoe/aoedev.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c index c9047675dfc9..e774c50b6842 100644 --- a/drivers/block/aoe/aoedev.c +++ b/drivers/block/aoe/aoedev.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "aoe.h" static void dummy_timer(ulong); @@ -241,16 +242,12 @@ aoedev_downdev(struct aoedev *d) static int user_req(char *s, size_t slen, struct aoedev *d) { - char *p; + const char *p; size_t lim; if (!d->gd) return 0; - p = strrchr(d->gd->disk_name, '/'); - if (!p) - p = d->gd->disk_name; - else - p += 1; + p = kbasename(d->gd->disk_name); lim = sizeof(d->gd->disk_name); lim -= p - d->gd->disk_name; if (slen < lim) From e0ec36059774ff51812b40509d28ca6c9a2a6a62 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 11 Sep 2013 14:25:45 -0700 Subject: [PATCH 263/303] aoe: suppress compiler warnings This patch fixes the following compiler warnings: drivers/block/aoe/aoecmd.c: In function `aoecmd_ata_rw': drivers/block/aoe/aoecmd.c:383:17: warning: variable `t' set but not used [-Wunused-but-set-variable] struct aoetgt *t; ^ drivers/block/aoe/aoecmd.c: In function `resend': drivers/block/aoe/aoecmd.c:488:21: warning: variable `ah' set but not used [-Wunused-but-set-variable] struct aoe_atahdr *ah; ^ Signed-off-by: Andy Shevchenko Cc: Ed Cashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/aoe/aoecmd.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 4d45dba7fb8f..d2515435e23f 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -380,7 +380,6 @@ aoecmd_ata_rw(struct aoedev *d) { struct frame *f; struct buf *buf; - struct aoetgt *t; struct sk_buff *skb; struct sk_buff_head queue; ulong bcnt, fbcnt; @@ -391,7 +390,6 @@ aoecmd_ata_rw(struct aoedev *d) f = newframe(d); if (f == NULL) return 0; - t = *d->tgt; bcnt = d->maxbcnt; if (bcnt == 0) bcnt = DEFAULTBCNT; @@ -485,7 +483,6 @@ resend(struct aoedev *d, struct frame *f) struct sk_buff *skb; struct sk_buff_head queue; struct aoe_hdr *h; - struct aoe_atahdr *ah; struct aoetgt *t; char buf[128]; u32 n; @@ -500,7 +497,6 @@ resend(struct aoedev *d, struct frame *f) return; } h = (struct aoe_hdr *) skb_mac_header(skb); - ah = (struct aoe_atahdr *) (h+1); if (!(f->flags & FFL_PROBE)) { snprintf(buf, sizeof(buf), From fea1b139735355fe17a66f63811c58698ff03ec5 Mon Sep 17 00:00:00 2001 From: Ed Cashin Date: Wed, 11 Sep 2013 14:25:46 -0700 Subject: [PATCH 264/303] aoe: do not BUG if memory pressure prevented debugfs file creation If the system has trouble allocating memory for the creation of the aoe debugfs directory or of a file inside it, the debugfs member of an aoedev can be NULL. Do not treat a NULL debugfs pointer as a BUG on aoedev shutdown, avoiding the user impact of an unnecessary panic. This is safe because debugfs_remove() is already a no-op when passed a NULL dentry.
Signed-off-by: Ed Cashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/aoe/aoeblk.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index d63dcf0f2266..dd73e1ff1759 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -215,7 +215,6 @@ aoedisk_add_debugfs(struct aoedev *d) void aoedisk_rm_debugfs(struct aoedev *d) { - BUG_ON(d->debugfs == NULL); debugfs_remove(d->debugfs); d->debugfs = NULL; } From 5173b414e42cb81e764f5e92a2f143e9a84fa3d1 Mon Sep 17 00:00:00 2001 From: Ed Cashin Date: Wed, 11 Sep 2013 14:25:47 -0700 Subject: [PATCH 265/303] aoe: remove do-nothing NAME="%k" term from example udev rules When the example udev rules in the documentation are used without modification, warnings like the one shown below appear in the system logs: /var/log/messages:Aug 22 11:09:11 kung udevd[445]: NAME="%k" \ is superfluous and breaks kernel supplied names, please remove \ it from /etc/udev/rules.d/60-aoe.rules:26 Removing the term does not cause any problems with the creation of the special character and block device nodes. Signed-off-by: Ed Cashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/aoe/udev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/aoe/udev.txt b/Documentation/aoe/udev.txt index 8686e789542e..1f06daf03f5b 100644 --- a/Documentation/aoe/udev.txt +++ b/Documentation/aoe/udev.txt @@ -23,4 +23,4 @@ SUBSYSTEM=="aoe", KERNEL=="revalidate", NAME="etherd/%k", GROUP="disk", MODE="02 SUBSYSTEM=="aoe", KERNEL=="flush", NAME="etherd/%k", GROUP="disk", MODE="0220" # aoe block devices -KERNEL=="etherd*", NAME="%k", GROUP="disk" +KERNEL=="etherd*", GROUP="disk" From 6325932666540beea18c800016368dc921068611 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 11 Sep 2013 14:25:48 -0700 Subject: [PATCH 266/303] affs: use loff_t in affs_truncate() It seems pretty unlikely that AFFS supports files over 4GB, but we may as well use loff_t, just for cleanliness' sake, instead of truncating it to 32 bits. Signed-off-by: Dan Carpenter Cc: Marco Stornelli Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/affs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/affs/file.c b/fs/affs/file.c index af3261b78102..776e3935a758 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -836,7 +836,7 @@ affs_truncate(struct inode *inode) struct address_space *mapping = inode->i_mapping; struct page *page; void *fsdata; - u32 size = inode->i_size; + loff_t size = inode->i_size; int res; res = mapping->a_ops->write_begin(NULL, mapping, size, 0, 0, &page, &fsdata); From 6723734cdff15211bb78aeea76ca847374bd93ae Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 11 Sep 2013 14:25:49 -0700 Subject: [PATCH 267/303] panic: call panic handlers before kmsg_dump Since the panic handlers may produce additional information (via printk) for the kernel log, that information should be reported as part of the panic output saved by kmsg_dump(). Without this re-ordering, nothing that adds information to a panic will show up in pstore's view when kmsg_dump runs, and is therefore not visible to crash reporting tools that examine pstore output.
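To make the ordering concrete, here is a minimal, hypothetical module (names invented for illustration) that registers on the panic notifier chain; after this patch, whatever such a handler prints is included in the dump that pstore captures:

	#include <linux/kernel.h>
	#include <linux/module.h>
	#include <linux/notifier.h>

	static int example_panic_notify(struct notifier_block *nb,
					unsigned long event, void *data)
	{
		/* now runs before kmsg_dump(KMSG_DUMP_PANIC) */
		pr_emerg("example: last-gasp diagnostic state\n");
		return NOTIFY_DONE;
	}

	static struct notifier_block example_panic_nb = {
		.notifier_call = example_panic_notify,
	};

	static int __init example_init(void)
	{
		atomic_notifier_chain_register(&panic_notifier_list,
					       &example_panic_nb);
		return 0;
	}
	module_init(example_init);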
Signed-off-by: Kees Cook Cc: Anton Vorontsov Cc: Colin Cross Acked-by: Tony Luck Cc: Stephen Boyd Cc: Vikram Mulukutla Cc: Peter Zijlstra Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/panic.c b/kernel/panic.c index 801864600514..b6c482ccc5db 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -123,10 +123,14 @@ void panic(const char *fmt, ...) */ smp_send_stop(); - kmsg_dump(KMSG_DUMP_PANIC); - + /* + * Run any panic handlers, including those that might need to + * add information to the kmsg dump output. + */ atomic_notifier_call_chain(&panic_notifier_list, 0, buf); + kmsg_dump(KMSG_DUMP_PANIC); + bust_spinlocks(0); if (!panic_blink) From 5323fb770b6254a4c218a2dfb0ef9aa007b7725a Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:25:51 -0700 Subject: [PATCH 268/303] pktcdvd: convert ZONE macro to static function get_zone() Macros should be converted to functions where feasible to verify arguments and the like. Signed-off-by: Joe Perches Cc: Jiri Kosina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/pktcdvd.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index f5d0ea11d9fd..aa18a2584249 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -83,9 +83,6 @@ #define MAX_SPEED 0xffff -#define ZONE(sector, pd) (((sector) + (pd)->offset) & \ - ~(sector_t)((pd)->settings.size - 1)) - static DEFINE_MUTEX(pktcdvd_mutex); static struct pktcdvd_device *pkt_devs[MAX_WRITERS]; static struct proc_dir_entry *pkt_proc; @@ -103,7 +100,10 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev); static int pkt_remove_dev(dev_t pkt_dev); static int pkt_seq_show(struct seq_file *m, void *p); - +static sector_t get_zone(sector_t sector, struct pktcdvd_device *pd) +{ + return (sector + pd->offset) & ~(sector_t)(pd->settings.size - 1); +} /* * create and register a pktcdvd kernel object. 
@@ -1224,7 +1224,7 @@ static int pkt_handle_queue(struct pktcdvd_device *pd) node = first_node; while (node) { bio = node->bio; - zone = ZONE(bio->bi_sector, pd); + zone = get_zone(bio->bi_sector, pd); list_for_each_entry(p, &pd->cdrw.pkt_active_list, list) { if (p->sector == zone) { bio = NULL; @@ -1264,8 +1264,8 @@ try_next_bio: while ((node = pkt_rbtree_find(pd, zone)) != NULL) { bio = node->bio; VPRINTK("pkt_handle_queue: found zone=%llx\n", - (unsigned long long)ZONE(bio->bi_sector, pd)); - if (ZONE(bio->bi_sector, pd) != zone) + (unsigned long long)get_zone(bio->bi_sector, pd)); + if (get_zone(bio->bi_sector, pd) != zone) break; pkt_rbtree_erase(pd, node); spin_lock(&pkt->lock); @@ -2394,7 +2394,7 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) blk_queue_bounce(q, &bio); - zone = ZONE(bio->bi_sector, pd); + zone = get_zone(bio->bi_sector, pd); VPRINTK("pkt_make_request: start = %6llx stop = %6llx\n", (unsigned long long)bio->bi_sector, (unsigned long long)bio_end_sector(bio)); @@ -2405,7 +2405,7 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) sector_t last_zone; int first_sectors; - last_zone = ZONE(bio_end_sector(bio) - 1, pd); + last_zone = get_zone(bio_end_sector(bio) - 1, pd); if (last_zone != zone) { BUG_ON(last_zone != zone + pd->settings.size); first_sectors = last_zone - bio->bi_sector; @@ -2500,7 +2500,7 @@ static int pkt_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, struct bio_vec *bvec) { struct pktcdvd_device *pd = q->queuedata; - sector_t zone = ZONE(bmd->bi_sector, pd); + sector_t zone = get_zone(bmd->bi_sector, pd); int used = ((bmd->bi_sector - zone) << 9) + bmd->bi_size; int remaining = (pd->settings.size << 9) - used; int remaining2; From 99481334bcab1330fce0c590b845b4a95b101a69 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:25:52 -0700 Subject: [PATCH 269/303] pktcdvd: convert printk to pr_ Use a more current logging style and add message levels to the logging messages. Simplify pkt_dump_sense by using %*ph and adding a simple function to emit the sense string. Includes improvements from Andy Shevchenko and Dan Carpenter. Signed-off-by: Joe Perches Cc: Andy Shevchenko Cc: Dan Carpenter Cc: Jiri Kosina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/pktcdvd.c | 122 ++++++++++++++++++++-------------------- 1 file changed, 61 insertions(+), 61 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index aa18a2584249..eb71522c20ab 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -44,6 +44,8 @@ * *************************************************************************/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -424,7 +426,7 @@ static int pkt_sysfs_init(void) if (ret) { kfree(class_pktcdvd); class_pktcdvd = NULL; - printk(DRIVER_NAME": failed to create class pktcdvd\n"); + pr_err("failed to create class pktcdvd\n"); return ret; } return 0; @@ -734,36 +736,32 @@ out: return ret; } +static const char *sense_key_string(__u8 index) +{ + static const char * const info[] = { + "No sense", "Recovered error", "Not ready", + "Medium error", "Hardware error", "Illegal request", + "Unit attention", "Data protect", "Blank check", + }; + + return index < ARRAY_SIZE(info) ? info[index] : "INVALID"; +} + /* * A generic sense dump / resolve mechanism should be implemented across * all ATAPI + SCSI devices. 
*/ static void pkt_dump_sense(struct packet_command *cgc) { - static char *info[9] = { "No sense", "Recovered error", "Not ready", - "Medium error", "Hardware error", "Illegal request", - "Unit attention", "Data protect", "Blank check" }; - int i; struct request_sense *sense = cgc->sense; - printk(DRIVER_NAME":"); - for (i = 0; i < CDROM_PACKET_SIZE; i++) - printk(" %02x", cgc->cmd[i]); - printk(" - "); - - if (sense == NULL) { - printk("no sense\n"); - return; - } - - printk("sense %02x.%02x.%02x", sense->sense_key, sense->asc, sense->ascq); - - if (sense->sense_key > 8) { - printk(" (INVALID)\n"); - return; - } - - printk(" (%s)\n", info[sense->sense_key]); + if (sense) + pr_err("%*ph - sense %02x.%02x.%02x (%s)\n", + CDROM_PACKET_SIZE, cgc->cmd, + sense->sense_key, sense->asc, sense->ascq, + sense_key_string(sense->sense_key)); + else + pr_err("%*ph - no sense\n", CDROM_PACKET_SIZE, cgc->cmd); } /* @@ -943,7 +941,7 @@ static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_que set_bit(PACKET_MERGE_SEGS, &pd->flags); return 0; } else { - printk(DRIVER_NAME": cdrom max_phys_segments too small\n"); + pr_err("cdrom max_phys_segments too small\n"); return -EIO; } } @@ -1563,9 +1561,10 @@ work_to_do: static void pkt_print_settings(struct pktcdvd_device *pd) { - printk(DRIVER_NAME": %s packets, ", pd->settings.fp ? "Fixed" : "Variable"); - printk("%u blocks, ", pd->settings.size >> 2); - printk("Mode-%c disc\n", pd->settings.block_mode == 8 ? '1' : '2'); + pr_info("%s packets, %u blocks, Mode-%c disc\n", + pd->settings.fp ? "Fixed" : "Variable", + pd->settings.size >> 2, + pd->settings.block_mode == 8 ? '1' : '2'); } static int pkt_mode_sense(struct pktcdvd_device *pd, struct packet_command *cgc, int page_code, int page_control) @@ -1749,7 +1748,7 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) /* * paranoia */ - printk(DRIVER_NAME": write mode wrong %d\n", wp->data_block_type); + pr_err("write mode wrong %d\n", wp->data_block_type); return 1; } wp->packet_size = cpu_to_be32(pd->settings.size >> 2); @@ -1793,7 +1792,7 @@ static int pkt_writable_track(struct pktcdvd_device *pd, track_information *ti) if (ti->rt == 1 && ti->blank == 0) return 1; - printk(DRIVER_NAME": bad state %d-%d-%d\n", ti->rt, ti->blank, ti->packet); + pr_err("bad state %d-%d-%d\n", ti->rt, ti->blank, ti->packet); return 0; } @@ -1820,22 +1819,22 @@ static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di) * but i'm not sure, should we leave this to user apps? probably. */ if (di->disc_type == 0xff) { - printk(DRIVER_NAME": Unknown disc. 
No track?\n"); + pr_notice("unknown disc - no track?\n"); return 0; } if (di->disc_type != 0x20 && di->disc_type != 0) { - printk(DRIVER_NAME": Wrong disc type (%x)\n", di->disc_type); + pr_err("wrong disc type (%x)\n", di->disc_type); return 0; } if (di->erasable == 0) { - printk(DRIVER_NAME": Disc not erasable\n"); + pr_notice("disc not erasable\n"); return 0; } if (di->border_status == PACKET_SESSION_RESERVED) { - printk(DRIVER_NAME": Can't write to last track (reserved)\n"); + pr_err("can't write to last track (reserved)\n"); return 0; } @@ -1860,7 +1859,7 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) memset(&ti, 0, sizeof(track_information)); if ((ret = pkt_get_disc_info(pd, &di))) { - printk("failed get_disc\n"); + pr_err("failed get_disc\n"); return ret; } @@ -1871,12 +1870,12 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) track = 1; /* (di.last_track_msb << 8) | di.last_track_lsb; */ if ((ret = pkt_get_track_info(pd, track, 1, &ti))) { - printk(DRIVER_NAME": failed get_track\n"); + pr_err("failed get_track\n"); return ret; } if (!pkt_writable_track(pd, &ti)) { - printk(DRIVER_NAME": can't write to this track\n"); + pr_err("can't write to this track\n"); return -EROFS; } @@ -1886,11 +1885,11 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) */ pd->settings.size = be32_to_cpu(ti.fixed_packet_size) << 2; if (pd->settings.size == 0) { - printk(DRIVER_NAME": detected zero packet size!\n"); + pr_notice("detected zero packet size!\n"); return -ENXIO; } if (pd->settings.size > PACKET_MAX_SECTORS) { - printk(DRIVER_NAME": packet size is too big\n"); + pr_err("packet size is too big\n"); return -EROFS; } pd->settings.fp = ti.fp; @@ -1932,7 +1931,7 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) pd->settings.block_mode = PACKET_BLOCK_MODE2; break; default: - printk(DRIVER_NAME": unknown data mode\n"); + pr_err("unknown data mode\n"); return -EROFS; } return 0; @@ -1966,10 +1965,10 @@ static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd, cgc.buflen = cgc.cmd[8] = 2 + ((buf[0] << 8) | (buf[1] & 0xff)); ret = pkt_mode_select(pd, &cgc); if (ret) { - printk(DRIVER_NAME": write caching control failed\n"); + pr_err("write caching control failed\n"); pkt_dump_sense(&cgc); } else if (!ret && set) - printk(DRIVER_NAME": enabled write caching on %s\n", pd->name); + pr_notice("enabled write caching on %s\n", pd->name); return ret; } @@ -2084,11 +2083,11 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd, } if (!(buf[6] & 0x40)) { - printk(DRIVER_NAME": Disc type is not CD-RW\n"); + pr_notice("disc type is not CD-RW\n"); return 1; } if (!(buf[6] & 0x4)) { - printk(DRIVER_NAME": A1 values on media are not valid, maybe not CDRW?\n"); + pr_notice("A1 values on media are not valid, maybe not CDRW?\n"); return 1; } @@ -2108,14 +2107,14 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd, *speed = us_clv_to_speed[sp]; break; default: - printk(DRIVER_NAME": Unknown disc sub-type %d\n",st); + pr_notice("unknown disc sub-type %d\n", st); return 1; } if (*speed) { - printk(DRIVER_NAME": Max. 
media speed: %d\n",*speed); + pr_info("maximum media speed: %d\n", *speed); return 0; } else { - printk(DRIVER_NAME": Unknown speed %d for sub-type %d\n",sp,st); + pr_notice("unknown speed %d for sub-type %d\n", sp, st); return 1; } } @@ -2205,7 +2204,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) goto out; if ((ret = pkt_get_last_written(pd, &lba))) { - printk(DRIVER_NAME": pkt_get_last_written failed\n"); + pr_err("pkt_get_last_written failed\n"); goto out_putdev; } @@ -2235,11 +2234,11 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) if (write) { if (!pkt_grow_pktlist(pd, CONFIG_CDROM_PKTCDVD_BUFFERS)) { - printk(DRIVER_NAME": not enough memory for buffers\n"); + pr_err("not enough memory for buffers\n"); ret = -ENOMEM; goto out_putdev; } - printk(DRIVER_NAME": %lukB available on disc\n", lba << 1); + pr_info("%lukB available on disc\n", lba << 1); } return 0; @@ -2360,7 +2359,8 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) pd = q->queuedata; if (!pd) { - printk(DRIVER_NAME": %s incorrect request queue\n", bdevname(bio->bi_bdev, b)); + pr_err("%s incorrect request queue\n", + bdevname(bio->bi_bdev, b)); goto end_io; } @@ -2382,13 +2382,13 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) } if (!test_bit(PACKET_WRITABLE, &pd->flags)) { - printk(DRIVER_NAME": WRITE for ro device %s (%llu)\n", - pd->name, (unsigned long long)bio->bi_sector); + pr_notice("WRITE for ro device %s (%llu)\n", + pd->name, (unsigned long long)bio->bi_sector); goto end_io; } if (!bio->bi_size || (bio->bi_size % CD_FRAMESIZE)) { - printk(DRIVER_NAME": wrong bio size\n"); + pr_err("wrong bio size\n"); goto end_io; } @@ -2609,7 +2609,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) struct block_device *bdev; if (pd->pkt_dev == dev) { - printk(DRIVER_NAME": Recursive setup not allowed\n"); + pr_err("recursive setup not allowed\n"); return -EBUSY; } for (i = 0; i < MAX_WRITERS; i++) { @@ -2617,11 +2617,11 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) if (!pd2) continue; if (pd2->bdev->bd_dev == dev) { - printk(DRIVER_NAME": %s already setup\n", bdevname(pd2->bdev, b)); + pr_err("%s already setup\n", bdevname(pd2->bdev, b)); return -EBUSY; } if (pd2->pkt_dev == dev) { - printk(DRIVER_NAME": Can't chain pktcdvd devices\n"); + pr_err("can't chain pktcdvd devices\n"); return -EBUSY; } } @@ -2644,7 +2644,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) atomic_set(&pd->cdrw.pending_bios, 0); pd->cdrw.thread = kthread_run(kcdrwd, pd, "%s", pd->name); if (IS_ERR(pd->cdrw.thread)) { - printk(DRIVER_NAME": can't start kernel thread\n"); + pr_err("can't start kernel thread\n"); ret = -ENOMEM; goto out_mem; } @@ -2743,7 +2743,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) if (!pkt_devs[idx]) break; if (idx == MAX_WRITERS) { - printk(DRIVER_NAME": max %d writers supported\n", MAX_WRITERS); + pr_err("max %d writers supported\n", MAX_WRITERS); ret = -EBUSY; goto out_mutex; } @@ -2818,7 +2818,7 @@ out_mem: kfree(pd); out_mutex: mutex_unlock(&ctl_mutex); - printk(DRIVER_NAME": setup of pktcdvd device failed\n"); + pr_err("setup of pktcdvd device failed\n"); return ret; } @@ -2969,7 +2969,7 @@ static int __init pkt_init(void) ret = register_blkdev(pktdev_major, DRIVER_NAME); if (ret < 0) { - printk(DRIVER_NAME": Unable to register block device\n"); + pr_err("unable to register block device\n"); goto out2; } if (!pktdev_major) @@ -2983,7 +2983,7 @@ static int __init pkt_init(void) 
ret = misc_register(&pkt_misc); if (ret) { - printk(DRIVER_NAME": Unable to register misc device\n"); + pr_err("unable to register misc device\n"); goto out_misc; } From cd3f2cd05cf1066f3975ed3223f12c799bc553c6 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:25:53 -0700 Subject: [PATCH 270/303] pktcdvd: consolidate DPRINTK and VPRINTK macros Use the more common pkt_dbg(level, fmt, ...) form. These messages are emitted at KERN_NOTICE. Always emit function name with pkt_dbg(2, ...) uses and remove the sometimes abbreviated embedded function name. This form always verifies the format and arguments. Signed-off-by: Joe Perches Cc: Jiri Kosina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/pktcdvd.c | 107 ++++++++++++++++++++-------------------- 1 file changed, 53 insertions(+), 54 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index eb71522c20ab..aaa5da2b3d66 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -71,17 +71,13 @@ #define DRIVER_NAME "pktcdvd" -#if PACKET_DEBUG -#define DPRINTK(fmt, args...) printk(KERN_NOTICE fmt, ##args) -#else -#define DPRINTK(fmt, args...) -#endif - -#if PACKET_DEBUG > 1 -#define VPRINTK(fmt, args...) printk(KERN_NOTICE fmt, ##args) -#else -#define VPRINTK(fmt, args...) -#endif +#define pkt_dbg(level, fmt, ...) \ +do { \ + if (level == 2 && PACKET_DEBUG >= 2) \ + pr_notice("%s: " fmt, __func__, ##__VA_ARGS__); \ + else if (level == 1 && PACKET_DEBUG >= 1) \ + pr_notice(fmt, ##__VA_ARGS__); \ +} while (0) #define MAX_SPEED 0xffff @@ -519,7 +515,7 @@ static void pkt_bio_finished(struct pktcdvd_device *pd) { BUG_ON(atomic_read(&pd->cdrw.pending_bios) <= 0); if (atomic_dec_and_test(&pd->cdrw.pending_bios)) { - VPRINTK(DRIVER_NAME": queue empty\n"); + pkt_dbg(2, "queue empty\n"); atomic_set(&pd->iosched.attention, 1); wake_up(&pd->wqueue); } @@ -870,7 +866,7 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd) need_write_seek = 0; if (need_write_seek && reads_queued) { if (atomic_read(&pd->cdrw.pending_bios) > 0) { - VPRINTK(DRIVER_NAME": write, waiting\n"); + pkt_dbg(2, "write, waiting\n"); break; } pkt_flush_cache(pd); @@ -879,7 +875,7 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd) } else { if (!reads_queued && writes_queued) { if (atomic_read(&pd->cdrw.pending_bios) > 0) { - VPRINTK(DRIVER_NAME": read, waiting\n"); + pkt_dbg(2, "read, waiting\n"); break; } pd->iosched.writing = 1; @@ -985,8 +981,9 @@ static void pkt_end_io_read(struct bio *bio, int err) struct pktcdvd_device *pd = pkt->pd; BUG_ON(!pd); - VPRINTK("pkt_end_io_read: bio=%p sec0=%llx sec=%llx err=%d\n", bio, - (unsigned long long)pkt->sector, (unsigned long long)bio->bi_sector, err); + pkt_dbg(2, "bio=%p sec0=%llx sec=%llx err=%d\n", + bio, (unsigned long long)pkt->sector, + (unsigned long long)bio->bi_sector, err); if (err) atomic_inc(&pkt->io_errors); @@ -1003,7 +1000,7 @@ static void pkt_end_io_packet_write(struct bio *bio, int err) struct pktcdvd_device *pd = pkt->pd; BUG_ON(!pd); - VPRINTK("pkt_end_io_packet_write: id=%d, err=%d\n", pkt->id, err); + pkt_dbg(2, "id=%d, err=%d\n", pkt->id, err); pd->stats.pkt_ended++; @@ -1045,7 +1042,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) spin_unlock(&pkt->lock); if (pkt->cache_valid) { - VPRINTK("pkt_gather_data: zone %llx cached\n", + pkt_dbg(2, "zone %llx cached\n", (unsigned long long)pkt->sector); goto out_account; } @@ -1068,7 +1065,7 @@ static void pkt_gather_data(struct 
pktcdvd_device *pd, struct packet_data *pkt) p = (f * CD_FRAMESIZE) / PAGE_SIZE; offset = (f * CD_FRAMESIZE) % PAGE_SIZE; - VPRINTK("pkt_gather_data: Adding frame %d, page:%p offs:%d\n", + pkt_dbg(2, "Adding frame %d, page:%p offs:%d\n", f, pkt->pages[p], offset); if (!bio_add_page(bio, pkt->pages[p], CD_FRAMESIZE, offset)) BUG(); @@ -1080,7 +1077,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) } out_account: - VPRINTK("pkt_gather_data: need %d frames for zone %llx\n", + pkt_dbg(2, "need %d frames for zone %llx\n", frames_read, (unsigned long long)pkt->sector); pd->stats.pkt_started++; pd->stats.secs_rg += frames_read * (CD_FRAMESIZE >> 9); @@ -1181,7 +1178,8 @@ static inline void pkt_set_state(struct packet_data *pkt, enum packet_data_state "IDLE", "WAITING", "READ_WAIT", "WRITE_WAIT", "RECOVERY", "FINISHED" }; enum packet_data_state old_state = pkt->state; - VPRINTK("pkt %2d : s=%6llx %s -> %s\n", pkt->id, (unsigned long long)pkt->sector, + pkt_dbg(2, "pkt %2d : s=%6llx %s -> %s\n", + pkt->id, (unsigned long long)pkt->sector, state_name[old_state], state_name[state]); #endif pkt->state = state; @@ -1200,12 +1198,12 @@ static int pkt_handle_queue(struct pktcdvd_device *pd) struct rb_node *n; int wakeup; - VPRINTK("handle_queue\n"); + pkt_dbg(2, "\n"); atomic_set(&pd->scan_queue, 0); if (list_empty(&pd->cdrw.pkt_free_list)) { - VPRINTK("handle_queue: no pkt\n"); + pkt_dbg(2, "no pkt\n"); return 0; } @@ -1242,7 +1240,7 @@ try_next_bio: } spin_unlock(&pd->lock); if (!bio) { - VPRINTK("handle_queue: no bio\n"); + pkt_dbg(2, "no bio\n"); return 0; } @@ -1258,10 +1256,10 @@ try_next_bio: * to this packet. */ spin_lock(&pd->lock); - VPRINTK("pkt_handle_queue: looking for zone %llx\n", (unsigned long long)zone); + pkt_dbg(2, "looking for zone %llx\n", (unsigned long long)zone); while ((node = pkt_rbtree_find(pd, zone)) != NULL) { bio = node->bio; - VPRINTK("pkt_handle_queue: found zone=%llx\n", + pkt_dbg(2, "found zone=%llx\n", (unsigned long long)get_zone(bio->bi_sector, pd)); if (get_zone(bio->bi_sector, pd) != zone) break; @@ -1314,7 +1312,7 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) if (!bio_add_page(pkt->w_bio, bvec[f].bv_page, CD_FRAMESIZE, bvec[f].bv_offset)) BUG(); } - VPRINTK(DRIVER_NAME": vcnt=%d\n", pkt->w_bio->bi_vcnt); + pkt_dbg(2, "vcnt=%d\n", pkt->w_bio->bi_vcnt); /* * Fill-in bvec with data from orig_bios. 
@@ -1325,7 +1323,7 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) pkt_set_state(pkt, PACKET_WRITE_WAIT_STATE); spin_unlock(&pkt->lock); - VPRINTK("pkt_start_write: Writing %d frames for zone %llx\n", + pkt_dbg(2, "Writing %d frames for zone %llx\n", pkt->write_size, (unsigned long long)pkt->sector); if (test_bit(PACKET_MERGE_SEGS, &pd->flags) || (pkt->write_size < pkt->frames)) { @@ -1357,7 +1355,7 @@ static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data { int uptodate; - VPRINTK("run_state_machine: pkt %d\n", pkt->id); + pkt_dbg(2, "pkt %d\n", pkt->id); for (;;) { switch (pkt->state) { @@ -1396,7 +1394,7 @@ static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data if (pkt_start_recovery(pkt)) { pkt_start_write(pd, pkt); } else { - VPRINTK("No recovery possible\n"); + pkt_dbg(2, "No recovery possible\n"); pkt_set_state(pkt, PACKET_FINISHED_STATE); } break; @@ -1417,7 +1415,7 @@ static void pkt_handle_packets(struct pktcdvd_device *pd) { struct packet_data *pkt, *next; - VPRINTK("pkt_handle_packets\n"); + pkt_dbg(2, "\n"); /* * Run state machine for active packets @@ -1500,9 +1498,9 @@ static int kcdrwd(void *foobar) if (PACKET_DEBUG > 1) { int states[PACKET_NUM_STATES]; pkt_count_states(pd, states); - VPRINTK("kcdrwd: i:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n", - states[0], states[1], states[2], states[3], - states[4], states[5]); + pkt_dbg(2, "i:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n", + states[0], states[1], states[2], + states[3], states[4], states[5]); } min_sleep_time = MAX_SCHEDULE_TIMEOUT; @@ -1511,9 +1509,9 @@ static int kcdrwd(void *foobar) min_sleep_time = pkt->sleep_time; } - VPRINTK("kcdrwd: sleeping\n"); + pkt_dbg(2, "sleeping\n"); residue = schedule_timeout(min_sleep_time); - VPRINTK("kcdrwd: wake up\n"); + pkt_dbg(2, "wake up\n"); /* make swsusp happy with our thread */ try_to_freeze(); @@ -1810,7 +1808,8 @@ static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di) case 0x12: /* DVD-RAM */ return 1; default: - VPRINTK(DRIVER_NAME": Wrong disc profile (%x)\n", pd->mmc3_profile); + pkt_dbg(2, "Wrong disc profile (%x)\n", + pd->mmc3_profile); return 0; } @@ -2125,7 +2124,7 @@ static noinline_for_stack int pkt_perform_opc(struct pktcdvd_device *pd) struct request_sense sense; int ret; - VPRINTK(DRIVER_NAME": Performing OPC\n"); + pkt_dbg(2, "Performing OPC\n"); init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); cgc.sense = &sense; @@ -2143,12 +2142,12 @@ static int pkt_open_write(struct pktcdvd_device *pd) unsigned int write_speed, media_write_speed, read_speed; if ((ret = pkt_probe_settings(pd))) { - VPRINTK(DRIVER_NAME": %s failed probe\n", pd->name); + pkt_dbg(2, "%s failed probe\n", pd->name); return ret; } if ((ret = pkt_set_write_settings(pd))) { - DPRINTK(DRIVER_NAME": %s failed saving write settings\n", pd->name); + pkt_dbg(1, "%s failed saving write settings\n", pd->name); return -EIO; } @@ -2160,26 +2159,26 @@ static int pkt_open_write(struct pktcdvd_device *pd) case 0x13: /* DVD-RW */ case 0x1a: /* DVD+RW */ case 0x12: /* DVD-RAM */ - DPRINTK(DRIVER_NAME": write speed %ukB/s\n", write_speed); + pkt_dbg(1, "write speed %ukB/s\n", write_speed); break; default: if ((ret = pkt_media_speed(pd, &media_write_speed))) media_write_speed = 16; write_speed = min(write_speed, media_write_speed * 177); - DPRINTK(DRIVER_NAME": write speed %ux\n", write_speed / 176); + pkt_dbg(1, "write speed %ux\n", write_speed / 176); break; } read_speed = write_speed; if ((ret = pkt_set_speed(pd, 
write_speed, read_speed))) { - DPRINTK(DRIVER_NAME": %s couldn't set write speed\n", pd->name); + pkt_dbg(1, "%s couldn't set write speed\n", pd->name); return -EIO; } pd->write_speed = write_speed; pd->read_speed = read_speed; if ((ret = pkt_perform_opc(pd))) { - DPRINTK(DRIVER_NAME": %s Optimum Power Calibration failed\n", pd->name); + pkt_dbg(1, "%s Optimum Power Calibration failed\n", pd->name); } return 0; @@ -2256,7 +2255,7 @@ out: static void pkt_release_dev(struct pktcdvd_device *pd, int flush) { if (flush && pkt_flush_cache(pd)) - DPRINTK(DRIVER_NAME": %s not flushing cache\n", pd->name); + pkt_dbg(1, "%s not flushing cache\n", pd->name); pkt_lock_door(pd, 0); @@ -2278,7 +2277,7 @@ static int pkt_open(struct block_device *bdev, fmode_t mode) struct pktcdvd_device *pd = NULL; int ret; - VPRINTK(DRIVER_NAME": entering open\n"); + pkt_dbg(2, "entering\n"); mutex_lock(&pktcdvd_mutex); mutex_lock(&ctl_mutex); @@ -2314,7 +2313,7 @@ static int pkt_open(struct block_device *bdev, fmode_t mode) out_dec: pd->refcnt--; out: - VPRINTK(DRIVER_NAME": failed open (%d)\n", ret); + pkt_dbg(2, "failed (%d)\n", ret); mutex_unlock(&ctl_mutex); mutex_unlock(&pktcdvd_mutex); return ret; @@ -2395,7 +2394,7 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) blk_queue_bounce(q, &bio); zone = get_zone(bio->bi_sector, pd); - VPRINTK("pkt_make_request: start = %6llx stop = %6llx\n", + pkt_dbg(2, "start = %6llx stop = %6llx\n", (unsigned long long)bio->bi_sector, (unsigned long long)bio_end_sector(bio)); @@ -2650,7 +2649,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) } proc_create_data(pd->name, 0, pkt_proc, &pkt_proc_fops, pd); - DPRINTK(DRIVER_NAME": writer %s mapped to %s\n", pd->name, bdevname(bdev, b)); + pkt_dbg(1, "writer %s mapped to %s\n", pd->name, bdevname(bdev, b)); return 0; out_mem: @@ -2665,8 +2664,8 @@ static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, struct pktcdvd_device *pd = bdev->bd_disk->private_data; int ret; - VPRINTK("pkt_ioctl: cmd %x, dev %d:%d\n", cmd, - MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); + pkt_dbg(2, "cmd %x, dev %d:%d\n", + cmd, MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); mutex_lock(&pktcdvd_mutex); switch (cmd) { @@ -2690,7 +2689,7 @@ static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, break; default: - VPRINTK(DRIVER_NAME": Unknown ioctl for %s (%x)\n", pd->name, cmd); + pkt_dbg(2, "Unknown ioctl for %s (%x)\n", pd->name, cmd); ret = -ENOTTY; } mutex_unlock(&pktcdvd_mutex); @@ -2839,7 +2838,7 @@ static int pkt_remove_dev(dev_t pkt_dev) break; } if (idx == MAX_WRITERS) { - DPRINTK(DRIVER_NAME": dev not setup\n"); + pkt_dbg(1, "dev not setup\n"); ret = -ENXIO; goto out; } @@ -2859,7 +2858,7 @@ static int pkt_remove_dev(dev_t pkt_dev) blkdev_put(pd->bdev, FMODE_READ | FMODE_NDELAY); remove_proc_entry(pd->name, pkt_proc); - DPRINTK(DRIVER_NAME": writer %s unmapped\n", pd->name); + pkt_dbg(1, "writer %s unmapped\n", pd->name); del_gendisk(pd->disk); blk_cleanup_queue(pd->disk->queue); From 844aa7974395df1f0c7a866007e900e5f979fc7b Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:25:54 -0700 Subject: [PATCH 271/303] pktcdvd: add struct pktcdvd_device * to pkt_dbg Add pd->name to output for these debugging messages. Remove normally compiled out pkt_dbg(2, ...) function entry tracing equivalents as it's better done via the function tracer. 
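A note on why the pkt_dbg() form introduced by patches 270-271 is an improvement (generic sketch, not the driver's code): an #if that compiles a debug call away also hides format-string bugs, whereas a constant-false `if` lets the compiler type-check the arguments and then discard the dead call at optimization time:

	#define DEBUG_LEVEL 0

	/* old style: with DEBUG_LEVEL == 0 the arguments are never checked */
	#if DEBUG_LEVEL
	#define dbg_old(fmt, ...) printk(fmt, ##__VA_ARGS__)
	#else
	#define dbg_old(fmt, ...)
	#endif

	/* new style: the branch folds to false and the printk() is dropped,
	 * but only after the format and arguments have been verified */
	#define dbg_new(fmt, ...)				\
	do {							\
		if (DEBUG_LEVEL >= 1)				\
			printk(fmt, ##__VA_ARGS__);		\
	} while (0)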
Signed-off-by: Joe Perches Cc: Jiri Kosina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/pktcdvd.c | 90 +++++++++++++++++++---------------------- 1 file changed, 42 insertions(+), 48 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index aaa5da2b3d66..5050c7abb6ee 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -71,12 +71,13 @@ #define DRIVER_NAME "pktcdvd" -#define pkt_dbg(level, fmt, ...) \ -do { \ - if (level == 2 && PACKET_DEBUG >= 2) \ - pr_notice("%s: " fmt, __func__, ##__VA_ARGS__); \ - else if (level == 1 && PACKET_DEBUG >= 1) \ - pr_notice(fmt, ##__VA_ARGS__); \ +#define pkt_dbg(level, pd, fmt, ...) \ +do { \ + if (level == 2 && PACKET_DEBUG >= 2) \ + pr_notice("%s: %s():" fmt, \ + pd->name, __func__, ##__VA_ARGS__); \ + else if (level == 1 && PACKET_DEBUG >= 1) \ + pr_notice("%s: " fmt, pd->name, ##__VA_ARGS__); \ } while (0) #define MAX_SPEED 0xffff @@ -515,7 +516,7 @@ static void pkt_bio_finished(struct pktcdvd_device *pd) { BUG_ON(atomic_read(&pd->cdrw.pending_bios) <= 0); if (atomic_dec_and_test(&pd->cdrw.pending_bios)) { - pkt_dbg(2, "queue empty\n"); + pkt_dbg(2, pd, "queue empty\n"); atomic_set(&pd->iosched.attention, 1); wake_up(&pd->wqueue); } @@ -866,7 +867,7 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd) need_write_seek = 0; if (need_write_seek && reads_queued) { if (atomic_read(&pd->cdrw.pending_bios) > 0) { - pkt_dbg(2, "write, waiting\n"); + pkt_dbg(2, pd, "write, waiting\n"); break; } pkt_flush_cache(pd); @@ -875,7 +876,7 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd) } else { if (!reads_queued && writes_queued) { if (atomic_read(&pd->cdrw.pending_bios) > 0) { - pkt_dbg(2, "read, waiting\n"); + pkt_dbg(2, pd, "read, waiting\n"); break; } pd->iosched.writing = 1; @@ -981,7 +982,7 @@ static void pkt_end_io_read(struct bio *bio, int err) struct pktcdvd_device *pd = pkt->pd; BUG_ON(!pd); - pkt_dbg(2, "bio=%p sec0=%llx sec=%llx err=%d\n", + pkt_dbg(2, pd, "bio=%p sec0=%llx sec=%llx err=%d\n", bio, (unsigned long long)pkt->sector, (unsigned long long)bio->bi_sector, err); @@ -1000,7 +1001,7 @@ static void pkt_end_io_packet_write(struct bio *bio, int err) struct pktcdvd_device *pd = pkt->pd; BUG_ON(!pd); - pkt_dbg(2, "id=%d, err=%d\n", pkt->id, err); + pkt_dbg(2, pd, "id=%d, err=%d\n", pkt->id, err); pd->stats.pkt_ended++; @@ -1042,7 +1043,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) spin_unlock(&pkt->lock); if (pkt->cache_valid) { - pkt_dbg(2, "zone %llx cached\n", + pkt_dbg(2, pd, "zone %llx cached\n", (unsigned long long)pkt->sector); goto out_account; } @@ -1065,7 +1066,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) p = (f * CD_FRAMESIZE) / PAGE_SIZE; offset = (f * CD_FRAMESIZE) % PAGE_SIZE; - pkt_dbg(2, "Adding frame %d, page:%p offs:%d\n", + pkt_dbg(2, pd, "Adding frame %d, page:%p offs:%d\n", f, pkt->pages[p], offset); if (!bio_add_page(bio, pkt->pages[p], CD_FRAMESIZE, offset)) BUG(); @@ -1077,7 +1078,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) } out_account: - pkt_dbg(2, "need %d frames for zone %llx\n", + pkt_dbg(2, pd, "need %d frames for zone %llx\n", frames_read, (unsigned long long)pkt->sector); pd->stats.pkt_started++; pd->stats.secs_rg += frames_read * (CD_FRAMESIZE >> 9); @@ -1178,7 +1179,7 @@ static inline void pkt_set_state(struct packet_data *pkt, enum packet_data_state "IDLE", "WAITING", "READ_WAIT", "WRITE_WAIT", 
"RECOVERY", "FINISHED" }; enum packet_data_state old_state = pkt->state; - pkt_dbg(2, "pkt %2d : s=%6llx %s -> %s\n", + pkt_dbg(2, pd, "pkt %2d : s=%6llx %s -> %s\n", pkt->id, (unsigned long long)pkt->sector, state_name[old_state], state_name[state]); #endif @@ -1198,12 +1199,10 @@ static int pkt_handle_queue(struct pktcdvd_device *pd) struct rb_node *n; int wakeup; - pkt_dbg(2, "\n"); - atomic_set(&pd->scan_queue, 0); if (list_empty(&pd->cdrw.pkt_free_list)) { - pkt_dbg(2, "no pkt\n"); + pkt_dbg(2, pd, "no pkt\n"); return 0; } @@ -1240,7 +1239,7 @@ try_next_bio: } spin_unlock(&pd->lock); if (!bio) { - pkt_dbg(2, "no bio\n"); + pkt_dbg(2, pd, "no bio\n"); return 0; } @@ -1256,10 +1255,10 @@ try_next_bio: * to this packet. */ spin_lock(&pd->lock); - pkt_dbg(2, "looking for zone %llx\n", (unsigned long long)zone); + pkt_dbg(2, pd, "looking for zone %llx\n", (unsigned long long)zone); while ((node = pkt_rbtree_find(pd, zone)) != NULL) { bio = node->bio; - pkt_dbg(2, "found zone=%llx\n", + pkt_dbg(2, pd, "found zone=%llx\n", (unsigned long long)get_zone(bio->bi_sector, pd)); if (get_zone(bio->bi_sector, pd) != zone) break; @@ -1312,7 +1311,7 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) if (!bio_add_page(pkt->w_bio, bvec[f].bv_page, CD_FRAMESIZE, bvec[f].bv_offset)) BUG(); } - pkt_dbg(2, "vcnt=%d\n", pkt->w_bio->bi_vcnt); + pkt_dbg(2, pd, "vcnt=%d\n", pkt->w_bio->bi_vcnt); /* * Fill-in bvec with data from orig_bios. @@ -1323,7 +1322,7 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) pkt_set_state(pkt, PACKET_WRITE_WAIT_STATE); spin_unlock(&pkt->lock); - pkt_dbg(2, "Writing %d frames for zone %llx\n", + pkt_dbg(2, pd, "Writing %d frames for zone %llx\n", pkt->write_size, (unsigned long long)pkt->sector); if (test_bit(PACKET_MERGE_SEGS, &pd->flags) || (pkt->write_size < pkt->frames)) { @@ -1355,7 +1354,7 @@ static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data { int uptodate; - pkt_dbg(2, "pkt %d\n", pkt->id); + pkt_dbg(2, pd, "pkt %d\n", pkt->id); for (;;) { switch (pkt->state) { @@ -1394,7 +1393,7 @@ static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data if (pkt_start_recovery(pkt)) { pkt_start_write(pd, pkt); } else { - pkt_dbg(2, "No recovery possible\n"); + pkt_dbg(2, pd, "No recovery possible\n"); pkt_set_state(pkt, PACKET_FINISHED_STATE); } break; @@ -1415,8 +1414,6 @@ static void pkt_handle_packets(struct pktcdvd_device *pd) { struct packet_data *pkt, *next; - pkt_dbg(2, "\n"); - /* * Run state machine for active packets */ @@ -1498,7 +1495,7 @@ static int kcdrwd(void *foobar) if (PACKET_DEBUG > 1) { int states[PACKET_NUM_STATES]; pkt_count_states(pd, states); - pkt_dbg(2, "i:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n", + pkt_dbg(2, pd, "i:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n", states[0], states[1], states[2], states[3], states[4], states[5]); } @@ -1509,9 +1506,9 @@ static int kcdrwd(void *foobar) min_sleep_time = pkt->sleep_time; } - pkt_dbg(2, "sleeping\n"); + pkt_dbg(2, pd, "sleeping\n"); residue = schedule_timeout(min_sleep_time); - pkt_dbg(2, "wake up\n"); + pkt_dbg(2, pd, "wake up\n"); /* make swsusp happy with our thread */ try_to_freeze(); @@ -1808,7 +1805,7 @@ static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di) case 0x12: /* DVD-RAM */ return 1; default: - pkt_dbg(2, "Wrong disc profile (%x)\n", + pkt_dbg(2, pd, "Wrong disc profile (%x)\n", pd->mmc3_profile); return 0; } @@ -2124,7 +2121,7 @@ static noinline_for_stack int 
pkt_perform_opc(struct pktcdvd_device *pd) struct request_sense sense; int ret; - pkt_dbg(2, "Performing OPC\n"); + pkt_dbg(2, pd, "Performing OPC\n"); init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); cgc.sense = &sense; @@ -2142,12 +2139,12 @@ static int pkt_open_write(struct pktcdvd_device *pd) unsigned int write_speed, media_write_speed, read_speed; if ((ret = pkt_probe_settings(pd))) { - pkt_dbg(2, "%s failed probe\n", pd->name); + pkt_dbg(2, pd, "failed probe\n"); return ret; } if ((ret = pkt_set_write_settings(pd))) { - pkt_dbg(1, "%s failed saving write settings\n", pd->name); + pkt_dbg(1, pd, "failed saving write settings\n"); return -EIO; } @@ -2159,26 +2156,26 @@ static int pkt_open_write(struct pktcdvd_device *pd) case 0x13: /* DVD-RW */ case 0x1a: /* DVD+RW */ case 0x12: /* DVD-RAM */ - pkt_dbg(1, "write speed %ukB/s\n", write_speed); + pkt_dbg(1, pd, "write speed %ukB/s\n", write_speed); break; default: if ((ret = pkt_media_speed(pd, &media_write_speed))) media_write_speed = 16; write_speed = min(write_speed, media_write_speed * 177); - pkt_dbg(1, "write speed %ux\n", write_speed / 176); + pkt_dbg(1, pd, "write speed %ux\n", write_speed / 176); break; } read_speed = write_speed; if ((ret = pkt_set_speed(pd, write_speed, read_speed))) { - pkt_dbg(1, "%s couldn't set write speed\n", pd->name); + pkt_dbg(1, pd, "couldn't set write speed\n"); return -EIO; } pd->write_speed = write_speed; pd->read_speed = read_speed; if ((ret = pkt_perform_opc(pd))) { - pkt_dbg(1, "%s Optimum Power Calibration failed\n", pd->name); + pkt_dbg(1, pd, "Optimum Power Calibration failed\n"); } return 0; @@ -2255,7 +2252,7 @@ out: static void pkt_release_dev(struct pktcdvd_device *pd, int flush) { if (flush && pkt_flush_cache(pd)) - pkt_dbg(1, "%s not flushing cache\n", pd->name); + pkt_dbg(1, pd, "not flushing cache\n"); pkt_lock_door(pd, 0); @@ -2277,8 +2274,6 @@ static int pkt_open(struct block_device *bdev, fmode_t mode) struct pktcdvd_device *pd = NULL; int ret; - pkt_dbg(2, "entering\n"); - mutex_lock(&pktcdvd_mutex); mutex_lock(&ctl_mutex); pd = pkt_find_dev_from_minor(MINOR(bdev->bd_dev)); @@ -2313,7 +2308,6 @@ static int pkt_open(struct block_device *bdev, fmode_t mode) out_dec: pd->refcnt--; out: - pkt_dbg(2, "failed (%d)\n", ret); mutex_unlock(&ctl_mutex); mutex_unlock(&pktcdvd_mutex); return ret; @@ -2394,7 +2388,7 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) blk_queue_bounce(q, &bio); zone = get_zone(bio->bi_sector, pd); - pkt_dbg(2, "start = %6llx stop = %6llx\n", + pkt_dbg(2, pd, "start = %6llx stop = %6llx\n", (unsigned long long)bio->bi_sector, (unsigned long long)bio_end_sector(bio)); @@ -2649,7 +2643,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) } proc_create_data(pd->name, 0, pkt_proc, &pkt_proc_fops, pd); - pkt_dbg(1, "writer %s mapped to %s\n", pd->name, bdevname(bdev, b)); + pkt_dbg(1, pd, "writer mapped to %s\n", bdevname(bdev, b)); return 0; out_mem: @@ -2664,7 +2658,7 @@ static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, struct pktcdvd_device *pd = bdev->bd_disk->private_data; int ret; - pkt_dbg(2, "cmd %x, dev %d:%d\n", + pkt_dbg(2, pd, "cmd %x, dev %d:%d\n", cmd, MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); mutex_lock(&pktcdvd_mutex); @@ -2689,7 +2683,7 @@ static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, break; default: - pkt_dbg(2, "Unknown ioctl for %s (%x)\n", pd->name, cmd); + pkt_dbg(2, pd, "Unknown ioctl (%x)\n", cmd); ret = -ENOTTY; } mutex_unlock(&pktcdvd_mutex); 
@@ -2838,7 +2832,7 @@ static int pkt_remove_dev(dev_t pkt_dev) break; } if (idx == MAX_WRITERS) { - pkt_dbg(1, "dev not setup\n"); + pkt_dbg(1, pd, "dev not setup\n"); ret = -ENXIO; goto out; } @@ -2858,7 +2852,7 @@ static int pkt_remove_dev(dev_t pkt_dev) blkdev_put(pd->bdev, FMODE_READ | FMODE_NDELAY); remove_proc_entry(pd->name, pkt_proc); - pkt_dbg(1, "writer %s unmapped\n", pd->name); + pkt_dbg(1, pd, "writer unmapped\n"); del_gendisk(pd->disk); blk_cleanup_queue(pd->disk->queue); From fa63c0ab81f55eb5a016c1bcea04fe39c14afbaa Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:25:55 -0700 Subject: [PATCH 272/303] pktcdvd: add struct pktcdvd_device.name to pr_err logging where possible Add a new pkt_err macro to prefix the name to the logging output. Convert pr_err where there is a non-null struct pktcdvd_device. Includes improvements from Andy Shevchenko. Signed-off-by: Joe Perches Cc: Andy Shevchenko Cc: Jiri Kosina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/pktcdvd.c | 44 ++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 5050c7abb6ee..3dbe42fe1bdd 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -71,6 +71,9 @@ #define DRIVER_NAME "pktcdvd" +#define pkt_err(pd, fmt, ...) \ + pr_err("%s: " fmt, pd->name, ##__VA_ARGS__) + #define pkt_dbg(level, pd, fmt, ...) \ do { \ if (level == 2 && PACKET_DEBUG >= 2) \ @@ -938,7 +941,7 @@ static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_que set_bit(PACKET_MERGE_SEGS, &pd->flags); return 0; } else { - pr_err("cdrom max_phys_segments too small\n"); + pkt_err(pd, "cdrom max_phys_segments too small\n"); return -EIO; } } @@ -1743,7 +1746,7 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) /* * paranoia */ - pr_err("write mode wrong %d\n", wp->data_block_type); + pkt_err(pd, "write mode wrong %d\n", wp->data_block_type); return 1; } wp->packet_size = cpu_to_be32(pd->settings.size >> 2); @@ -1787,7 +1790,7 @@ static int pkt_writable_track(struct pktcdvd_device *pd, track_information *ti) if (ti->rt == 1 && ti->blank == 0) return 1; - pr_err("bad state %d-%d-%d\n", ti->rt, ti->blank, ti->packet); + pkt_err(pd, "bad state %d-%d-%d\n", ti->rt, ti->blank, ti->packet); return 0; } @@ -1820,7 +1823,7 @@ static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di) } if (di->disc_type != 0x20 && di->disc_type != 0) { - pr_err("wrong disc type (%x)\n", di->disc_type); + pkt_err(pd, "wrong disc type (%x)\n", di->disc_type); return 0; } @@ -1830,7 +1833,7 @@ static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di) } if (di->border_status == PACKET_SESSION_RESERVED) { - pr_err("can't write to last track (reserved)\n"); + pkt_err(pd, "can't write to last track (reserved)\n"); return 0; } @@ -1855,7 +1858,7 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) memset(&ti, 0, sizeof(track_information)); if ((ret = pkt_get_disc_info(pd, &di))) { - pr_err("failed get_disc\n"); + pkt_err(pd, "failed get_disc\n"); return ret; } @@ -1866,12 +1869,12 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) track = 1; /* (di.last_track_msb << 8) | di.last_track_lsb; */ if ((ret = pkt_get_track_info(pd, track, 1, &ti))) { - pr_err("failed get_track\n"); + pkt_err(pd, "failed get_track\n"); return ret; } if (!pkt_writable_track(pd, &ti)) { - pr_err("can't 
write to this track\n"); + pkt_err(pd, "can't write to this track\n"); return -EROFS; } @@ -1885,7 +1888,7 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) return -ENXIO; } if (pd->settings.size > PACKET_MAX_SECTORS) { - pr_err("packet size is too big\n"); + pkt_err(pd, "packet size is too big\n"); return -EROFS; } pd->settings.fp = ti.fp; @@ -1927,7 +1930,7 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) pd->settings.block_mode = PACKET_BLOCK_MODE2; break; default: - pr_err("unknown data mode\n"); + pkt_err(pd, "unknown data mode\n"); return -EROFS; } return 0; @@ -1961,7 +1964,7 @@ static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd, cgc.buflen = cgc.cmd[8] = 2 + ((buf[0] << 8) | (buf[1] & 0xff)); ret = pkt_mode_select(pd, &cgc); if (ret) { - pr_err("write caching control failed\n"); + pkt_err(pd, "write caching control failed\n"); pkt_dump_sense(&cgc); } else if (!ret && set) pr_notice("enabled write caching on %s\n", pd->name); @@ -2200,7 +2203,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) goto out; if ((ret = pkt_get_last_written(pd, &lba))) { - pr_err("pkt_get_last_written failed\n"); + pkt_err(pd, "pkt_get_last_written failed\n"); goto out_putdev; } @@ -2230,7 +2233,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) if (write) { if (!pkt_grow_pktlist(pd, CONFIG_CDROM_PKTCDVD_BUFFERS)) { - pr_err("not enough memory for buffers\n"); + pkt_err(pd, "not enough memory for buffers\n"); ret = -ENOMEM; goto out_putdev; } @@ -2352,8 +2355,8 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) pd = q->queuedata; if (!pd) { - pr_err("%s incorrect request queue\n", - bdevname(bio->bi_bdev, b)); + pkt_err(pd, "%s incorrect request queue\n", + bdevname(bio->bi_bdev, b)); goto end_io; } @@ -2381,7 +2384,7 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) } if (!bio->bi_size || (bio->bi_size % CD_FRAMESIZE)) { - pr_err("wrong bio size\n"); + pkt_err(pd, "wrong bio size\n"); goto end_io; } @@ -2602,7 +2605,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) struct block_device *bdev; if (pd->pkt_dev == dev) { - pr_err("recursive setup not allowed\n"); + pkt_err(pd, "recursive setup not allowed\n"); return -EBUSY; } for (i = 0; i < MAX_WRITERS; i++) { @@ -2610,11 +2613,12 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) if (!pd2) continue; if (pd2->bdev->bd_dev == dev) { - pr_err("%s already setup\n", bdevname(pd2->bdev, b)); + pkt_err(pd, "%s already setup\n", + bdevname(pd2->bdev, b)); return -EBUSY; } if (pd2->pkt_dev == dev) { - pr_err("can't chain pktcdvd devices\n"); + pkt_err(pd, "can't chain pktcdvd devices\n"); return -EBUSY; } } @@ -2637,7 +2641,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) atomic_set(&pd->cdrw.pending_bios, 0); pd->cdrw.thread = kthread_run(kcdrwd, pd, "%s", pd->name); if (IS_ERR(pd->cdrw.thread)) { - pr_err("can't start kernel thread\n"); + pkt_err(pd, "can't start kernel thread\n"); ret = -ENOMEM; goto out_mem; } From ca73dabc3d1df4aaacb242b6d4e39cd36618444f Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:25:56 -0700 Subject: [PATCH 273/303] pktcdvd: convert pr_notice to pkt_notice Add a new pkt_notice macro to prefix the name to the logging output. 
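For illustration, with this macro a call such as:

	pkt_notice(pd, "disc not erasable\n");

preprocesses to:

	pr_notice("%s: " "disc not erasable\n", pd->name);

The adjacent string literals are concatenated at compile time, and the
GNU ##__VA_ARGS__ extension drops the trailing comma when no extra
arguments are passed.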
Signed-off-by: Joe Perches Cc: Jiri Kosina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/pktcdvd.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 3dbe42fe1bdd..136a04c3a07a 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -73,6 +73,8 @@ #define pkt_err(pd, fmt, ...) \ pr_err("%s: " fmt, pd->name, ##__VA_ARGS__) +#define pkt_notice(pd, fmt, ...) \ + pr_notice("%s: " fmt, pd->name, ##__VA_ARGS__) #define pkt_dbg(level, pd, fmt, ...) \ do { \ @@ -1818,7 +1820,7 @@ static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di) * but i'm not sure, should we leave this to user apps? probably. */ if (di->disc_type == 0xff) { - pr_notice("unknown disc - no track?\n"); + pkt_notice(pd, "unknown disc - no track?\n"); return 0; } @@ -1828,7 +1830,7 @@ static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di) } if (di->erasable == 0) { - pr_notice("disc not erasable\n"); + pkt_notice(pd, "disc not erasable\n"); return 0; } @@ -1884,7 +1886,7 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) */ pd->settings.size = be32_to_cpu(ti.fixed_packet_size) << 2; if (pd->settings.size == 0) { - pr_notice("detected zero packet size!\n"); + pkt_notice(pd, "detected zero packet size!\n"); return -ENXIO; } if (pd->settings.size > PACKET_MAX_SECTORS) { @@ -1967,7 +1969,7 @@ static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd, pkt_err(pd, "write caching control failed\n"); pkt_dump_sense(&cgc); } else if (!ret && set) - pr_notice("enabled write caching on %s\n", pd->name); + pkt_notice(pd, "enabled write caching\n"); return ret; } @@ -2082,11 +2084,11 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd, } if (!(buf[6] & 0x40)) { - pr_notice("disc type is not CD-RW\n"); + pkt_notice(pd, "disc type is not CD-RW\n"); return 1; } if (!(buf[6] & 0x4)) { - pr_notice("A1 values on media are not valid, maybe not CDRW?\n"); + pkt_notice(pd, "A1 values on media are not valid, maybe not CDRW?\n"); return 1; } @@ -2106,14 +2108,14 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd, *speed = us_clv_to_speed[sp]; break; default: - pr_notice("unknown disc sub-type %d\n", st); + pkt_notice(pd, "unknown disc sub-type %d\n", st); return 1; } if (*speed) { pr_info("maximum media speed: %d\n", *speed); return 0; } else { - pr_notice("unknown speed %d for sub-type %d\n", sp, st); + pkt_notice(pd, "unknown speed %d for sub-type %d\n", sp, st); return 1; } } @@ -2378,8 +2380,8 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) } if (!test_bit(PACKET_WRITABLE, &pd->flags)) { - pr_notice("WRITE for ro device %s (%llu)\n", - pd->name, (unsigned long long)bio->bi_sector); + pkt_notice(pd, "WRITE for ro device (%llu)\n", + (unsigned long long)bio->bi_sector); goto end_io; } From 0c075d64df3aa9636a8700a710a2f2ada5e453a2 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:25:57 -0700 Subject: [PATCH 274/303] pktcdvd: convert pr_info to pkt_info Add a new pkt_info macro to prefix the name to the logging output. 
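For example, the conversion below turns:

	pr_info("maximum media speed: %d\n", *speed);

into:

	pkt_info(pd, "maximum media speed: %d\n", *speed);

which expands to:

	pr_info("%s: " "maximum media speed: %d\n", pd->name, *speed);

so the device-name prefix costs nothing at run time.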
Signed-off-by: Joe Perches Cc: Jiri Kosina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/pktcdvd.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 136a04c3a07a..1ceafb70fc16 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -75,6 +75,8 @@ pr_err("%s: " fmt, pd->name, ##__VA_ARGS__) #define pkt_notice(pd, fmt, ...) \ pr_notice("%s: " fmt, pd->name, ##__VA_ARGS__) +#define pkt_info(pd, fmt, ...) \ + pr_info("%s: " fmt, pd->name, ##__VA_ARGS__) #define pkt_dbg(level, pd, fmt, ...) \ do { \ @@ -1561,10 +1563,10 @@ work_to_do: static void pkt_print_settings(struct pktcdvd_device *pd) { - pr_info("%s packets, %u blocks, Mode-%c disc\n", - pd->settings.fp ? "Fixed" : "Variable", - pd->settings.size >> 2, - pd->settings.block_mode == 8 ? '1' : '2'); + pkt_info(pd, "%s packets, %u blocks, Mode-%c disc\n", + pd->settings.fp ? "Fixed" : "Variable", + pd->settings.size >> 2, + pd->settings.block_mode == 8 ? '1' : '2'); } static int pkt_mode_sense(struct pktcdvd_device *pd, struct packet_command *cgc, int page_code, int page_control) @@ -2112,7 +2114,7 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd, return 1; } if (*speed) { - pr_info("maximum media speed: %d\n", *speed); + pkt_info(pd, "maximum media speed: %d\n", *speed); return 0; } else { pkt_notice(pd, "unknown speed %d for sub-type %d\n", sp, st); @@ -2239,7 +2241,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) ret = -ENOMEM; goto out_putdev; } - pr_info("%lukB available on disc\n", lba << 1); + pkt_info(pd, "%lukB available on disc\n", lba << 1); } return 0; From f3ded788bbd95af344a34ae40d1ab578cb76c7fb Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:25:58 -0700 Subject: [PATCH 275/303] pktcdvd: add struct pktcdvd_device * to pkt_dump_sense() Allow the device name to be emitted with pkt_err when logging the sense data. Signed-off-by: Joe Perches Cc: Jiri Kosina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/pktcdvd.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 1ceafb70fc16..29a5194ab147 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -755,17 +755,18 @@ static const char *sense_key_string(__u8 index) * A generic sense dump / resolve mechanism should be implemented across * all ATAPI + SCSI devices. 
*/ -static void pkt_dump_sense(struct packet_command *cgc) +static void pkt_dump_sense(struct pktcdvd_device *pd, + struct packet_command *cgc) { struct request_sense *sense = cgc->sense; if (sense) - pr_err("%*ph - sense %02x.%02x.%02x (%s)\n", - CDROM_PACKET_SIZE, cgc->cmd, - sense->sense_key, sense->asc, sense->ascq, - sense_key_string(sense->sense_key)); + pkt_err(pd, "%*ph - sense %02x.%02x.%02x (%s)\n", + CDROM_PACKET_SIZE, cgc->cmd, + sense->sense_key, sense->asc, sense->ascq, + sense_key_string(sense->sense_key)); else - pr_err("%*ph - no sense\n", CDROM_PACKET_SIZE, cgc->cmd); + pkt_err(pd, "%*ph - no sense\n", CDROM_PACKET_SIZE, cgc->cmd); } /* @@ -808,7 +809,7 @@ static noinline_for_stack int pkt_set_speed(struct pktcdvd_device *pd, cgc.cmd[5] = write_speed & 0xff; if ((ret = pkt_generic_packet(pd, &cgc))) - pkt_dump_sense(&cgc); + pkt_dump_sense(pd, &cgc); return ret; } @@ -1700,7 +1701,7 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) init_cdrom_command(&cgc, buffer, sizeof(*wp), CGC_DATA_READ); cgc.sense = &sense; if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0))) { - pkt_dump_sense(&cgc); + pkt_dump_sense(pd, &cgc); return ret; } @@ -1715,7 +1716,7 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) init_cdrom_command(&cgc, buffer, size, CGC_DATA_READ); cgc.sense = &sense; if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0))) { - pkt_dump_sense(&cgc); + pkt_dump_sense(pd, &cgc); return ret; } @@ -1757,7 +1758,7 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) cgc.buflen = cgc.cmd[8] = size; if ((ret = pkt_mode_select(pd, &cgc))) { - pkt_dump_sense(&cgc); + pkt_dump_sense(pd, &cgc); return ret; } @@ -1969,7 +1970,7 @@ static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd, ret = pkt_mode_select(pd, &cgc); if (ret) { pkt_err(pd, "write caching control failed\n"); - pkt_dump_sense(&cgc); + pkt_dump_sense(pd, &cgc); } else if (!ret && set) pkt_notice(pd, "enabled write caching\n"); return ret; @@ -2007,7 +2008,7 @@ static noinline_for_stack int pkt_get_max_speed(struct pktcdvd_device *pd, sizeof(struct mode_page_header); ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0); if (ret) { - pkt_dump_sense(&cgc); + pkt_dump_sense(pd, &cgc); return ret; } } @@ -2066,7 +2067,7 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd, cgc.cmd[8] = 2; ret = pkt_generic_packet(pd, &cgc); if (ret) { - pkt_dump_sense(&cgc); + pkt_dump_sense(pd, &cgc); return ret; } size = ((unsigned int) buf[0]<<8) + buf[1] + 2; @@ -2081,7 +2082,7 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd, cgc.cmd[8] = size; ret = pkt_generic_packet(pd, &cgc); if (ret) { - pkt_dump_sense(&cgc); + pkt_dump_sense(pd, &cgc); return ret; } @@ -2136,7 +2137,7 @@ static noinline_for_stack int pkt_perform_opc(struct pktcdvd_device *pd) cgc.cmd[0] = GPCMD_SEND_OPC; cgc.cmd[1] = 1; if ((ret = pkt_generic_packet(pd, &cgc))) - pkt_dump_sense(&cgc); + pkt_dump_sense(pd, &cgc); return ret; } From 666dc7c90ab2531f7f0f13ef21afab8cd61478a9 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:25:59 -0700 Subject: [PATCH 276/303] pktcdvd: fix defective misuses of pkt_ Fix thinkos where pkt_ needs a valid pktcdvd_device * and the pointer is known to be NULL. Signed-off-by: Joe Perches Reported-by: Dan Carpenter (go smatch!) 
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/block/pktcdvd.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 29a5194ab147..56188475cfd3 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2360,8 +2360,8 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio)
 	pd = q->queuedata;
 	if (!pd) {
-		pkt_err(pd, "%s incorrect request queue\n",
-			bdevname(bio->bi_bdev, b));
+		pr_err("%s incorrect request queue\n",
+		       bdevname(bio->bi_bdev, b));
 		goto end_io;
 	}
@@ -2841,7 +2841,7 @@ static int pkt_remove_dev(dev_t pkt_dev)
 			break;
 	}
 	if (idx == MAX_WRITERS) {
-		pkt_dbg(1, pd, "dev not setup\n");
+		pr_debug("dev not setup\n");
 		ret = -ENXIO;
 		goto out;
 	}

From b67fb086f38c67c0b940d9c2661f14b44c39e67a Mon Sep 17 00:00:00 2001
From: Jingoo Han
Date: Wed, 11 Sep 2013 14:26:00 -0700
Subject: [PATCH 277/303] drivers/pps/clients/pps-gpio.c: remove unnecessary
 platform_set_drvdata()

The driver core clears the driver data to NULL after device_release or
on probe failure. Thus, there is no need to manually clear the device
driver data to NULL.

Signed-off-by: Jingoo Han
Cc: Rodolfo Giometti
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/pps/clients/pps-gpio.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/pps/clients/pps-gpio.c b/drivers/pps/clients/pps-gpio.c
index eae0eda9ff39..9966124ad988 100644
--- a/drivers/pps/clients/pps-gpio.c
+++ b/drivers/pps/clients/pps-gpio.c
@@ -184,7 +184,6 @@ static int pps_gpio_remove(struct platform_device *pdev)
 {
 	struct pps_gpio_device_data *data = platform_get_drvdata(pdev);
 
-	platform_set_drvdata(pdev, NULL);
 	pps_unregister_source(data->pps);
 	dev_info(&pdev->dev, "removed IRQ %d as PPS source\n", data->irq);
 	return 0;

From ccf5a04f70cc2f64c145727de0f049cc67c43ece Mon Sep 17 00:00:00 2001
From: Jingoo Han
Date: Wed, 11 Sep 2013 14:26:01 -0700
Subject: [PATCH 278/303] drivers/memstick/host/rtsx_pci_ms.c: remove
 unnecessary platform_set_drvdata()

The driver core clears the driver data to NULL after device_release or
on probe failure. Thus, there is no need to manually clear the device
driver data to NULL.

Signed-off-by: Jingoo Han
Cc: Maxim Levitsky
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/memstick/host/rtsx_pci_ms.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/memstick/host/rtsx_pci_ms.c b/drivers/memstick/host/rtsx_pci_ms.c
index cf8bd727dfc7..25f8f93decb6 100644
--- a/drivers/memstick/host/rtsx_pci_ms.c
+++ b/drivers/memstick/host/rtsx_pci_ms.c
@@ -612,8 +612,6 @@ static int rtsx_pci_ms_drv_remove(struct platform_device *pdev)
 	memstick_remove_host(msh);
 	memstick_free_host(msh);
 
-	platform_set_drvdata(pdev, NULL);
-
 	dev_dbg(&(pdev->dev),
 		": Realtek PCI-E Memstick controller has been removed\n");

From 0ab30494bc4f3bc1ea4659b7c5d97c5218554a63 Mon Sep 17 00:00:00 2001
From: Maxim Levitsky
Date: Wed, 11 Sep 2013 14:26:02 -0700
Subject: [PATCH 279/303] memstick: add support for legacy memorysticks

Based partially on MS standard spec quotes from Alex Dubov.

As with any code that works with user data, this driver is not
recommended for writing cards that contain valuable data. It does,
however, try its best to avoid data corruption and possible damage to
the card.

Tested on an MS DUO 64 MB card with a Ricoh R592 card reader.
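As a usage sketch (illustrative only; 'src' and the sizes are made up),
the scatterlist helper msb_sg_copy() added below maps a window of an
existing chain into a destination list:

	struct scatterlist dst[2];
	size_t copied;

	sg_init_table(dst, 2);
	/* map 512 bytes, starting 256 bytes into 'src', onto 'dst' */
	copied = msb_sg_copy(src, dst, 2, 256, 512);

'copied' reports how many bytes were actually mapped into 'dst'.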
Signed-off-by: Maxim Levitsky
Cc: Valdis Kletnieks
Cc: Jens Axboe
Cc: Alex Dubov
Cc: Tejun Heo
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 MAINTAINERS                      |    5 +
 drivers/memstick/core/Kconfig    |   12 +
 drivers/memstick/core/Makefile   |    2 +-
 drivers/memstick/core/ms_block.c | 2385 ++++++++++++++++++++++++++++++
 drivers/memstick/core/ms_block.h |  290 ++++
 5 files changed, 2693 insertions(+), 1 deletion(-)
 create mode 100644 drivers/memstick/core/ms_block.c
 create mode 100644 drivers/memstick/core/ms_block.h

diff --git a/MAINTAINERS b/MAINTAINERS
index f8c41ae6c9a4..55969f1f626f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7749,6 +7749,11 @@ W: http://tifmxx.berlios.de/
 S:	Maintained
 F:	drivers/memstick/host/tifm_ms.c
 
+SONY MEMORYSTICK STANDARD SUPPORT
+M:	Maxim Levitsky
+S:	Maintained
+F:	drivers/memstick/core/ms_block.*
+
 SOUND
 M:	Jaroslav Kysela
 M:	Takashi Iwai

diff --git a/drivers/memstick/core/Kconfig b/drivers/memstick/core/Kconfig
index 95f1814b5368..1d389491d5fd 100644
--- a/drivers/memstick/core/Kconfig
+++ b/drivers/memstick/core/Kconfig
@@ -24,3 +24,15 @@ config MSPRO_BLOCK
 	  support. This provides a block device driver, which you can use
 	  to mount the filesystem. Almost everyone wishing MemoryStick
 	  support should say Y or M here.
+
+config MS_BLOCK
+	tristate "MemoryStick Standard device driver"
+	depends on BLOCK
+	help
+	  Say Y here to enable the MemoryStick Standard device driver
+	  support. This provides a block device driver, which you can use
+	  to mount the filesystem.
+	  This driver works with old (bulky) MemoryStick and MemoryStick Duo
+	  but not PRO. Say Y if you have such a card.
+	  The driver is new and not yet well tested, thus it can damage your
+	  card (even permanently).

diff --git a/drivers/memstick/core/Makefile b/drivers/memstick/core/Makefile
index ecd029937738..0d7f90c0ff25 100644
--- a/drivers/memstick/core/Makefile
+++ b/drivers/memstick/core/Makefile
@@ -3,5 +3,5 @@
 #
 
 obj-$(CONFIG_MEMSTICK)		+= memstick.o
-
+obj-$(CONFIG_MS_BLOCK)		+= ms_block.o
 obj-$(CONFIG_MSPRO_BLOCK)	+= mspro_block.o

diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c
new file mode 100644
index 000000000000..08e70232062f
--- /dev/null
+++ b/drivers/memstick/core/ms_block.c
@@ -0,0 +1,2385 @@
+/*
+ * ms_block.c - Sony MemoryStick (legacy) storage support
+ *
+ * Copyright (C) 2013 Maxim Levitsky
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Minor portions of the driver were copied from mspro_block.c which is
+ * Copyright (C) 2007 Alex Dubov
+ *
+ */
+#define DRIVER_NAME "ms_block"
+#define pr_fmt(fmt) DRIVER_NAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/blkdev.h>
+#include <linux/memstick.h>
+#include <linux/idr.h>
+#include <linux/hdreg.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/bitmap.h>
+#include <linux/scatterlist.h>
+#include <linux/jiffies.h>
+#include <linux/workqueue.h>
+#include <linux/mutex.h>
+#include "ms_block.h"
+
+static int debug;
+static int cache_flush_timeout = 1000;
+static bool verify_writes;
+
+#define dbg(format, ...) \
+	do { \
+		if (debug) \
+			pr_err(format "\n", ##__VA_ARGS__); \
+	} while (0)
+
+#define dbg_verbose(format, ...) \
+	do { \
+		if (debug > 1) \
+			pr_err(format "\n", ##__VA_ARGS__); \
+	} while (0)
+
+static int h_msb_default_bad(struct memstick_dev *card,
+			     struct memstick_request **mrq);
+
+/*
+ * Copies a section of 'sg_from' starting at offset 'offset' and with length
+ * 'len' to another scatterlist of to_nents entries
+ */
+static size_t msb_sg_copy(struct scatterlist *sg_from,
+	struct scatterlist *sg_to, int to_nents, size_t offset, size_t len)
+{
+	size_t copied = 0;
+
+	while (offset > 0) {
+		if (offset >= sg_from->length) {
+			if (sg_is_last(sg_from))
+				return 0;
+
+			offset -= sg_from->length;
+			sg_from = sg_next(sg_from);
+			continue;
+		}
+
+		copied = min(len, sg_from->length - offset);
+		sg_set_page(sg_to, sg_page(sg_from),
+			copied, sg_from->offset + offset);
+
+		len -= copied;
+		offset = 0;
+
+		if (sg_is_last(sg_from) || !len)
+			goto out;
+
+		sg_to = sg_next(sg_to);
+		to_nents--;
+		sg_from = sg_next(sg_from);
+	}
+
+	while (len > sg_from->length && to_nents--) {
+		len -= sg_from->length;
+		copied += sg_from->length;
+
+		sg_set_page(sg_to, sg_page(sg_from),
+			sg_from->length, sg_from->offset);
+
+		if (sg_is_last(sg_from) || !len)
+			goto out;
+
+		sg_from = sg_next(sg_from);
+		sg_to = sg_next(sg_to);
+	}
+
+	if (len && to_nents) {
+		sg_set_page(sg_to, sg_page(sg_from), len, sg_from->offset);
+		copied += len;
+	}
+out:
+	sg_mark_end(sg_to);
+	return copied;
+}
+
+/*
+ * Compares a section of 'sg' starting at offset 'offset' and with length
+ * 'len' to a linear buffer of length 'len' at address 'buffer'
+ * Returns 0 if equal and -1 otherwise
+ */
+static int msb_sg_compare_to_buffer(struct scatterlist *sg,
+	size_t offset, u8 *buffer, size_t len)
+{
+	int retval = 0, cmplen;
+	struct sg_mapping_iter miter;
+
+	sg_miter_start(&miter, sg, sg_nents(sg),
+			SG_MITER_ATOMIC | SG_MITER_FROM_SG);
+
+	while (sg_miter_next(&miter) && len > 0) {
+		if (offset >= miter.length) {
+			offset -= miter.length;
+			continue;
+		}
+
+		cmplen = min(miter.length - offset, len);
+		retval = memcmp(miter.addr + offset, buffer, cmplen) ? -1 : 0;
+		if (retval)
+			break;
+
+		buffer += cmplen;
+		len -= cmplen;
+		offset = 0;
+	}
+
+	if (!retval && len)
+		retval = -1;
+
+	sg_miter_stop(&miter);
+	return retval;
+}
+
+
+/* Get the zone in which the block with logical address 'lba' lives.
+ * Flash is broken into zones.
+ * Each zone consists of 512 eraseblocks; the first zone holds 494
+ * logical blocks and every following zone holds 496.
+ * Therefore zone #0 hosts blocks 0-493, zone #1 blocks 494-989, etc...
+*/
+static int msb_get_zone_from_lba(int lba)
+{
+	if (lba < 494)
+		return 0;
+	return ((lba - 494) / 496) + 1;
+}
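+
+/*
+ * Worked example for the zoning rules above: lba 493 is the last block
+ * of zone 0; lba 494 maps to ((494 - 494) / 496) + 1 = zone 1; and
+ * lba 990 maps to ((990 - 494) / 496) + 1 = zone 2.
+ */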
+
+/* Get the zone of a physical block. Trivial */
+static int msb_get_zone_from_pba(int pba)
+{
+	return pba / MS_BLOCKS_IN_ZONE;
+}
+
+/* Debug test to validate free block counts */
+static int msb_validate_used_block_bitmap(struct msb_data *msb)
+{
+	int total_free_blocks = 0;
+	int i;
+
+	if (!debug)
+		return 0;
+
+	for (i = 0; i < msb->zone_count; i++)
+		total_free_blocks += msb->free_block_count[i];
+
+	if (msb->block_count - bitmap_weight(msb->used_blocks_bitmap,
+					msb->block_count) == total_free_blocks)
+		return 0;
+
+	pr_err("BUG: free block counts don't match the bitmap");
+	msb->read_only = true;
+	return -EINVAL;
+}
+
+/* Mark a physical block as used */
+static void msb_mark_block_used(struct msb_data *msb, int pba)
+{
+	int zone = msb_get_zone_from_pba(pba);
+
+	if (test_bit(pba, msb->used_blocks_bitmap)) {
+		pr_err("BUG: attempt to mark already used pba %d as used",
+			pba);
+		msb->read_only = true;
+		return;
+	}
+
+	if (msb_validate_used_block_bitmap(msb))
+		return;
+
+	/* No races because all IO is single threaded */
+	__set_bit(pba, msb->used_blocks_bitmap);
+	msb->free_block_count[zone]--;
+}
+
+/* Mark a physical block as free */
+static void msb_mark_block_unused(struct msb_data *msb, int pba)
+{
+	int zone = msb_get_zone_from_pba(pba);
+
+	if (!test_bit(pba, msb->used_blocks_bitmap)) {
+		pr_err("BUG: attempt to mark already unused pba %d as unused",
+			pba);
+		msb->read_only = true;
+		return;
+	}
+
+	if (msb_validate_used_block_bitmap(msb))
+		return;
+
+	/* No races because all IO is single threaded */
+	__clear_bit(pba, msb->used_blocks_bitmap);
+	msb->free_block_count[zone]++;
+}
+
+/* Invalidate current register window */
+static void msb_invalidate_reg_window(struct msb_data *msb)
+{
+	msb->reg_addr.w_offset = offsetof(struct ms_register, id);
+	msb->reg_addr.w_length = sizeof(struct ms_id_register);
+	msb->reg_addr.r_offset = offsetof(struct ms_register, id);
+	msb->reg_addr.r_length = sizeof(struct ms_id_register);
+	msb->addr_valid = false;
+}
+
+/* Start a state machine */
+static int msb_run_state_machine(struct msb_data *msb, int (*state_func)
+		(struct memstick_dev *card, struct memstick_request **req))
+{
+	struct memstick_dev *card = msb->card;
+
+	WARN_ON(msb->state != -1);
+	msb->int_polling = false;
+	msb->state = 0;
+	msb->exit_error = 0;
+
+	memset(&card->current_mrq, 0, sizeof(card->current_mrq));
+
+	card->next_request = state_func;
+	memstick_new_req(card->host);
+	wait_for_completion(&card->mrq_complete);
+
+	WARN_ON(msb->state != -1);
+	return msb->exit_error;
+}
+
+/* State machine handlers call this to exit */
+static int msb_exit_state_machine(struct msb_data *msb, int error)
+{
+	WARN_ON(msb->state == -1);
+
+	msb->state = -1;
+	msb->exit_error = error;
+	msb->card->next_request = h_msb_default_bad;
+
+	/* Invalidate reg window on errors */
+	if (error)
+		msb_invalidate_reg_window(msb);
+
+	complete(&msb->card->mrq_complete);
+	return -ENXIO;
+}
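+
+/*
+ * Typical use of the two helpers above, with one of the h_msb_*
+ * handlers defined below:
+ *
+ *	error = msb_run_state_machine(msb, h_msb_read_page);
+ *
+ * The handler advances card->current_mrq one step per invocation and
+ * finally reports its result through msb_exit_state_machine().
+ */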
+
+/* read INT register */
+static int msb_read_int_reg(struct msb_data *msb, long timeout)
+{
+	struct memstick_request *mrq = &msb->card->current_mrq;
+
+	WARN_ON(msb->state == -1);
+
+	if (!msb->int_polling) {
+		msb->int_timeout = jiffies +
+			msecs_to_jiffies(timeout == -1 ? 500 : timeout);
+		msb->int_polling = true;
+	} else if (time_after(jiffies, msb->int_timeout)) {
+		mrq->data[0] = MEMSTICK_INT_CMDNAK;
+		return 0;
+	}
+
+	if ((msb->caps & MEMSTICK_CAP_AUTO_GET_INT) &&
+				mrq->need_card_int && !mrq->error) {
+		mrq->data[0] = mrq->int_reg;
+		mrq->need_card_int = false;
+		return 0;
+	} else {
+		memstick_init_req(mrq, MS_TPC_GET_INT, NULL, 1);
+		return 1;
+	}
+}
+
+/* Read a register window */
+static int msb_read_regs(struct msb_data *msb, int offset, int len)
+{
+	struct memstick_request *req = &msb->card->current_mrq;
+
+	if (msb->reg_addr.r_offset != offset ||
+	    msb->reg_addr.r_length != len || !msb->addr_valid) {
+
+		msb->reg_addr.r_offset = offset;
+		msb->reg_addr.r_length = len;
+		msb->addr_valid = true;
+
+		memstick_init_req(req, MS_TPC_SET_RW_REG_ADRS,
+			&msb->reg_addr, sizeof(msb->reg_addr));
+		return 0;
+	}
+
+	memstick_init_req(req, MS_TPC_READ_REG, NULL, len);
+	return 1;
+}
+
+/* Write a card register */
+static int msb_write_regs(struct msb_data *msb, int offset, int len, void *buf)
+{
+	struct memstick_request *req = &msb->card->current_mrq;
+
+	if (msb->reg_addr.w_offset != offset ||
+		msb->reg_addr.w_length != len || !msb->addr_valid) {
+
+		msb->reg_addr.w_offset = offset;
+		msb->reg_addr.w_length = len;
+		msb->addr_valid = true;
+
+		memstick_init_req(req, MS_TPC_SET_RW_REG_ADRS,
+			&msb->reg_addr, sizeof(msb->reg_addr));
+		return 0;
+	}
+
+	memstick_init_req(req, MS_TPC_WRITE_REG, buf, len);
+	return 1;
+}
+
+/* Handler for the case when there is no IO to run */
+static int h_msb_default_bad(struct memstick_dev *card,
+					struct memstick_request **mrq)
+{
+	return -ENXIO;
+}
+
+/*
+ * This function is a handler for reads of one page from the device.
+ * Writes output to msb->current_sg and takes the sector address from
+ * msb->regs.param.
+ * Can also be used to read extra data only; set the params accordingly.
+ */
+static int h_msb_read_page(struct memstick_dev *card,
+					struct memstick_request **out_mrq)
+{
+	struct msb_data *msb = memstick_get_drvdata(card);
+	struct memstick_request *mrq = *out_mrq = &card->current_mrq;
+	struct scatterlist sg[2];
+	u8 command, intreg;
+
+	if (mrq->error) {
+		dbg("read_page, unknown error");
+		return msb_exit_state_machine(msb, mrq->error);
+	}
+again:
+	switch (msb->state) {
+	case MSB_RP_SEND_BLOCK_ADDRESS:
+		/* msb_write_regs sometimes "fails" because it needs to update
+		   the reg window, and thus it returns a request for that.
+		   Then we stay in this state and retry */
+		if (!msb_write_regs(msb,
+			offsetof(struct ms_register, param),
+			sizeof(struct ms_param_register),
+			(unsigned char *)&msb->regs.param))
+			return 0;
+
+		msb->state = MSB_RP_SEND_READ_COMMAND;
+		return 0;
+
+	case MSB_RP_SEND_READ_COMMAND:
+		command = MS_CMD_BLOCK_READ;
+		memstick_init_req(mrq, MS_TPC_SET_CMD, &command, 1);
+		msb->state = MSB_RP_SEND_INT_REQ;
+		return 0;
+
+	case MSB_RP_SEND_INT_REQ:
+		msb->state = MSB_RP_RECEIVE_INT_REQ_RESULT;
+		/* If we don't actually need to send the INT read request
+		   (only in serial mode), just fall through */
+		if (msb_read_int_reg(msb, -1))
+			return 0;
+		/* fallthrough */
+
+	case MSB_RP_RECEIVE_INT_REQ_RESULT:
+		intreg = mrq->data[0];
+		msb->regs.status.interrupt = intreg;
+
+		if (intreg & MEMSTICK_INT_CMDNAK)
+			return msb_exit_state_machine(msb, -EIO);
+
+		if (!(intreg & MEMSTICK_INT_CED)) {
+			msb->state = MSB_RP_SEND_INT_REQ;
+			goto again;
+		}
+
+		msb->int_polling = false;
+		msb->state = (intreg & MEMSTICK_INT_ERR) ?
+			MSB_RP_SEND_READ_STATUS_REG : MSB_RP_SEND_OOB_READ;
+		goto again;
+
+	case MSB_RP_SEND_READ_STATUS_REG:
+		/* read the status register to understand source of the
+		   INT_ERR */
+		if (!msb_read_regs(msb,
+			offsetof(struct ms_register, status),
+			sizeof(struct ms_status_register)))
+			return 0;
+
+		msb->state = MSB_RP_RECIVE_STATUS_REG;
+		return 0;
+
+	case MSB_RP_RECIVE_STATUS_REG:
+		msb->regs.status = *(struct ms_status_register *)mrq->data;
+		msb->state = MSB_RP_SEND_OOB_READ;
+		/* fallthrough */
+
+	case MSB_RP_SEND_OOB_READ:
+		if (!msb_read_regs(msb,
+			offsetof(struct ms_register, extra_data),
+			sizeof(struct ms_extra_data_register)))
+			return 0;
+
+		msb->state = MSB_RP_RECEIVE_OOB_READ;
+		return 0;
+
+	case MSB_RP_RECEIVE_OOB_READ:
+		msb->regs.extra_data =
+			*(struct ms_extra_data_register *) mrq->data;
+		msb->state = MSB_RP_SEND_READ_DATA;
+		/* fallthrough */
+
+	case MSB_RP_SEND_READ_DATA:
+		/* Skip that state if we only read the oob */
+		if (msb->regs.param.cp == MEMSTICK_CP_EXTRA) {
+			msb->state = MSB_RP_RECEIVE_READ_DATA;
+			goto again;
+		}
+
+		sg_init_table(sg, ARRAY_SIZE(sg));
+		msb_sg_copy(msb->current_sg, sg, ARRAY_SIZE(sg),
+			msb->current_sg_offset,
+			msb->page_size);
+
+		memstick_init_req_sg(mrq, MS_TPC_READ_LONG_DATA, sg);
+		msb->state = MSB_RP_RECEIVE_READ_DATA;
+		return 0;
+
+	case MSB_RP_RECEIVE_READ_DATA:
+		if (!(msb->regs.status.interrupt & MEMSTICK_INT_ERR)) {
+			msb->current_sg_offset += msb->page_size;
+			return msb_exit_state_machine(msb, 0);
+		}
+
+		if (msb->regs.status.status1 & MEMSTICK_UNCORR_ERROR) {
+			dbg("read_page: uncorrectable error");
+			return msb_exit_state_machine(msb, -EBADMSG);
+		}
+
+		if (msb->regs.status.status1 & MEMSTICK_CORR_ERROR) {
+			dbg("read_page: correctable error");
+			msb->current_sg_offset += msb->page_size;
+			return msb_exit_state_machine(msb, -EUCLEAN);
+		} else {
+			dbg("read_page: INT error, but no status error bits");
+			return msb_exit_state_machine(msb, -EIO);
+		}
+	}
+
+	BUG();
+}
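+
+/*
+ * Normal flow of the read handler above: SEND_BLOCK_ADDRESS ->
+ * SEND_READ_COMMAND -> SEND_INT_REQ -> RECEIVE_INT_REQ_RESULT ->
+ * SEND_OOB_READ -> RECEIVE_OOB_READ -> SEND_READ_DATA ->
+ * RECEIVE_READ_DATA. On MEMSTICK_INT_ERR the status register is read
+ * first, so that the right error code (-EBADMSG vs -EUCLEAN) can be
+ * picked.
+ */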
+
+/*
+ * Handler for writes of exactly one block.
+ * Takes the address from msb->regs.param and writes the accompanying
+ * extra data from msb->regs.extra_data.
+ * Returns -EBADMSG if the write fails due to an uncorrectable error, or
+ * -EIO if the device refuses to take the command or something else goes
+ * wrong.
+ */
+static int h_msb_write_block(struct memstick_dev *card,
+					struct memstick_request **out_mrq)
+{
+	struct msb_data *msb = memstick_get_drvdata(card);
+	struct memstick_request *mrq = *out_mrq = &card->current_mrq;
+	struct scatterlist sg[2];
+	u8 intreg, command;
+
+	if (mrq->error)
+		return msb_exit_state_machine(msb, mrq->error);
+
+again:
+	switch (msb->state) {
+
+	/* HACK: JMicron handling of TPCs between 8 and
+	 * sizeof(memstick_request.data) is broken due to a hardware
+	 * bug in PIO mode that is used for these TPCs.
+	 * Therefore split the write.
+	 */
+
+	case MSB_WB_SEND_WRITE_PARAMS:
+		if (!msb_write_regs(msb,
+			offsetof(struct ms_register, param),
+			sizeof(struct ms_param_register),
+			&msb->regs.param))
+			return 0;
+
+		msb->state = MSB_WB_SEND_WRITE_OOB;
+		return 0;
+
+	case MSB_WB_SEND_WRITE_OOB:
+		if (!msb_write_regs(msb,
+			offsetof(struct ms_register, extra_data),
+			sizeof(struct ms_extra_data_register),
+			&msb->regs.extra_data))
+			return 0;
+		msb->state = MSB_WB_SEND_WRITE_COMMAND;
+		return 0;
+
+
+	case MSB_WB_SEND_WRITE_COMMAND:
+		command = MS_CMD_BLOCK_WRITE;
+		memstick_init_req(mrq, MS_TPC_SET_CMD, &command, 1);
+		msb->state = MSB_WB_SEND_INT_REQ;
+		return 0;
+
+	case MSB_WB_SEND_INT_REQ:
+		msb->state = MSB_WB_RECEIVE_INT_REQ;
+		if (msb_read_int_reg(msb, -1))
+			return 0;
+		/* fallthrough */
+
+	case MSB_WB_RECEIVE_INT_REQ:
+		intreg = mrq->data[0];
+		msb->regs.status.interrupt = intreg;
+
+		/* errors mean out of here, and fast... */
+		if (intreg & (MEMSTICK_INT_CMDNAK))
+			return msb_exit_state_machine(msb, -EIO);
+
+		if (intreg & MEMSTICK_INT_ERR)
+			return msb_exit_state_machine(msb, -EBADMSG);
+
+
+		/* for the last page we need to poll CED */
+		if (msb->current_page == msb->pages_in_block) {
+			if (intreg & MEMSTICK_INT_CED)
+				return msb_exit_state_machine(msb, 0);
+			msb->state = MSB_WB_SEND_INT_REQ;
+			goto again;
+
+		}
+
+		/* for a non-last page we need BREQ before writing the next
+		   chunk */
+		if (!(intreg & MEMSTICK_INT_BREQ)) {
+			msb->state = MSB_WB_SEND_INT_REQ;
+			goto again;
+		}
+
+		msb->int_polling = false;
+		msb->state = MSB_WB_SEND_WRITE_DATA;
+		/* fallthrough */
+
+	case MSB_WB_SEND_WRITE_DATA:
+		sg_init_table(sg, ARRAY_SIZE(sg));
+
+		if (msb_sg_copy(msb->current_sg, sg, ARRAY_SIZE(sg),
+			msb->current_sg_offset,
+			msb->page_size) < msb->page_size)
+			return msb_exit_state_machine(msb, -EIO);
+
+		memstick_init_req_sg(mrq, MS_TPC_WRITE_LONG_DATA, sg);
+		mrq->need_card_int = 1;
+		msb->state = MSB_WB_RECEIVE_WRITE_CONFIRMATION;
+		return 0;
+
+	case MSB_WB_RECEIVE_WRITE_CONFIRMATION:
+		msb->current_page++;
+		msb->current_sg_offset += msb->page_size;
+		msb->state = MSB_WB_SEND_INT_REQ;
+		goto again;
+	default:
+		BUG();
+	}
+
+	return 0;
+}
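+
+/*
+ * The write handler above feeds the card one page per loop: each
+ * MSB_WB_RECEIVE_WRITE_CONFIRMATION advances current_page and
+ * current_sg_offset, and CED is polled only once the last page of the
+ * block has been sent.
+ */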
+
+/*
+ * This function is used to send simple IO requests to the device that
+ * consist of a register write + command
+ */
+static int h_msb_send_command(struct memstick_dev *card,
+					struct memstick_request **out_mrq)
+{
+	struct msb_data *msb = memstick_get_drvdata(card);
+	struct memstick_request *mrq = *out_mrq = &card->current_mrq;
+	u8 intreg;
+
+	if (mrq->error) {
+		dbg("send_command: unknown error");
+		return msb_exit_state_machine(msb, mrq->error);
+	}
+again:
+	switch (msb->state) {
+
+	/* HACK: see h_msb_write_block */
+	case MSB_SC_SEND_WRITE_PARAMS: /* write param register*/
+		if (!msb_write_regs(msb,
+			offsetof(struct ms_register, param),
+			sizeof(struct ms_param_register),
+			&msb->regs.param))
+			return 0;
+		msb->state = MSB_SC_SEND_WRITE_OOB;
+		return 0;
+
+	case MSB_SC_SEND_WRITE_OOB:
+		if (!msb->command_need_oob) {
+			msb->state = MSB_SC_SEND_COMMAND;
+			goto again;
+		}
+
+		if (!msb_write_regs(msb,
+			offsetof(struct ms_register, extra_data),
+			sizeof(struct ms_extra_data_register),
+			&msb->regs.extra_data))
+			return 0;
+
+		msb->state = MSB_SC_SEND_COMMAND;
+		return 0;
+
+	case MSB_SC_SEND_COMMAND:
+		memstick_init_req(mrq, MS_TPC_SET_CMD, &msb->command_value, 1);
+		msb->state = MSB_SC_SEND_INT_REQ;
+		return 0;
+
+	case MSB_SC_SEND_INT_REQ:
+		msb->state = MSB_SC_RECEIVE_INT_REQ;
+		if (msb_read_int_reg(msb, -1))
+			return 0;
+		/* fallthrough */
+
+	case MSB_SC_RECEIVE_INT_REQ:
+		intreg = mrq->data[0];
+
+		if (intreg & MEMSTICK_INT_CMDNAK)
+			return msb_exit_state_machine(msb, -EIO);
+		if (intreg & MEMSTICK_INT_ERR)
+			return msb_exit_state_machine(msb, -EBADMSG);
+
+		if (!(intreg & MEMSTICK_INT_CED)) {
+			msb->state = MSB_SC_SEND_INT_REQ;
+			goto again;
+		}
+
+		return msb_exit_state_machine(msb, 0);
+	}
+
+	BUG();
+}
+
+/* Small handler for card reset */
+static int h_msb_reset(struct memstick_dev *card,
+					struct memstick_request **out_mrq)
+{
+	u8 command = MS_CMD_RESET;
+	struct msb_data *msb = memstick_get_drvdata(card);
+	struct memstick_request *mrq = *out_mrq = &card->current_mrq;
+
+	if (mrq->error)
+		return msb_exit_state_machine(msb, mrq->error);
+
+	switch (msb->state) {
+	case MSB_RS_SEND:
+		memstick_init_req(mrq, MS_TPC_SET_CMD, &command, 1);
+		mrq->need_card_int = 0;
+		msb->state = MSB_RS_CONFIRM;
+		return 0;
+	case MSB_RS_CONFIRM:
+		return msb_exit_state_machine(msb, 0);
+	}
+	BUG();
+}
+
+/* This handler is used to do the serial -> parallel switch */
+static int h_msb_parallel_switch(struct memstick_dev *card,
+					struct memstick_request **out_mrq)
+{
+	struct msb_data *msb = memstick_get_drvdata(card);
+	struct memstick_request *mrq = *out_mrq = &card->current_mrq;
+	struct memstick_host *host = card->host;
+
+	if (mrq->error) {
+		dbg("parallel_switch: error");
+		msb->regs.param.system &= ~MEMSTICK_SYS_PAM;
+		return msb_exit_state_machine(msb, mrq->error);
+	}
+
+	switch (msb->state) {
+	case MSB_PS_SEND_SWITCH_COMMAND:
+		/* Set the parallel interface on memstick side */
+		msb->regs.param.system |= MEMSTICK_SYS_PAM;
+
+		if (!msb_write_regs(msb,
+			offsetof(struct ms_register, param),
+			1,
+			(unsigned char *)&msb->regs.param))
+			return 0;
+
+		msb->state = MSB_PS_SWICH_HOST;
+		return 0;
+
+	case MSB_PS_SWICH_HOST:
+		/* Set the parallel interface on our side + send a dummy
+		   request to see if the card responds */
+		host->set_param(host, MEMSTICK_INTERFACE, MEMSTICK_PAR4);
+		memstick_init_req(mrq, MS_TPC_GET_INT, NULL, 1);
+		msb->state = MSB_PS_CONFIRM;
+		return 0;
+
+	case MSB_PS_CONFIRM:
+		return msb_exit_state_machine(msb, 0);
+	}
+
+	BUG();
+}
+
+static int msb_switch_to_parallel(struct msb_data *msb);
+
+/* Reset the card, to guard against hw errors being treated as bad blocks */
+static int msb_reset(struct msb_data *msb, bool full)
+{
+
+	bool was_parallel = msb->regs.param.system & MEMSTICK_SYS_PAM;
+	struct memstick_dev *card = msb->card;
+	struct memstick_host *host = card->host;
+	int error;
+
+	/* Reset the card */
+	msb->regs.param.system = MEMSTICK_SYS_BAMD;
+
+	if (full) {
+		error = host->set_param(host,
+					MEMSTICK_POWER, MEMSTICK_POWER_OFF);
+		if (error)
+			goto out_error;
+
+		msb_invalidate_reg_window(msb);
+
+		error = host->set_param(host,
+					MEMSTICK_POWER, MEMSTICK_POWER_ON);
+		if (error)
+			goto out_error;
+
+		error = host->set_param(host,
+					MEMSTICK_INTERFACE, MEMSTICK_SERIAL);
+		if (error) {
+out_error:
+			dbg("Failed to reset the host controller");
+			msb->read_only = true;
+			return -EFAULT;
+		}
+	}
+
+	error = msb_run_state_machine(msb, h_msb_reset);
+	if (error) {
+		dbg("Failed to reset the card");
+		msb->read_only = true;
+		return -ENODEV;
+	}
+
+	/* Set parallel mode */
+	if (was_parallel)
+		msb_switch_to_parallel(msb);
+	return 0;
+}
+
+/* Attempts to switch interface to parallel mode */
+static int msb_switch_to_parallel(struct msb_data *msb)
+{
+	int error;
+
+	error = msb_run_state_machine(msb, h_msb_parallel_switch);
+	if (error) {
+		pr_err("Switch to parallel failed");
+		msb->regs.param.system &= ~MEMSTICK_SYS_PAM;
+		msb_reset(msb, true);
+		return -EFAULT;
+	}
+
+	msb->caps |= MEMSTICK_CAP_AUTO_GET_INT;
+	return 0;
+}
+
+/* Changes overwrite flag on a page */
+static int msb_set_overwrite_flag(struct msb_data *msb,
+						u16 pba, u8 page, u8 flag)
+{
+	if (msb->read_only)
+		return -EROFS;
+
+	msb->regs.param.block_address = cpu_to_be16(pba);
+	msb->regs.param.page_address = page;
+	msb->regs.param.cp = MEMSTICK_CP_OVERWRITE;
+	msb->regs.extra_data.overwrite_flag = flag;
+	msb->command_value = MS_CMD_BLOCK_WRITE;
+	msb->command_need_oob = true;
+
+	dbg_verbose("changing overwrite flag to %02x for sector %d, page %d",
+							flag, pba, page);
+	return msb_run_state_machine(msb, h_msb_send_command);
+}
+
+static int msb_mark_bad(struct msb_data *msb, int pba)
+{
+	pr_notice("marking pba %d as bad", pba);
+	msb_reset(msb, true);
+	return msb_set_overwrite_flag(
+			msb, pba, 0, 0xFF & ~MEMSTICK_OVERWRITE_BKST);
+}
+
+static int msb_mark_page_bad(struct msb_data *msb, int pba, int page)
+{
+	dbg("marking page %d of pba %d as bad", page, pba);
+	msb_reset(msb, true);
+	return msb_set_overwrite_flag(msb,
+		pba, page, ~MEMSTICK_OVERWRITE_PGST0);
+}
+
+/* Erases one physical block */
+static int msb_erase_block(struct msb_data *msb, u16 pba)
+{
+	int error, try;
+
+	if (msb->read_only)
+		return -EROFS;
+
+	dbg_verbose("erasing pba %d", pba);
+
+	for (try = 1; try < 3; try++) {
+		msb->regs.param.block_address = cpu_to_be16(pba);
+		msb->regs.param.page_address = 0;
+		msb->regs.param.cp = MEMSTICK_CP_BLOCK;
+		msb->command_value = MS_CMD_BLOCK_ERASE;
+		msb->command_need_oob = false;
+
+
+		error = msb_run_state_machine(msb, h_msb_send_command);
+		if (!error || msb_reset(msb, true))
+			break;
+	}
+
+	if (error) {
+		pr_err("erase failed, marking pba %d as bad", pba);
+		msb_mark_bad(msb, pba);
+	}
+
+	dbg_verbose("erase success, marking pba %d as unused", pba);
+	msb_mark_block_unused(msb, pba);
+	__set_bit(pba, msb->erased_blocks_bitmap);
+	return error;
+}
+
+/* Reads one page from the device */
+static int msb_read_page(struct msb_data *msb,
+	u16 pba, u8 page, struct ms_extra_data_register *extra,
+					struct scatterlist *sg,  int offset)
+{
+	int try, error;
+
+	if (pba == MS_BLOCK_INVALID) {
+		unsigned long flags;
+		struct sg_mapping_iter miter;
+		size_t len = msb->page_size;
+
returning 0xFF"); + + local_irq_save(flags); + sg_miter_start(&miter, sg, sg_nents(sg), + SG_MITER_ATOMIC | SG_MITER_TO_SG); + + while (sg_miter_next(&miter) && len > 0) { + + int chunklen; + + if (offset && offset >= miter.length) { + offset -= miter.length; + continue; + } + + chunklen = min(miter.length - offset, len); + memset(miter.addr + offset, 0xFF, chunklen); + len -= chunklen; + offset = 0; + } + + sg_miter_stop(&miter); + local_irq_restore(flags); + + if (offset) + return -EFAULT; + + if (extra) + memset(extra, 0xFF, sizeof(*extra)); + return 0; + } + + if (pba >= msb->block_count) { + pr_err("BUG: attempt to read beyond the end of the card at pba %d", pba); + return -EINVAL; + } + + for (try = 1; try < 3; try++) { + msb->regs.param.block_address = cpu_to_be16(pba); + msb->regs.param.page_address = page; + msb->regs.param.cp = MEMSTICK_CP_PAGE; + + msb->current_sg = sg; + msb->current_sg_offset = offset; + error = msb_run_state_machine(msb, h_msb_read_page); + + + if (error == -EUCLEAN) { + pr_notice("correctable error on pba %d, page %d", + pba, page); + error = 0; + } + + if (!error && extra) + *extra = msb->regs.extra_data; + + if (!error || msb_reset(msb, true)) + break; + + } + + /* Mark bad pages */ + if (error == -EBADMSG) { + pr_err("uncorrectable error on read of pba %d, page %d", + pba, page); + + if (msb->regs.extra_data.overwrite_flag & + MEMSTICK_OVERWRITE_PGST0) + msb_mark_page_bad(msb, pba, page); + return -EBADMSG; + } + + if (error) + pr_err("read of pba %d, page %d failed with error %d", + pba, page, error); + return error; +} + +/* Reads oob of page only */ +static int msb_read_oob(struct msb_data *msb, u16 pba, u16 page, + struct ms_extra_data_register *extra) +{ + int error; + + BUG_ON(!extra); + msb->regs.param.block_address = cpu_to_be16(pba); + msb->regs.param.page_address = page; + msb->regs.param.cp = MEMSTICK_CP_EXTRA; + + if (pba > msb->block_count) { + pr_err("BUG: attempt to read beyond the end of card at pba %d", pba); + return -EINVAL; + } + + error = msb_run_state_machine(msb, h_msb_read_page); + *extra = msb->regs.extra_data; + + if (error == -EUCLEAN) { + pr_notice("correctable error on pba %d, page %d", + pba, page); + return 0; + } + + return error; +} + +/* Reads a block and compares it with data contained in scatterlist orig_sg */ +static int msb_verify_block(struct msb_data *msb, u16 pba, + struct scatterlist *orig_sg, int offset) +{ + struct scatterlist sg; + int page = 0, error; + + sg_init_one(&sg, msb->block_buffer, msb->block_size); + + while (page < msb->pages_in_block) { + + error = msb_read_page(msb, pba, page, + NULL, &sg, page * msb->page_size); + if (error) + return error; + page++; + } + + if (msb_sg_compare_to_buffer(orig_sg, offset, + msb->block_buffer, msb->block_size)) + return -EIO; + return 0; +} + +/* Writes exectly one block + oob */ +static int msb_write_block(struct msb_data *msb, + u16 pba, u32 lba, struct scatterlist *sg, int offset) +{ + int error, current_try = 1; + BUG_ON(sg->length < msb->page_size); + + if (msb->read_only) + return -EROFS; + + if (pba == MS_BLOCK_INVALID) { + pr_err( + "BUG: write: attempt to write MS_BLOCK_INVALID block"); + return -EINVAL; + } + + if (pba >= msb->block_count || lba >= msb->logical_block_count) { + pr_err( + "BUG: write: attempt to write beyond the end of device"); + return -EINVAL; + } + + if (msb_get_zone_from_lba(lba) != msb_get_zone_from_pba(pba)) { + pr_err("BUG: write: lba zone mismatch"); + return -EINVAL; + } + + if (pba == msb->boot_block_locations[0] || + pba == 
+		pba == msb->boot_block_locations[1]) {
+		pr_err("BUG: write: attempt to write to boot blocks!");
+		return -EINVAL;
+	}
+
+	while (1) {
+
+		if (msb->read_only)
+			return -EROFS;
+
+		msb->regs.param.cp = MEMSTICK_CP_BLOCK;
+		msb->regs.param.page_address = 0;
+		msb->regs.param.block_address = cpu_to_be16(pba);
+
+		msb->regs.extra_data.management_flag = 0xFF;
+		msb->regs.extra_data.overwrite_flag = 0xF8;
+		msb->regs.extra_data.logical_address = cpu_to_be16(lba);
+
+		msb->current_sg = sg;
+		msb->current_sg_offset = offset;
+		msb->current_page = 0;
+
+		error = msb_run_state_machine(msb, h_msb_write_block);
+
+		/* The block we just wrote to is assumed erased, since its
+		   pba was erased. If it wasn't actually erased, the write
+		   still succeeds and merely clears the bits that were
+		   already set in the block, so verify that what we have
+		   written matches what we expect.
+		   We do trust the blocks that we erased ourselves */
+		if (!error && (verify_writes ||
+				!test_bit(pba, msb->erased_blocks_bitmap)))
+			error = msb_verify_block(msb, pba, sg, offset);
+
+		if (!error)
+			break;
+
+		if (current_try > 1 || msb_reset(msb, true))
+			break;
+
+		pr_err("write failed, trying to erase the pba %d", pba);
+		error = msb_erase_block(msb, pba);
+		if (error)
+			break;
+
+		current_try++;
+	}
+	return error;
+}
+
+/* Finds a free block for write replacement */
+static u16 msb_get_free_block(struct msb_data *msb, int zone)
+{
+	u16 pos;
+	int pba = zone * MS_BLOCKS_IN_ZONE;
+	int i;
+
+	get_random_bytes(&pos, sizeof(pos));
+
+	if (!msb->free_block_count[zone]) {
+		pr_err("NO free blocks in zone %d to use for a write (media is WORN out), switching to RO mode", zone);
+		msb->read_only = true;
+		return MS_BLOCK_INVALID;
+	}
+
+	pos %= msb->free_block_count[zone];
+
+	dbg_verbose("have %d choices for a free block, selected randomly: %d",
+		msb->free_block_count[zone], pos);
+
+	pba = find_next_zero_bit(msb->used_blocks_bitmap,
+			msb->block_count, pba);
+	for (i = 0; i < pos; ++i)
+		pba = find_next_zero_bit(msb->used_blocks_bitmap,
+			msb->block_count, pba + 1);
+
+	dbg_verbose("result of the free blocks scan: pba %d", pba);
+
+	if (pba == msb->block_count || (msb_get_zone_from_pba(pba)) != zone) {
+		pr_err("BUG: can't get a free block");
+		msb->read_only = true;
+		return MS_BLOCK_INVALID;
+	}
+
+	msb_mark_block_used(msb, pba);
+	return pba;
+}
+
+static int msb_update_block(struct msb_data *msb, u16 lba,
+	struct scatterlist *sg, int offset)
+{
+	u16 pba, new_pba;
+	int error, try;
+
+	pba = msb->lba_to_pba_table[lba];
+	dbg_verbose("start of a block update at lba  %d, pba %d", lba, pba);
+
+	if (pba != MS_BLOCK_INVALID) {
+		dbg_verbose("setting the update flag on the block");
+		msb_set_overwrite_flag(msb, pba, 0,
+				0xFF & ~MEMSTICK_OVERWRITE_UDST);
+	}
+
+	for (try = 0; try < 3; try++) {
+		new_pba = msb_get_free_block(msb,
+			msb_get_zone_from_lba(lba));
+
+		if (new_pba == MS_BLOCK_INVALID) {
+			error = -EIO;
+			goto out;
+		}
+
+		dbg_verbose("block update: writing updated block to pba %d",
+								new_pba);
+		error = msb_write_block(msb, new_pba, lba, sg, offset);
+		if (error == -EBADMSG) {
+			msb_mark_bad(msb, new_pba);
+			continue;
+		}
+
+		if (error)
+			goto out;
+
+		dbg_verbose("block update: erasing the old block");
+		msb_erase_block(msb, pba);
+		msb->lba_to_pba_table[lba] = new_pba;
+		return 0;
+	}
+out:
+	if (error) {
+		pr_err("block update error after %d tries,  switching to r/o mode", try);
+		msb->read_only = true;
+	}
+	return error;
+}
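+
+/*
+ * msb_update_block() above is a classic out-of-place flash update:
+ * program the update flag (UDST) on the old block, write the data into
+ * a freshly picked free block, then erase the old block and repoint
+ * lba_to_pba_table. If power is lost mid-update, the mount-time scan
+ * (msb_ftl_scan) uses the UDST bits to pick one of the two copies.
+ */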
+
+/* Converts endianness in the boot block for easy use */
+static void msb_fix_boot_page_endianness(struct ms_boot_page *p)
+{
+	p->header.block_id = be16_to_cpu(p->header.block_id);
+	p->header.format_reserved = be16_to_cpu(p->header.format_reserved);
+	p->entry.disabled_block.start_addr
+		= be32_to_cpu(p->entry.disabled_block.start_addr);
+	p->entry.disabled_block.data_size
+		= be32_to_cpu(p->entry.disabled_block.data_size);
+	p->entry.cis_idi.start_addr
+		= be32_to_cpu(p->entry.cis_idi.start_addr);
+	p->entry.cis_idi.data_size
+		= be32_to_cpu(p->entry.cis_idi.data_size);
+	p->attr.block_size = be16_to_cpu(p->attr.block_size);
+	p->attr.number_of_blocks = be16_to_cpu(p->attr.number_of_blocks);
+	p->attr.number_of_effective_blocks
+		= be16_to_cpu(p->attr.number_of_effective_blocks);
+	p->attr.page_size = be16_to_cpu(p->attr.page_size);
+	p->attr.memory_manufacturer_code
+		= be16_to_cpu(p->attr.memory_manufacturer_code);
+	p->attr.memory_device_code = be16_to_cpu(p->attr.memory_device_code);
+	p->attr.implemented_capacity
+		= be16_to_cpu(p->attr.implemented_capacity);
+	p->attr.controller_number = be16_to_cpu(p->attr.controller_number);
+	p->attr.controller_function = be16_to_cpu(p->attr.controller_function);
+}
+
+static int msb_read_boot_blocks(struct msb_data *msb)
+{
+	int pba = 0;
+	struct scatterlist sg;
+	struct ms_extra_data_register extra;
+	struct ms_boot_page *page;
+
+	msb->boot_block_locations[0] = MS_BLOCK_INVALID;
+	msb->boot_block_locations[1] = MS_BLOCK_INVALID;
+	msb->boot_block_count = 0;
+
+	dbg_verbose("Start of a scan for the boot blocks");
+
+	if (!msb->boot_page) {
+		page = kmalloc(sizeof(struct ms_boot_page)*2, GFP_KERNEL);
+		if (!page)
+			return -ENOMEM;
+
+		msb->boot_page = page;
+	} else
+		page = msb->boot_page;
+
+	msb->block_count = MS_BLOCK_MAX_BOOT_ADDR;
+
+	for (pba = 0; pba < MS_BLOCK_MAX_BOOT_ADDR; pba++) {
+
+		sg_init_one(&sg, page, sizeof(*page));
+		if (msb_read_page(msb, pba, 0, &extra, &sg, 0)) {
+			dbg("boot scan: can't read pba %d", pba);
+			continue;
+		}
+
+		if (extra.management_flag & MEMSTICK_MANAGEMENT_SYSFLG) {
+			dbg("management flag doesn't indicate boot block %d",
+									pba);
+			continue;
+		}
+
+		if (be16_to_cpu(page->header.block_id) != MS_BLOCK_BOOT_ID) {
+			dbg("the pba at %d doesn't contain boot block ID", pba);
+			continue;
+		}
+
+		msb_fix_boot_page_endianness(page);
+		msb->boot_block_locations[msb->boot_block_count] = pba;
+
+		page++;
+		msb->boot_block_count++;
+
+		if (msb->boot_block_count == 2)
+			break;
+	}
+
+	if (!msb->boot_block_count) {
+		pr_err("media doesn't contain master page, aborting");
+		return -EIO;
+	}
+
+	dbg_verbose("End of scan for boot blocks");
+	return 0;
+}
+
+static int msb_read_bad_block_table(struct msb_data *msb, int block_nr)
+{
+	struct ms_boot_page *boot_block;
+	struct scatterlist sg;
+	u16 *buffer = NULL;
+	int offset = 0;
+	int i, error = 0;
+	int data_size, data_offset, page, page_offset, size_to_read;
+	u16 pba;
+
+	BUG_ON(block_nr > 1);
+	boot_block = &msb->boot_page[block_nr];
+	pba = msb->boot_block_locations[block_nr];
+
+	if (msb->boot_block_locations[block_nr] == MS_BLOCK_INVALID)
+		return -EINVAL;
+
+	data_size = boot_block->entry.disabled_block.data_size;
+	data_offset = sizeof(struct ms_boot_page) +
+			boot_block->entry.disabled_block.start_addr;
+	if (!data_size)
+		return 0;
+
+	page = data_offset / msb->page_size;
+	page_offset = data_offset % msb->page_size;
+	size_to_read =
+		DIV_ROUND_UP(data_size + page_offset, msb->page_size) *
+			msb->page_size;
+
+	dbg("reading bad block of boot block at pba %d, offset %d len %d",
+		pba, data_offset, data_size);
+
+	buffer = kzalloc(size_to_read, GFP_KERNEL);
-ENOMEM; + + /* Read the buffer */ + sg_init_one(&sg, buffer, size_to_read); + + while (offset < size_to_read) { + error = msb_read_page(msb, pba, page, NULL, &sg, offset); + if (error) + goto out; + + page++; + offset += msb->page_size; + + if (page == msb->pages_in_block) { + pr_err( + "bad block table extends beyond the boot block"); + break; + } + } + + /* Process the bad block table */ + for (i = page_offset; i < data_size / sizeof(u16); i++) { + + u16 bad_block = be16_to_cpu(buffer[i]); + + if (bad_block >= msb->block_count) { + dbg("bad block table contains invalid block %d", + bad_block); + continue; + } + + if (test_bit(bad_block, msb->used_blocks_bitmap)) { + dbg("duplicate bad block %d in the table", + bad_block); + continue; + } + + dbg("block %d is marked as factory bad", bad_block); + msb_mark_block_used(msb, bad_block); + } +out: + kfree(buffer); + return error; +} + +static int msb_ftl_initialize(struct msb_data *msb) +{ + int i; + + if (msb->ftl_initialized) + return 0; + + msb->zone_count = msb->block_count / MS_BLOCKS_IN_ZONE; + msb->logical_block_count = msb->zone_count * 496 - 2; + + msb->used_blocks_bitmap = kzalloc(msb->block_count / 8, GFP_KERNEL); + msb->erased_blocks_bitmap = kzalloc(msb->block_count / 8, GFP_KERNEL); + msb->lba_to_pba_table = + kmalloc(msb->logical_block_count * sizeof(u16), GFP_KERNEL); + + if (!msb->used_blocks_bitmap || !msb->lba_to_pba_table || + !msb->erased_blocks_bitmap) { + kfree(msb->used_blocks_bitmap); + kfree(msb->lba_to_pba_table); + kfree(msb->erased_blocks_bitmap); + return -ENOMEM; + } + + for (i = 0; i < msb->zone_count; i++) + msb->free_block_count[i] = MS_BLOCKS_IN_ZONE; + + memset(msb->lba_to_pba_table, MS_BLOCK_INVALID, + msb->logical_block_count * sizeof(u16)); + + dbg("initial FTL tables created. 
Zone count = %d, Logical block count = %d",
+			msb->zone_count, msb->logical_block_count);
+
+	msb->ftl_initialized = true;
+	return 0;
+}
+
+static int msb_ftl_scan(struct msb_data *msb)
+{
+	u16 pba, lba, other_block;
+	u8 overwrite_flag, management_flag, other_overwrite_flag;
+	int error;
+	struct ms_extra_data_register extra;
+	u8 *overwrite_flags = kzalloc(msb->block_count, GFP_KERNEL);
+
+	if (!overwrite_flags)
+		return -ENOMEM;
+
+	dbg("Start of media scanning");
+	for (pba = 0; pba < msb->block_count; pba++) {
+
+		if (pba == msb->boot_block_locations[0] ||
+			pba == msb->boot_block_locations[1]) {
+			dbg_verbose("pba %05d -> [boot block]", pba);
+			msb_mark_block_used(msb, pba);
+			continue;
+		}
+
+		if (test_bit(pba, msb->used_blocks_bitmap)) {
+			dbg_verbose("pba %05d -> [factory bad]", pba);
+			continue;
+		}
+
+		memset(&extra, 0, sizeof(extra));
+		error = msb_read_oob(msb, pba, 0, &extra);
+
+		/* can't trust the page if we can't read the oob */
+		if (error == -EBADMSG) {
+			pr_notice(
+			"oob of pba %d damaged, will try to erase it", pba);
+			msb_mark_block_used(msb, pba);
+			msb_erase_block(msb, pba);
+			continue;
+		} else if (error) {
+			pr_err("unknown error %d on read of oob of pba %d - aborting",
+				error, pba);
+
+			kfree(overwrite_flags);
+			return error;
+		}
+
+		lba = be16_to_cpu(extra.logical_address);
+		management_flag = extra.management_flag;
+		overwrite_flag = extra.overwrite_flag;
+		overwrite_flags[pba] = overwrite_flag;
+
+		/* Skip bad blocks */
+		if (!(overwrite_flag & MEMSTICK_OVERWRITE_BKST)) {
+			dbg("pba %05d -> [BAD]", pba);
+			msb_mark_block_used(msb, pba);
+			continue;
+		}
+
+		/* Skip system/drm blocks */
+		if ((management_flag & MEMSTICK_MANAGMENT_FLAG_NORMAL) !=
+			MEMSTICK_MANAGMENT_FLAG_NORMAL) {
+			dbg("pba %05d -> [reserved management flag %02x]",
+						pba, management_flag);
+			msb_mark_block_used(msb, pba);
+			continue;
+		}
+
+		/* Erase temporary tables */
+		if (!(management_flag & MEMSTICK_MANAGEMENT_ATFLG)) {
+			dbg("pba %05d -> [temp table] - will erase", pba);
+
+			msb_mark_block_used(msb, pba);
+			msb_erase_block(msb, pba);
+			continue;
+		}
+
+		if (lba == MS_BLOCK_INVALID) {
+			dbg_verbose("pba %05d -> [free]", pba);
+			continue;
+		}
+
+		msb_mark_block_used(msb, pba);
+
+		/* Block has an LBA that doesn't match its zone */
+		if (msb_get_zone_from_lba(lba) != msb_get_zone_from_pba(pba)) {
+			pr_notice("pba %05d -> [bad lba %05d] - will erase",
+								pba, lba);
+			msb_erase_block(msb, pba);
+			continue;
+		}
+
+		/* No collisions - great */
+		if (msb->lba_to_pba_table[lba] == MS_BLOCK_INVALID) {
+			dbg_verbose("pba %05d -> [lba %05d]", pba, lba);
+			msb->lba_to_pba_table[lba] = pba;
+			continue;
+		}
+
+		other_block = msb->lba_to_pba_table[lba];
+		other_overwrite_flag = overwrite_flags[other_block];
+
+		pr_notice("Collision between pba %d and pba %d",
+			pba, other_block);
+
+		if (!(overwrite_flag & MEMSTICK_OVERWRITE_UDST)) {
+			pr_notice("pba %d is marked as stable, use it", pba);
+			msb_erase_block(msb, other_block);
+			msb->lba_to_pba_table[lba] = pba;
+			continue;
+		}
+
+		if (!(other_overwrite_flag & MEMSTICK_OVERWRITE_UDST)) {
+			pr_notice("pba %d is marked as stable, use it",
+								other_block);
+			msb_erase_block(msb, pba);
+			continue;
+		}
+
+		pr_notice("collision between blocks %d and %d, with the stable flag set on neither, erasing pba %d",
+				pba, other_block, other_block);
+
+		msb_erase_block(msb, other_block);
+		msb->lba_to_pba_table[lba] = pba;
+	}
+
+	dbg("End of media scanning");
+	kfree(overwrite_flags);
+	return 0;
+}
+
+static void msb_cache_flush_timer(unsigned long data)
+{
+	struct msb_data *msb =
(struct msb_data *)data; + msb->need_flush_cache = true; + queue_work(msb->io_queue, &msb->io_work); +} + + +static void msb_cache_discard(struct msb_data *msb) +{ + if (msb->cache_block_lba == MS_BLOCK_INVALID) + return; + + del_timer_sync(&msb->cache_flush_timer); + + dbg_verbose("Discarding the write cache"); + msb->cache_block_lba = MS_BLOCK_INVALID; + bitmap_zero(&msb->valid_cache_bitmap, msb->pages_in_block); +} + +static int msb_cache_init(struct msb_data *msb) +{ + setup_timer(&msb->cache_flush_timer, msb_cache_flush_timer, + (unsigned long)msb); + + if (!msb->cache) + msb->cache = kzalloc(msb->block_size, GFP_KERNEL); + if (!msb->cache) + return -ENOMEM; + + msb_cache_discard(msb); + return 0; +} + +static int msb_cache_flush(struct msb_data *msb) +{ + struct scatterlist sg; + struct ms_extra_data_register extra; + int page, offset, error; + u16 pba, lba; + + if (msb->read_only) + return -EROFS; + + if (msb->cache_block_lba == MS_BLOCK_INVALID) + return 0; + + lba = msb->cache_block_lba; + pba = msb->lba_to_pba_table[lba]; + + dbg_verbose("Flushing the write cache of pba %d (LBA %d)", + pba, msb->cache_block_lba); + + sg_init_one(&sg, msb->cache , msb->block_size); + + /* Read all missing pages in cache */ + for (page = 0; page < msb->pages_in_block; page++) { + + if (test_bit(page, &msb->valid_cache_bitmap)) + continue; + + offset = page * msb->page_size; + + dbg_verbose("reading non-present sector %d of cache block %d", + page, lba); + error = msb_read_page(msb, pba, page, &extra, &sg, offset); + + /* Bad pages are copied with 00 page status */ + if (error == -EBADMSG) { + pr_err("read error on sector %d, contents probably damaged", page); + continue; + } + + if (error) + return error; + + if ((extra.overwrite_flag & MEMSTICK_OV_PG_NORMAL) != + MEMSTICK_OV_PG_NORMAL) { + dbg("page %d is marked as bad", page); + continue; + } + + set_bit(page, &msb->valid_cache_bitmap); + } + + /* Write the cache now */ + error = msb_update_block(msb, msb->cache_block_lba, &sg, 0); + pba = msb->lba_to_pba_table[msb->cache_block_lba]; + + /* Mark invalid pages */ + if (!error) { + for (page = 0; page < msb->pages_in_block; page++) { + + if (test_bit(page, &msb->valid_cache_bitmap)) + continue; + + dbg("marking page %d as containing damaged data", + page); + msb_set_overwrite_flag(msb, + pba , page, 0xFF & ~MEMSTICK_OV_PG_NORMAL); + } + } + + msb_cache_discard(msb); + return error; +} + +static int msb_cache_write(struct msb_data *msb, int lba, + int page, bool add_to_cache_only, struct scatterlist *sg, int offset) +{ + int error; + struct scatterlist sg_tmp[10]; + + if (msb->read_only) + return -EROFS; + + if (msb->cache_block_lba == MS_BLOCK_INVALID || + lba != msb->cache_block_lba) + if (add_to_cache_only) + return 0; + + /* If we need to write different block */ + if (msb->cache_block_lba != MS_BLOCK_INVALID && + lba != msb->cache_block_lba) { + dbg_verbose("first flush the cache"); + error = msb_cache_flush(msb); + if (error) + return error; + } + + if (msb->cache_block_lba == MS_BLOCK_INVALID) { + msb->cache_block_lba = lba; + mod_timer(&msb->cache_flush_timer, + jiffies + msecs_to_jiffies(cache_flush_timeout)); + } + + dbg_verbose("Write of LBA %d page %d to cache ", lba, page); + + sg_init_table(sg_tmp, ARRAY_SIZE(sg_tmp)); + msb_sg_copy(sg, sg_tmp, ARRAY_SIZE(sg_tmp), offset, msb->page_size); + + sg_copy_to_buffer(sg_tmp, sg_nents(sg_tmp), + msb->cache + page * msb->page_size, msb->page_size); + + set_bit(page, &msb->valid_cache_bitmap); + return 0; +} + +static int msb_cache_read(struct 
msb_data *msb, int lba,
+				int page, struct scatterlist *sg, int offset)
+{
+	int pba = msb->lba_to_pba_table[lba];
+	struct scatterlist sg_tmp[10];
+	int error = 0;
+
+	if (lba == msb->cache_block_lba &&
+	    test_bit(page, &msb->valid_cache_bitmap)) {
+
+		dbg_verbose("Read of LBA %d (pba %d) sector %d from cache",
+							lba, pba, page);
+
+		sg_init_table(sg_tmp, ARRAY_SIZE(sg_tmp));
+		msb_sg_copy(sg, sg_tmp, ARRAY_SIZE(sg_tmp),
+			offset, msb->page_size);
+		sg_copy_from_buffer(sg_tmp, sg_nents(sg_tmp),
+			msb->cache + msb->page_size * page,
+							msb->page_size);
+	} else {
+		dbg_verbose("Read of LBA %d (pba %d) sector %d from device",
+							lba, pba, page);
+
+		error = msb_read_page(msb, pba, page, NULL, sg, offset);
+		if (error)
+			return error;
+
+		msb_cache_write(msb, lba, page, true, sg, offset);
+	}
+	return error;
+}
+
+/* Emulated geometry table
+ * The exact contents of this table aren't that important,
+ * one could put different values here, provided that they still
+ * cover the whole disk.
+ * The 64 MB entry is what Windows reports for my 64M memstick */
+
+static const struct chs_entry chs_table[] = {
+/*        size sectors cylinders heads */
+	{ 4,    16,    247,       2  },
+	{ 8,    16,    495,       2  },
+	{ 16,   16,    495,       4  },
+	{ 32,   16,    991,       4  },
+	{ 64,   16,    991,       8  },
+	{128,   16,    991,       16 },
+	{ 0 }
+};
+
+/* Load information about the card */
+static int msb_init_card(struct memstick_dev *card)
+{
+	struct msb_data *msb = memstick_get_drvdata(card);
+	struct memstick_host *host = card->host;
+	struct ms_boot_page *boot_block;
+	int error = 0, i, raw_size_in_megs;
+
+	msb->caps = 0;
+
+	if (card->id.class >= MEMSTICK_CLASS_ROM &&
+				card->id.class <= MEMSTICK_CLASS_ROM)
+		msb->read_only = true;
+
+	msb->state = -1;
+	error = msb_reset(msb, false);
+	if (error)
+		return error;
+
+	/* Due to a bug in the Jmicron driver written by Alex Dubov,
+	   its serial mode barely works,
+	   so we switch to parallel mode right away */
+	if (host->caps & MEMSTICK_CAP_PAR4)
+		msb_switch_to_parallel(msb);
+
+	msb->page_size = sizeof(struct ms_boot_page);
+
+	/* Read the boot page */
+	error = msb_read_boot_blocks(msb);
+	if (error)
+		return -EIO;
+
+	boot_block = &msb->boot_page[0];
+
+	/* Save interesting attributes from boot page */
+	msb->block_count = boot_block->attr.number_of_blocks;
+	msb->page_size = boot_block->attr.page_size;
+
+	msb->pages_in_block = boot_block->attr.block_size * 2;
+	msb->block_size = msb->page_size * msb->pages_in_block;
+
+	if (msb->page_size > PAGE_SIZE) {
+		/* this isn't supported by Linux at all, anyway */
+		dbg("device page size %d isn't supported", msb->page_size);
+		return -EINVAL;
+	}
+
+	msb->block_buffer = kzalloc(msb->block_size, GFP_KERNEL);
+	if (!msb->block_buffer)
+		return -ENOMEM;
+
+	raw_size_in_megs = (msb->block_size * msb->block_count) >> 20;
+
+	for (i = 0; chs_table[i].size; i++) {
+
+		if (chs_table[i].size != raw_size_in_megs)
+			continue;
+
+		msb->geometry.cylinders = chs_table[i].cyl;
+		msb->geometry.heads = chs_table[i].head;
+		msb->geometry.sectors = chs_table[i].sec;
+		break;
+	}
+
+	if (boot_block->attr.transfer_supporting == 1)
+		msb->caps |= MEMSTICK_CAP_PAR4;
+
+	if (boot_block->attr.device_type & 0x03)
+		msb->read_only = true;
+
+	dbg("Total block count = %d", msb->block_count);
+	dbg("Each block consists of %d pages", msb->pages_in_block);
+	dbg("Page size = %d bytes", msb->page_size);
+	dbg("Parallel mode supported: %d", !!(msb->caps & MEMSTICK_CAP_PAR4));
+	dbg("Read only: %d", msb->read_only);
+
+#if 0
+	/* Now we can switch the interface */
+	if (host->caps & msb->caps & MEMSTICK_CAP_PAR4)
+		msb_switch_to_parallel(msb);
+#endif
+
+	error = msb_cache_init(msb);
+	if (error)
+		return error;
+
+	error = msb_ftl_initialize(msb);
+	if (error)
+		return error;
+
+
+	/* Read the bad block table */
+	error = msb_read_bad_block_table(msb, 0);
+
+	if (error && error != -ENOMEM) {
+		dbg("failed to read bad block table from primary boot block, trying from backup");
+		error = msb_read_bad_block_table(msb, 1);
+	}
+
+	if (error)
+		return error;
+
+	/* *drum roll* Scan the media */
+	error = msb_ftl_scan(msb);
+	if (error) {
+		pr_err("Scan of media failed");
+		return error;
+	}
+
+	return 0;
+
+}
+
+static int msb_do_write_request(struct msb_data *msb, int lba,
+	int page, struct scatterlist *sg, size_t len, int *successfully_written)
+{
+	int error = 0;
+	off_t offset = 0;
+	*successfully_written = 0;
+
+	while (offset < len) {
+		if (page == 0 && len - offset >= msb->block_size) {
+
+			if (msb->cache_block_lba == lba)
+				msb_cache_discard(msb);
+
+			dbg_verbose("Writing whole lba %d", lba);
+			error = msb_update_block(msb, lba, sg, offset);
+			if (error)
+				return error;
+
+			offset += msb->block_size;
+			*successfully_written += msb->block_size;
+			lba++;
+			continue;
+		}
+
+		error = msb_cache_write(msb, lba, page, false, sg, offset);
+		if (error)
+			return error;
+
+		offset += msb->page_size;
+		*successfully_written += msb->page_size;
+
+		page++;
+		if (page == msb->pages_in_block) {
+			page = 0;
+			lba++;
+		}
+	}
+	return 0;
+}
+
+static int msb_do_read_request(struct msb_data *msb, int lba,
+		int page, struct scatterlist *sg, int len, int *successfully_read)
+{
+	int error = 0;
+	int offset = 0;
+	*successfully_read = 0;
+
+	while (offset < len) {
+
+		error = msb_cache_read(msb, lba, page, sg, offset);
+		if (error)
+			return error;
+
+		offset += msb->page_size;
+		*successfully_read += msb->page_size;
+
+		page++;
+		if (page == msb->pages_in_block) {
+			page = 0;
+			lba++;
+		}
+	}
+	return 0;
+}
+
+static void msb_io_work(struct work_struct *work)
+{
+	struct msb_data *msb = container_of(work, struct msb_data, io_work);
+	int page, error, len;
+	sector_t lba;
+	unsigned long flags;
+	struct scatterlist *sg = msb->prealloc_sg;
+
+	dbg_verbose("IO: work started");
+
+	while (1) {
+		spin_lock_irqsave(&msb->q_lock, flags);
+
+		if (msb->need_flush_cache) {
+			msb->need_flush_cache = false;
+			spin_unlock_irqrestore(&msb->q_lock, flags);
+			msb_cache_flush(msb);
+			continue;
+		}
+
+		if (!msb->req) {
+			msb->req = blk_fetch_request(msb->queue);
+			if (!msb->req) {
+				dbg_verbose("IO: no more requests, exiting");
+				spin_unlock_irqrestore(&msb->q_lock, flags);
+				return;
+			}
+		}
+
+		spin_unlock_irqrestore(&msb->q_lock, flags);
+
+		/* If card was removed meanwhile */
+		if (!msb->req)
+			return;
+
+		/* process the request */
+		dbg_verbose("IO: processing new request");
+		blk_rq_map_sg(msb->queue, msb->req, sg);
+
+		lba = blk_rq_pos(msb->req);
+
+		sector_div(lba, msb->page_size / 512);
+		page = do_div(lba, msb->pages_in_block);
+
+		if (rq_data_dir(msb->req) == READ)
+			error = msb_do_read_request(msb, lba, page, sg,
+				blk_rq_bytes(msb->req), &len);
+		else
+			error = msb_do_write_request(msb, lba, page, sg,
+				blk_rq_bytes(msb->req), &len);
+
+		spin_lock_irqsave(&msb->q_lock, flags);
+
+		if (len)
+			if (!__blk_end_request(msb->req, 0, len))
+				msb->req = NULL;
+
+		if (error && msb->req) {
+			dbg_verbose("IO: ending one sector of the request with error");
+			if (!__blk_end_request(msb->req, error, msb->page_size))
+				msb->req = NULL;
+		}
+
+		if (msb->req)
+			dbg_verbose("IO: request still pending");
+
+		spin_unlock_irqrestore(&msb->q_lock, flags);
+	}
+}
+
+static DEFINE_IDR(msb_disk_idr); /* set of used disk numbers */
+static DEFINE_MUTEX(msb_disk_lock); /* protects against races in open/release */
+
+static int msb_bd_open(struct block_device *bdev, fmode_t mode)
+{
+	struct gendisk *disk = bdev->bd_disk;
+	struct msb_data *msb = disk->private_data;
+
+	dbg_verbose("block device open");
+
+	mutex_lock(&msb_disk_lock);
+
+	if (msb && msb->card)
+		msb->usage_count++;
+
+	mutex_unlock(&msb_disk_lock);
+	return 0;
+}
+
+static void msb_data_clear(struct msb_data *msb)
+{
+	kfree(msb->boot_page);
+	kfree(msb->used_blocks_bitmap);
+	kfree(msb->lba_to_pba_table);
+	kfree(msb->cache);
+	msb->card = NULL;
+}
+
+static int msb_disk_release(struct gendisk *disk)
+{
+	struct msb_data *msb = disk->private_data;
+
+	dbg_verbose("block device release");
+	mutex_lock(&msb_disk_lock);
+
+	if (msb) {
+		if (msb->usage_count)
+			msb->usage_count--;
+
+		if (!msb->usage_count) {
+			disk->private_data = NULL;
+			idr_remove(&msb_disk_idr, msb->disk_id);
+			put_disk(disk);
+			kfree(msb);
+		}
+	}
+	mutex_unlock(&msb_disk_lock);
+	return 0;
+}
+
+static void msb_bd_release(struct gendisk *disk, fmode_t mode)
+{
+	msb_disk_release(disk);
+}
+
+static int msb_bd_getgeo(struct block_device *bdev,
+				struct hd_geometry *geo)
+{
+	struct msb_data *msb = bdev->bd_disk->private_data;
+	*geo = msb->geometry;
+	return 0;
+}
+
+static int msb_prepare_req(struct request_queue *q, struct request *req)
+{
+	if (req->cmd_type != REQ_TYPE_FS &&
+				req->cmd_type != REQ_TYPE_BLOCK_PC) {
+		blk_dump_rq_flags(req, "MS unsupported request");
+		return BLKPREP_KILL;
+	}
+	req->cmd_flags |= REQ_DONTPREP;
+	return BLKPREP_OK;
+}
+
+static void msb_submit_req(struct request_queue *q)
+{
+	struct memstick_dev *card = q->queuedata;
+	struct msb_data *msb = memstick_get_drvdata(card);
+	struct request *req = NULL;
+
+	dbg_verbose("Submit request");
+
+	if (msb->card_dead) {
+		dbg("Refusing requests on removed card");
+
+		WARN_ON(!msb->io_queue_stopped);
+
+		while ((req = blk_fetch_request(q)) != NULL)
+			__blk_end_request_all(req, -ENODEV);
+		return;
+	}
+
+	if (msb->req)
+		return;
+
+	if (!msb->io_queue_stopped)
+		queue_work(msb->io_queue, &msb->io_work);
+}
+
+static int msb_check_card(struct memstick_dev *card)
+{
+	struct msb_data *msb = memstick_get_drvdata(card);
+	return (msb->card_dead == 0);
+}
+
+static void msb_stop(struct memstick_dev *card)
+{
+	struct msb_data *msb = memstick_get_drvdata(card);
+	unsigned long flags;
+
+	dbg("Stopping all msblock IO");
+
+	spin_lock_irqsave(&msb->q_lock, flags);
+	blk_stop_queue(msb->queue);
+	msb->io_queue_stopped = true;
+	spin_unlock_irqrestore(&msb->q_lock, flags);
+
+	del_timer_sync(&msb->cache_flush_timer);
+	flush_workqueue(msb->io_queue);
+
+	if (msb->req) {
+		spin_lock_irqsave(&msb->q_lock, flags);
+		blk_requeue_request(msb->queue, msb->req);
+		msb->req = NULL;
+		spin_unlock_irqrestore(&msb->q_lock, flags);
+	}
+
+}
+
+static void msb_start(struct memstick_dev *card)
+{
+	struct msb_data *msb = memstick_get_drvdata(card);
+	unsigned long flags;
+
+	dbg("Resuming IO from msblock");
+
+	msb_invalidate_reg_window(msb);
+
+	spin_lock_irqsave(&msb->q_lock, flags);
+	if (!msb->io_queue_stopped || msb->card_dead) {
+		spin_unlock_irqrestore(&msb->q_lock, flags);
+		return;
+	}
+	spin_unlock_irqrestore(&msb->q_lock, flags);
+
+	/* Kick the cache flush anyway, it's harmless */
+	msb->need_flush_cache = true;
+	msb->io_queue_stopped = false;
+
+	spin_lock_irqsave(&msb->q_lock, flags);
+	blk_start_queue(msb->queue);
+	spin_unlock_irqrestore(&msb->q_lock,
flags); + + queue_work(msb->io_queue, &msb->io_work); + +} + +static const struct block_device_operations msb_bdops = { + .open = msb_bd_open, + .release = msb_bd_release, + .getgeo = msb_bd_getgeo, + .owner = THIS_MODULE +}; + +/* Registers the block device */ +static int msb_init_disk(struct memstick_dev *card) +{ + struct msb_data *msb = memstick_get_drvdata(card); + struct memstick_host *host = card->host; + int rc; + u64 limit = BLK_BOUNCE_HIGH; + unsigned long capacity; + + if (host->dev.dma_mask && *(host->dev.dma_mask)) + limit = *(host->dev.dma_mask); + + mutex_lock(&msb_disk_lock); + msb->disk_id = idr_alloc(&msb_disk_idr, card, 0, 256, GFP_KERNEL); + mutex_unlock(&msb_disk_lock); + + if (msb->disk_id < 0) + return msb->disk_id; + + msb->disk = alloc_disk(0); + if (!msb->disk) { + rc = -ENOMEM; + goto out_release_id; + } + + msb->queue = blk_init_queue(msb_submit_req, &msb->q_lock); + if (!msb->queue) { + rc = -ENOMEM; + goto out_put_disk; + } + + msb->queue->queuedata = card; + blk_queue_prep_rq(msb->queue, msb_prepare_req); + + blk_queue_bounce_limit(msb->queue, limit); + blk_queue_max_hw_sectors(msb->queue, MS_BLOCK_MAX_PAGES); + blk_queue_max_segments(msb->queue, MS_BLOCK_MAX_SEGS); + blk_queue_max_segment_size(msb->queue, + MS_BLOCK_MAX_PAGES * msb->page_size); + blk_queue_logical_block_size(msb->queue, msb->page_size); + + sprintf(msb->disk->disk_name, "msblk%d", msb->disk_id); + msb->disk->fops = &msb_bdops; + msb->disk->private_data = msb; + msb->disk->queue = msb->queue; + msb->disk->driverfs_dev = &card->dev; + msb->disk->flags |= GENHD_FL_EXT_DEVT; + + capacity = msb->pages_in_block * msb->logical_block_count; + capacity *= (msb->page_size / 512); + set_capacity(msb->disk, capacity); + dbg("Set total disk size to %lu sectors", capacity); + + msb->usage_count = 1; + msb->io_queue = alloc_ordered_workqueue("ms_block", WQ_MEM_RECLAIM); + INIT_WORK(&msb->io_work, msb_io_work); + sg_init_table(msb->prealloc_sg, MS_BLOCK_MAX_SEGS+1); + + if (msb->read_only) + set_disk_ro(msb->disk, 1); + + msb_start(card); + add_disk(msb->disk); + dbg("Disk added"); + return 0; + +out_put_disk: + put_disk(msb->disk); +out_release_id: + mutex_lock(&msb_disk_lock); + idr_remove(&msb_disk_idr, msb->disk_id); + mutex_unlock(&msb_disk_lock); + return rc; +} + +static int msb_probe(struct memstick_dev *card) +{ + struct msb_data *msb; + int rc = 0; + + msb = kzalloc(sizeof(struct msb_data), GFP_KERNEL); + if (!msb) + return -ENOMEM; + memstick_set_drvdata(card, msb); + msb->card = card; + spin_lock_init(&msb->q_lock); + + rc = msb_init_card(card); + if (rc) + goto out_free; + + rc = msb_init_disk(card); + if (!rc) { + card->check = msb_check_card; + card->stop = msb_stop; + card->start = msb_start; + return 0; + } +out_free: + memstick_set_drvdata(card, NULL); + msb_data_clear(msb); + kfree(msb); + return rc; +} + +static void msb_remove(struct memstick_dev *card) +{ + struct msb_data *msb = memstick_get_drvdata(card); + unsigned long flags; + + if (!msb->io_queue_stopped) + msb_stop(card); + + dbg("Removing the disk device"); + + /* Take care of unhandled + new requests from now on */ + spin_lock_irqsave(&msb->q_lock, flags); + msb->card_dead = true; + blk_start_queue(msb->queue); + spin_unlock_irqrestore(&msb->q_lock, flags); + + /* Remove the disk */ + del_gendisk(msb->disk); + blk_cleanup_queue(msb->queue); + msb->queue = NULL; + + mutex_lock(&msb_disk_lock); + msb_data_clear(msb); + mutex_unlock(&msb_disk_lock); + + msb_disk_release(msb->disk); + memstick_set_drvdata(card, NULL); +} + +#ifdef 
CONFIG_PM + +static int msb_suspend(struct memstick_dev *card, pm_message_t state) +{ + msb_stop(card); + return 0; +} + +static int msb_resume(struct memstick_dev *card) +{ + struct msb_data *msb = memstick_get_drvdata(card); + struct msb_data *new_msb = NULL; + bool card_dead = true; + +#ifndef CONFIG_MEMSTICK_UNSAFE_RESUME + msb->card_dead = true; + return 0; +#endif + mutex_lock(&card->host->lock); + + new_msb = kzalloc(sizeof(struct msb_data), GFP_KERNEL); + if (!new_msb) + goto out; + + new_msb->card = card; + memstick_set_drvdata(card, new_msb); + spin_lock_init(&new_msb->q_lock); + sg_init_table(msb->prealloc_sg, MS_BLOCK_MAX_SEGS+1); + + if (msb_init_card(card)) + goto out; + + if (msb->block_size != new_msb->block_size) + goto out; + + if (memcmp(msb->boot_page, new_msb->boot_page, + sizeof(struct ms_boot_page))) + goto out; + + if (msb->logical_block_count != new_msb->logical_block_count || + memcmp(msb->lba_to_pba_table, new_msb->lba_to_pba_table, + msb->logical_block_count)) + goto out; + + if (msb->block_count != new_msb->block_count || + memcmp(msb->used_blocks_bitmap, new_msb->used_blocks_bitmap, + msb->block_count / 8)) + goto out; + + card_dead = false; +out: + if (card_dead) + dbg("Card was removed/replaced during suspend"); + + msb->card_dead = card_dead; + memstick_set_drvdata(card, msb); + + if (new_msb) { + msb_data_clear(new_msb); + kfree(new_msb); + } + + msb_start(card); + mutex_unlock(&card->host->lock); + return 0; +} +#else + +#define msb_suspend NULL +#define msb_resume NULL + +#endif /* CONFIG_PM */ + +static struct memstick_device_id msb_id_tbl[] = { + {MEMSTICK_MATCH_ALL, MEMSTICK_TYPE_LEGACY, MEMSTICK_CATEGORY_STORAGE, + MEMSTICK_CLASS_FLASH}, + + {MEMSTICK_MATCH_ALL, MEMSTICK_TYPE_LEGACY, MEMSTICK_CATEGORY_STORAGE, + MEMSTICK_CLASS_ROM}, + + {MEMSTICK_MATCH_ALL, MEMSTICK_TYPE_LEGACY, MEMSTICK_CATEGORY_STORAGE, + MEMSTICK_CLASS_RO}, + + {MEMSTICK_MATCH_ALL, MEMSTICK_TYPE_LEGACY, MEMSTICK_CATEGORY_STORAGE, + MEMSTICK_CLASS_WP}, + + {MEMSTICK_MATCH_ALL, MEMSTICK_TYPE_DUO, MEMSTICK_CATEGORY_STORAGE_DUO, + MEMSTICK_CLASS_DUO}, + {} +}; +MODULE_DEVICE_TABLE(memstick, msb_id_tbl); + + +static struct memstick_driver msb_driver = { + .driver = { + .name = DRIVER_NAME, + .owner = THIS_MODULE + }, + .id_table = msb_id_tbl, + .probe = msb_probe, + .remove = msb_remove, + .suspend = msb_suspend, + .resume = msb_resume +}; + +static int major; + +static int __init msb_init(void) +{ + int rc = register_blkdev(0, DRIVER_NAME); + + if (rc < 0) { + pr_err("failed to register major (error %d)\n", rc); + return rc; + } + + major = rc; + rc = memstick_register_driver(&msb_driver); + if (rc) { + unregister_blkdev(major, DRIVER_NAME); + pr_err("failed to register memstick driver (error %d)\n", rc); + } + + return rc; +} + +static void __exit msb_exit(void) +{ + memstick_unregister_driver(&msb_driver); + unregister_blkdev(major, DRIVER_NAME); + idr_destroy(&msb_disk_idr); +} + +module_init(msb_init); +module_exit(msb_exit); + +module_param(cache_flush_timeout, int, S_IRUGO); +MODULE_PARM_DESC(cache_flush_timeout, + "Cache flush timeout in msec (1000 default)"); +module_param(debug, int, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(debug, "Debug level (0-2)"); + +module_param(verify_writes, bool, S_IRUGO); +MODULE_PARM_DESC(verify_writes, "Read back and check all data that is written"); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Maxim Levitsky"); +MODULE_DESCRIPTION("Sony MemoryStick block device driver"); diff --git a/drivers/memstick/core/ms_block.h b/drivers/memstick/core/ms_block.h 
new file mode 100644 index 000000000000..96e637550988 --- /dev/null +++ b/drivers/memstick/core/ms_block.h @@ -0,0 +1,290 @@ +/* + * ms_block.h - Sony MemoryStick (legacy) storage support + + * Copyright (C) 2013 Maxim Levitsky + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Minor portions of the driver are copied from mspro_block.c which is + * Copyright (C) 2007 Alex Dubov + * + * Also ms structures were copied from old broken driver by same author + * These probably come from MS spec + * + */ + +#ifndef MS_BLOCK_NEW_H +#define MS_BLOCK_NEW_H + +#define MS_BLOCK_MAX_SEGS 32 +#define MS_BLOCK_MAX_PAGES ((2 << 16) - 1) + +#define MS_BLOCK_MAX_BOOT_ADDR 0x000c +#define MS_BLOCK_BOOT_ID 0x0001 +#define MS_BLOCK_INVALID 0xffff +#define MS_MAX_ZONES 16 +#define MS_BLOCKS_IN_ZONE 512 + +#define MS_BLOCK_MAP_LINE_SZ 16 +#define MS_BLOCK_PART_SHIFT 3 + + +#define MEMSTICK_UNCORR_ERROR (MEMSTICK_STATUS1_UCFG | \ + MEMSTICK_STATUS1_UCEX | MEMSTICK_STATUS1_UCDT) + +#define MEMSTICK_CORR_ERROR (MEMSTICK_STATUS1_FGER | MEMSTICK_STATUS1_EXER | \ + MEMSTICK_STATUS1_DTER) + +#define MEMSTICK_INT_ERROR (MEMSTICK_INT_CMDNAK | MEMSTICK_INT_ERR) + +#define MEMSTICK_OVERWRITE_FLAG_NORMAL \ + (MEMSTICK_OVERWRITE_PGST1 | \ + MEMSTICK_OVERWRITE_PGST0 | \ + MEMSTICK_OVERWRITE_BKST) + +#define MEMSTICK_OV_PG_NORMAL \ + (MEMSTICK_OVERWRITE_PGST1 | MEMSTICK_OVERWRITE_PGST0) + +#define MEMSTICK_MANAGMENT_FLAG_NORMAL \ + (MEMSTICK_MANAGEMENT_SYSFLG | \ + MEMSTICK_MANAGEMENT_SCMS1 | \ + MEMSTICK_MANAGEMENT_SCMS0) \ + +struct ms_boot_header { + unsigned short block_id; + unsigned short format_reserved; + unsigned char reserved0[184]; + unsigned char data_entry; + unsigned char reserved1[179]; +} __packed; + + +struct ms_system_item { + unsigned int start_addr; + unsigned int data_size; + unsigned char data_type_id; + unsigned char reserved[3]; +} __packed; + +struct ms_system_entry { + struct ms_system_item disabled_block; + struct ms_system_item cis_idi; + unsigned char reserved[24]; +} __packed; + +struct ms_boot_attr_info { + unsigned char memorystick_class; + unsigned char format_unique_value1; + unsigned short block_size; + unsigned short number_of_blocks; + unsigned short number_of_effective_blocks; + unsigned short page_size; + unsigned char extra_data_size; + unsigned char format_unique_value2; + unsigned char assembly_time[8]; + unsigned char format_unique_value3; + unsigned char serial_number[3]; + unsigned char assembly_manufacturer_code; + unsigned char assembly_model_code[3]; + unsigned short memory_manufacturer_code; + unsigned short memory_device_code; + unsigned short implemented_capacity; + unsigned char format_unique_value4[2]; + unsigned char vcc; + unsigned char vpp; + unsigned short controller_number; + unsigned short controller_function; + unsigned char reserved0[9]; + unsigned char transfer_supporting; + unsigned short format_unique_value5; + unsigned char format_type; + unsigned char memorystick_application; + unsigned char device_type; + unsigned char reserved1[22]; + unsigned char format_uniqure_value6[2]; + unsigned char reserved2[15]; +} __packed; + +struct ms_cis_idi { + unsigned short general_config; + unsigned short logical_cylinders; + unsigned short reserved0; + unsigned short logical_heads; + unsigned short track_size; + unsigned short page_size; + unsigned short pages_per_track; + unsigned short msw; + unsigned short lsw; + unsigned 
short reserved1; + unsigned char serial_number[20]; + unsigned short buffer_type; + unsigned short buffer_size_increments; + unsigned short long_command_ecc; + unsigned char firmware_version[28]; + unsigned char model_name[18]; + unsigned short reserved2[5]; + unsigned short pio_mode_number; + unsigned short dma_mode_number; + unsigned short field_validity; + unsigned short current_logical_cylinders; + unsigned short current_logical_heads; + unsigned short current_pages_per_track; + unsigned int current_page_capacity; + unsigned short mutiple_page_setting; + unsigned int addressable_pages; + unsigned short single_word_dma; + unsigned short multi_word_dma; + unsigned char reserved3[128]; +} __packed; + + +struct ms_boot_page { + struct ms_boot_header header; + struct ms_system_entry entry; + struct ms_boot_attr_info attr; +} __packed; + +struct msb_data { + unsigned int usage_count; + struct memstick_dev *card; + struct gendisk *disk; + struct request_queue *queue; + spinlock_t q_lock; + struct hd_geometry geometry; + struct attribute_group attr_group; + struct request *req; + int caps; + int disk_id; + + /* IO */ + struct workqueue_struct *io_queue; + bool io_queue_stopped; + struct work_struct io_work; + bool card_dead; + + /* Media properties */ + struct ms_boot_page *boot_page; + u16 boot_block_locations[2]; + int boot_block_count; + + bool read_only; + unsigned short page_size; + int block_size; + int pages_in_block; + int zone_count; + int block_count; + int logical_block_count; + + /* FTL tables */ + unsigned long *used_blocks_bitmap; + unsigned long *erased_blocks_bitmap; + u16 *lba_to_pba_table; + int free_block_count[MS_MAX_ZONES]; + bool ftl_initialized; + + /* Cache */ + unsigned char *cache; + unsigned long valid_cache_bitmap; + int cache_block_lba; + bool need_flush_cache; + struct timer_list cache_flush_timer; + + /* Preallocated buffers */ + unsigned char *block_buffer; + struct scatterlist prealloc_sg[MS_BLOCK_MAX_SEGS+1]; + + + /* handler's local data */ + struct ms_register_addr reg_addr; + bool addr_valid; + + u8 command_value; + bool command_need_oob; + struct scatterlist *current_sg; + int current_sg_offset; + + struct ms_register regs; + int current_page; + + int state; + int exit_error; + bool int_polling; + unsigned long int_timeout; + +}; + +enum msb_readpage_states { + MSB_RP_SEND_BLOCK_ADDRESS = 0, + MSB_RP_SEND_READ_COMMAND, + + MSB_RP_SEND_INT_REQ, + MSB_RP_RECEIVE_INT_REQ_RESULT, + + MSB_RP_SEND_READ_STATUS_REG, + MSB_RP_RECIVE_STATUS_REG, + + MSB_RP_SEND_OOB_READ, + MSB_RP_RECEIVE_OOB_READ, + + MSB_RP_SEND_READ_DATA, + MSB_RP_RECEIVE_READ_DATA, +}; + +enum msb_write_block_states { + MSB_WB_SEND_WRITE_PARAMS = 0, + MSB_WB_SEND_WRITE_OOB, + MSB_WB_SEND_WRITE_COMMAND, + + MSB_WB_SEND_INT_REQ, + MSB_WB_RECEIVE_INT_REQ, + + MSB_WB_SEND_WRITE_DATA, + MSB_WB_RECEIVE_WRITE_CONFIRMATION, +}; + +enum msb_send_command_states { + MSB_SC_SEND_WRITE_PARAMS, + MSB_SC_SEND_WRITE_OOB, + MSB_SC_SEND_COMMAND, + + MSB_SC_SEND_INT_REQ, + MSB_SC_RECEIVE_INT_REQ, + +}; + +enum msb_reset_states { + MSB_RS_SEND, + MSB_RS_CONFIRM, +}; + +enum msb_par_switch_states { + MSB_PS_SEND_SWITCH_COMMAND, + MSB_PS_SWICH_HOST, + MSB_PS_CONFIRM, +}; + +struct chs_entry { + unsigned long size; + unsigned char sec; + unsigned short cyl; + unsigned char head; +}; + +static int msb_reset(struct msb_data *msb, bool full); + +static int h_msb_default_bad(struct memstick_dev *card, + struct memstick_request **mrq); + +#define __dbg(level, format, ...) 
\
+	do { \
+		if (debug >= level) \
+			pr_err(format "\n", ## __VA_ARGS__); \
+	} while (0)
+
+
+#define dbg(format, ...)		__dbg(1, format, ## __VA_ARGS__)
+#define dbg_verbose(format, ...)	__dbg(2, format, ## __VA_ARGS__)
+
+#endif
From bf4228f0ef662f11252f8cde7fa92979e4ac6b69 Mon Sep 17 00:00:00 2001
From: Jingoo Han
Date: Wed, 11 Sep 2013 14:26:03 -0700
Subject: [PATCH 280/303] drivers/w1/w1.c: replace strict_strtol() with
 kstrtol()

The usage of strict_strtol() is not preferred, because strict_strtol() is
obsolete. Thus, kstrtol() should be used.

Signed-off-by: Jingoo Han
Cc: Evgeniy Polyakov
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/w1/w1.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/w1/w1.c b/drivers/w1/w1.c
index 22013ca2119c..c7c64f18773d 100644
--- a/drivers/w1/w1.c
+++ b/drivers/w1/w1.c
@@ -234,9 +234,11 @@ static ssize_t w1_master_attribute_store_search(struct device * dev,
 {
 	long tmp;
 	struct w1_master *md = dev_to_w1_master(dev);
+	int ret;
 
-	if (strict_strtol(buf, 0, &tmp) == -EINVAL)
-		return -EINVAL;
+	ret = kstrtol(buf, 0, &tmp);
+	if (ret)
+		return ret;
 
 	mutex_lock(&md->mutex);
 	md->search_count = tmp;
@@ -266,9 +268,11 @@ static ssize_t w1_master_attribute_store_pullup(struct device *dev,
 {
 	long tmp;
 	struct w1_master *md = dev_to_w1_master(dev);
+	int ret;
 
-	if (strict_strtol(buf, 0, &tmp) == -EINVAL)
-		return -EINVAL;
+	ret = kstrtol(buf, 0, &tmp);
+	if (ret)
+		return ret;
 
 	mutex_lock(&md->mutex);
 	md->enable_pullup = tmp;
From 4b39248365e09fb8268b6fecd1704907ffc3d980 Mon Sep 17 00:00:00 2001
From: Jingoo Han
Date: Wed, 11 Sep 2013 14:26:04 -0700
Subject: [PATCH 281/303] drivers/w1/masters/mxc_w1.c: remove unnecessary
 platform_set_drvdata()

The driver core clears the driver data to NULL after device_release or on
probe failure. Thus, it is not needed to manually clear the device
driver data to NULL.

Signed-off-by: Jingoo Han
Cc: Evgeniy Polyakov
Cc: Greg KH
Acked-by: Shawn Guo
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/w1/masters/mxc_w1.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/w1/masters/mxc_w1.c b/drivers/w1/masters/mxc_w1.c
index 47e12cfc2a57..15c7251b0556 100644
--- a/drivers/w1/masters/mxc_w1.c
+++ b/drivers/w1/masters/mxc_w1.c
@@ -152,8 +152,6 @@ static int mxc_w1_remove(struct platform_device *pdev)
 
 	clk_disable_unprepare(mdev->clk);
 
-	platform_set_drvdata(pdev, NULL);
-
 	return 0;
 }
From 5e4c0d974139a98741b829b27cf38dc8f9284490 Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Wed, 11 Sep 2013 14:26:05 -0700
Subject: [PATCH 282/303] lib/radix-tree.c: make radix_tree_node_alloc() work
 correctly within interrupt

With users of radix_tree_preload() run from interrupt (block/blk-ioc.c is
one such possible user), the following race can happen:

radix_tree_preload()
...
radix_tree_insert()
  radix_tree_node_alloc()
    if (rtp->nr) {
      ret = rtp->nodes[rtp->nr - 1];
<interrupt>
...
radix_tree_preload()
...
radix_tree_insert()
  radix_tree_node_alloc()
    if (rtp->nr) {
      ret = rtp->nodes[rtp->nr - 1];

And we give out one radix tree node twice. That clearly results in radix
tree corruption with different results (usually OOPS) depending on which
two users of radix tree race. We fix the problem by making
radix_tree_node_alloc() always allocate fresh radix tree nodes when in
interrupt.
Using preloading when in interrupt doesn't make sense since all the
allocations have to be atomic anyway and we cannot steal nodes from
process-context users because some users rely on radix_tree_insert()
succeeding after radix_tree_preload(). The in_interrupt() check is somewhat
ugly but we cannot simply key off the passed gfp_mask as that is acquired
from root_gfp_mask() and thus the same for all preload users.

Another part of the fix is to avoid node preallocation in
radix_tree_preload() when the passed gfp_mask doesn't allow waiting. Again,
preallocation in such a case doesn't make sense and when preallocation
would happen in interrupt we could possibly leak some allocated nodes.
However, some users of radix_tree_preload() require the following
radix_tree_insert() to succeed. To avoid unexpected effects for these
users, radix_tree_preload() only warns if the passed gfp mask doesn't allow
waiting and we provide a new function radix_tree_maybe_preload() for
those users which get different gfp masks from different call sites and
which are prepared to handle radix_tree_insert() failure.

Signed-off-by: Jan Kara
Cc: Jens Axboe
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 block/blk-ioc.c            |  2 +-
 fs/fscache/page.c          |  2 +-
 include/linux/radix-tree.h |  1 +
 lib/radix-tree.c           | 41 ++++++++++++++++++++++++++++++++++++--
 mm/filemap.c               |  2 +-
 mm/shmem.c                 |  2 +-
 mm/swap_state.c            |  4 ++--
 7 files changed, 46 insertions(+), 8 deletions(-)

diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 4464c823cff2..46cd7bd18b34 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -367,7 +367,7 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
 	if (!icq)
 		return NULL;
 
-	if (radix_tree_preload(gfp_mask) < 0) {
+	if (radix_tree_maybe_preload(gfp_mask) < 0) {
 		kmem_cache_free(et->icq_cache, icq);
 		return NULL;
 	}
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 8702b732109a..73899c1c3449 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -913,7 +913,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
 		(1 << FSCACHE_OP_WAITING) |
 		(1 << FSCACHE_OP_UNUSE_COOKIE);
 
-	ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
+	ret = radix_tree_maybe_preload(gfp & ~__GFP_HIGHMEM);
 	if (ret < 0)
 		goto nomem_free;
 
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index ffc444c38b0a..403940787be1 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -231,6 +231,7 @@ unsigned long radix_tree_next_hole(struct radix_tree_root *root,
 unsigned long radix_tree_prev_hole(struct radix_tree_root *root,
 				unsigned long index, unsigned long max_scan);
 int radix_tree_preload(gfp_t gfp_mask);
+int radix_tree_maybe_preload(gfp_t gfp_mask);
 void radix_tree_init(void);
 void *radix_tree_tag_set(struct radix_tree_root *root,
 			unsigned long index, unsigned int tag);
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index e7964296fd50..7811ed3b4e70 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -32,6 +32,7 @@
 #include
 #include
 #include
+#include <linux/hardirq.h>	/* in_interrupt() */
 
 #ifdef __KERNEL__
@@ -207,7 +208,12 @@ radix_tree_node_alloc(struct radix_tree_root *root)
 	struct radix_tree_node *ret = NULL;
 	gfp_t gfp_mask = root_gfp_mask(root);
 
-	if (!(gfp_mask & __GFP_WAIT)) {
+	/*
+	 * Preload code isn't irq safe and it doesn't make sense to use
+	 * preloading in the interrupt anyway as all the allocations have to
+	 * be atomic. So just do normal allocation when in interrupt.
+ */ + if (!(gfp_mask & __GFP_WAIT) && !in_interrupt()) { struct radix_tree_preload *rtp; /* @@ -264,7 +270,7 @@ radix_tree_node_free(struct radix_tree_node *node) * To make use of this facility, the radix tree must be initialised without * __GFP_WAIT being passed to INIT_RADIX_TREE(). */ -int radix_tree_preload(gfp_t gfp_mask) +static int __radix_tree_preload(gfp_t gfp_mask) { struct radix_tree_preload *rtp; struct radix_tree_node *node; @@ -288,8 +294,39 @@ int radix_tree_preload(gfp_t gfp_mask) out: return ret; } + +/* + * Load up this CPU's radix_tree_node buffer with sufficient objects to + * ensure that the addition of a single element in the tree cannot fail. On + * success, return zero, with preemption disabled. On error, return -ENOMEM + * with preemption not disabled. + * + * To make use of this facility, the radix tree must be initialised without + * __GFP_WAIT being passed to INIT_RADIX_TREE(). + */ +int radix_tree_preload(gfp_t gfp_mask) +{ + /* Warn on non-sensical use... */ + WARN_ON_ONCE(!(gfp_mask & __GFP_WAIT)); + return __radix_tree_preload(gfp_mask); +} EXPORT_SYMBOL(radix_tree_preload); +/* + * The same as above function, except we don't guarantee preloading happens. + * We do it, if we decide it helps. On success, return zero with preemption + * disabled. On error, return -ENOMEM with preemption not disabled. + */ +int radix_tree_maybe_preload(gfp_t gfp_mask) +{ + if (gfp_mask & __GFP_WAIT) + return __radix_tree_preload(gfp_mask); + /* Preloading doesn't help anything with this gfp mask, skip it */ + preempt_disable(); + return 0; +} +EXPORT_SYMBOL(radix_tree_maybe_preload); + /* * Return the maximum key which can be store into a * radix tree with height HEIGHT. diff --git a/mm/filemap.c b/mm/filemap.c index 731a2c24532d..e607728db4a8 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -469,7 +469,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, if (error) goto out; - error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); + error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); if (error == 0) { page_cache_get(page); page->mapping = mapping; diff --git a/mm/shmem.c b/mm/shmem.c index 526149846d0a..a1b8bf4391c2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1205,7 +1205,7 @@ repeat: gfp & GFP_RECLAIM_MASK); if (error) goto decused; - error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); + error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, gfp, NULL); diff --git a/mm/swap_state.c b/mm/swap_state.c index f24ab0dff554..e6f15f8ca2af 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -122,7 +122,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) { int error; - error = radix_tree_preload(gfp_mask); + error = radix_tree_maybe_preload(gfp_mask); if (!error) { error = __add_to_swap_cache(page, entry); radix_tree_preload_end(); @@ -328,7 +328,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* * call radix_tree_preload() while we can wait. */ - err = radix_tree_preload(gfp_mask & GFP_KERNEL); + err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL); if (err) break; From 137fdcc18a5979b53c0a1379b25fc68724e98a45 Mon Sep 17 00:00:00 2001 From: Rob Landley Date: Wed, 11 Sep 2013 14:26:06 -0700 Subject: [PATCH 283/303] initmpfs: replace MS_NOUSER in initramfs Mounting MS_NOUSER prevents --bind mounts from rootfs. Prevent new rootfs mounts with a different mechanism that doesn't affect bind mounts. 
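[Illustration, not part of the patch: the replacement mechanism is the
one-shot guard added to rootfs_mount() in the hunk below. test_and_set_bit()
returns the previous value of the bit, so only the first rootfs mount sees 0
and proceeds; every later attempt gets -ENODEV, while bind mounts never go
through this ->mount callback and are therefore unaffected. Sketch:

	static unsigned long once;

	if (test_and_set_bit(1, &once))	/* 0 on the first call, 1 after */
		return ERR_PTR(-ENODEV);
]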
Signed-off-by: Rob Landley Cc: Jeff Layton Cc: Jens Axboe Cc: Stephen Warren Cc: Rusty Russell Cc: Jim Cromie Cc: Sam Ravnborg Cc: Greg Kroah-Hartman Cc: "Eric W. Biederman" Cc: Alexander Viro Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ramfs/inode.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index c24f1e10b946..8f7fe323e049 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -247,7 +247,12 @@ struct dentry *ramfs_mount(struct file_system_type *fs_type, static struct dentry *rootfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super); + static unsigned long once; + + if (test_and_set_bit(1, &once)) + return ERR_PTR(-ENODEV); + + return mount_nodev(fs_type, flags, data, ramfs_fill_super); } static void ramfs_kill_sb(struct super_block *sb) From 4bbee76bc986af326be0a84ad661000cf89b29f6 Mon Sep 17 00:00:00 2001 From: Rob Landley Date: Wed, 11 Sep 2013 14:26:08 -0700 Subject: [PATCH 284/303] initmpfs: move bdi setup from init_rootfs to init_ramfs Even though ramfs hasn't got a backing device, commit e0bf68ddec4f ("mm: bdi init hooks") added one anyway, and put the initialization in init_rootfs() since that's the first user, leaving it out of init_ramfs() to avoid duplication. But initmpfs uses init_tmpfs() instead, so move the init into the filesystem's init function, add a "once" guard to prevent duplicate initialization, and call the filesystem init from rootfs init. This goes part of the way to allowing ramfs to be built as a module. [akpm@linux-foundation.org; using bit 1 was odd] Signed-off-by: Rob Landley Cc: Jeff Layton Cc: Jens Axboe Cc: Stephen Warren Cc: Rusty Russell Cc: Jim Cromie Cc: Sam Ravnborg Cc: Greg Kroah-Hartman Cc: "Eric W. Biederman" Cc: Alexander Viro Cc: "H. 
Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ramfs/inode.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 8f7fe323e049..fb99863598be 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -249,7 +249,7 @@ static struct dentry *rootfs_mount(struct file_system_type *fs_type, { static unsigned long once; - if (test_and_set_bit(1, &once)) + if (test_and_set_bit(0, &once)) return ERR_PTR(-ENODEV); return mount_nodev(fs_type, flags, data, ramfs_fill_super); @@ -275,21 +275,34 @@ static struct file_system_type rootfs_fs_type = { static int __init init_ramfs_fs(void) { - return register_filesystem(&ramfs_fs_type); -} -module_init(init_ramfs_fs) - -int __init init_rootfs(void) -{ + static unsigned long once; int err; + if (test_and_set_bit(0, &once)) + return 0; + err = bdi_init(&ramfs_backing_dev_info); if (err) return err; - err = register_filesystem(&rootfs_fs_type); + err = register_filesystem(&ramfs_fs_type); if (err) bdi_destroy(&ramfs_backing_dev_info); return err; } +module_init(init_ramfs_fs) + +int __init init_rootfs(void) +{ + int err = register_filesystem(&rootfs_fs_type); + + if (err) + return err; + + err = init_ramfs_fs(); + if (err) + unregister_filesystem(&rootfs_fs_type); + + return err; +} From 57f150a58c40cda598c31af8bceb8598f43c3e5f Mon Sep 17 00:00:00 2001 From: Rob Landley Date: Wed, 11 Sep 2013 14:26:10 -0700 Subject: [PATCH 285/303] initmpfs: move rootfs code from fs/ramfs/ to init/ When the rootfs code was a wrapper around ramfs, having them in the same file made sense. Now that it can wrap another filesystem type, move it in with the init code instead. This also allows a subsequent patch to access rootfstype= command line arg. Signed-off-by: Rob Landley Cc: Jeff Layton Cc: Jens Axboe Cc: Stephen Warren Cc: Rusty Russell Cc: Jim Cromie Cc: Sam Ravnborg Cc: Greg Kroah-Hartman Cc: "Eric W. Biederman" Cc: Alexander Viro Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namespace.c | 2 +- fs/ramfs/inode.c | 32 +------------------------------- include/linux/init.h | 1 + include/linux/ramfs.h | 2 +- init/do_mounts.c | 32 ++++++++++++++++++++++++++++++++ 5 files changed, 36 insertions(+), 33 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 25845d1b300b..da5c49483430 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -17,7 +17,7 @@ #include #include #include /* acct_auto_close_mnt */ -#include /* init_rootfs */ +#include /* init_rootfs */ #include /* get_fs_root et.al. 
*/ #include /* fsnotify_vfsmount_delete */ #include diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index fb99863598be..39d14659a8d3 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -244,17 +244,6 @@ struct dentry *ramfs_mount(struct file_system_type *fs_type, return mount_nodev(fs_type, flags, data, ramfs_fill_super); } -static struct dentry *rootfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - static unsigned long once; - - if (test_and_set_bit(0, &once)) - return ERR_PTR(-ENODEV); - - return mount_nodev(fs_type, flags, data, ramfs_fill_super); -} - static void ramfs_kill_sb(struct super_block *sb) { kfree(sb->s_fs_info); @@ -267,13 +256,8 @@ static struct file_system_type ramfs_fs_type = { .kill_sb = ramfs_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; -static struct file_system_type rootfs_fs_type = { - .name = "rootfs", - .mount = rootfs_mount, - .kill_sb = kill_litter_super, -}; -static int __init init_ramfs_fs(void) +int __init init_ramfs_fs(void) { static unsigned long once; int err; @@ -292,17 +276,3 @@ static int __init init_ramfs_fs(void) return err; } module_init(init_ramfs_fs) - -int __init init_rootfs(void) -{ - int err = register_filesystem(&rootfs_fs_type); - - if (err) - return err; - - err = init_ramfs_fs(); - if (err) - unregister_filesystem(&rootfs_fs_type); - - return err; -} diff --git a/include/linux/init.h b/include/linux/init.h index e73f2b708525..f1c27a71d03c 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -153,6 +153,7 @@ extern unsigned int reset_devices; void setup_arch(char **); void prepare_namespace(void); void __init load_default_modules(void); +int __init init_rootfs(void); extern void (*late_time_init)(void); diff --git a/include/linux/ramfs.h b/include/linux/ramfs.h index 69e37c2d1ea5..753207c8ce20 100644 --- a/include/linux/ramfs.h +++ b/include/linux/ramfs.h @@ -25,7 +25,7 @@ extern int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma); extern const struct file_operations ramfs_file_operations; extern const struct vm_operations_struct generic_file_vm_ops; -extern int __init init_rootfs(void); +extern int __init init_ramfs_fs(void); int ramfs_fill_super(struct super_block *sb, void *data, int silent); diff --git a/init/do_mounts.c b/init/do_mounts.c index 816014c4627e..5d8d48fd0ee4 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -588,3 +589,34 @@ out: sys_mount(".", "/", NULL, MS_MOVE, NULL); sys_chroot("."); } + +static struct dentry *rootfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + static unsigned long once; + + if (test_and_set_bit(0, &once)) + return ERR_PTR(-ENODEV); + + return mount_nodev(fs_type, flags, data, ramfs_fill_super); +} + +static struct file_system_type rootfs_fs_type = { + .name = "rootfs", + .mount = rootfs_mount, + .kill_sb = kill_litter_super, +}; + +int __init init_rootfs(void) +{ + int err = register_filesystem(&rootfs_fs_type); + + if (err) + return err; + + err = init_ramfs_fs(); + if (err) + unregister_filesystem(&rootfs_fs_type); + + return err; +} From 16203a7a9422315bc929461503e3a046459ea5ff Mon Sep 17 00:00:00 2001 From: Rob Landley Date: Wed, 11 Sep 2013 14:26:12 -0700 Subject: [PATCH 286/303] initmpfs: make rootfs use tmpfs when CONFIG_TMPFS enabled Conditionally call the appropriate fs_init function and fill_super functions. Add a use once guard to shmem_init() to simply succeed on a second call. 
(Note that IS_ENABLED() is a compile time constant so dead code elimination removes unused function calls when CONFIG_TMPFS is disabled.) Signed-off-by: Rob Landley Cc: Jeff Layton Cc: Jens Axboe Cc: Stephen Warren Cc: Rusty Russell Cc: Jim Cromie Cc: Sam Ravnborg Cc: Greg Kroah-Hartman Cc: "Eric W. Biederman" Cc: Alexander Viro Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/do_mounts.c | 10 ++++++++-- mm/shmem.c | 4 ++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/init/do_mounts.c b/init/do_mounts.c index 5d8d48fd0ee4..e27908b949d4 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -598,7 +599,8 @@ static struct dentry *rootfs_mount(struct file_system_type *fs_type, if (test_and_set_bit(0, &once)) return ERR_PTR(-ENODEV); - return mount_nodev(fs_type, flags, data, ramfs_fill_super); + return mount_nodev(fs_type, flags, data, + IS_ENABLED(CONFIG_TMPFS) ? shmem_fill_super : ramfs_fill_super); } static struct file_system_type rootfs_fs_type = { @@ -614,7 +616,11 @@ int __init init_rootfs(void) if (err) return err; - err = init_ramfs_fs(); + if (IS_ENABLED(CONFIG_TMPFS)) + err = shmem_init(); + else + err = init_ramfs_fs(); + if (err) unregister_filesystem(&rootfs_fs_type); diff --git a/mm/shmem.c b/mm/shmem.c index a1b8bf4391c2..8297623fcaed 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2819,6 +2819,10 @@ int __init shmem_init(void) { int error; + /* If rootfs called this, don't re-init */ + if (shmem_inode_cachep) + return 0; + error = bdi_init(&shmem_backing_dev_info); if (error) goto out4; From 6e19eded3684dc184181093af3bff2ff440f5b53 Mon Sep 17 00:00:00 2001 From: Rob Landley Date: Wed, 11 Sep 2013 14:26:13 -0700 Subject: [PATCH 287/303] initmpfs: use initramfs if rootfstype= or root= specified Command line option rootfstype=ramfs to obtain old initramfs behavior, and use ramfs instead of tmpfs for stub when root= defined (for cosmetic reasons). [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Rob Landley Cc: Jeff Layton Cc: Jens Axboe Cc: Stephen Warren Cc: Rusty Russell Cc: Jim Cromie Cc: Sam Ravnborg Cc: Greg Kroah-Hartman Cc: "Eric W. Biederman" Cc: Alexander Viro Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../filesystems/ramfs-rootfs-initramfs.txt | 4 ++++ init/do_mounts.c | 15 +++++++++++---- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/Documentation/filesystems/ramfs-rootfs-initramfs.txt b/Documentation/filesystems/ramfs-rootfs-initramfs.txt index 59b4a0962e0f..b176928e6963 100644 --- a/Documentation/filesystems/ramfs-rootfs-initramfs.txt +++ b/Documentation/filesystems/ramfs-rootfs-initramfs.txt @@ -79,6 +79,10 @@ to just make sure certain lists can't become empty. Most systems just mount another filesystem over rootfs and ignore it. The amount of space an empty instance of ramfs takes up is tiny. +If CONFIG_TMPFS is enabled, rootfs will use tmpfs instead of ramfs by +default. To force ramfs, add "rootfstype=ramfs" to the kernel command +line. + What is initramfs? 
------------------ diff --git a/init/do_mounts.c b/init/do_mounts.c index e27908b949d4..a51cddc2ff8c 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -591,16 +591,20 @@ out: sys_chroot("."); } +static bool is_tmpfs; static struct dentry *rootfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { static unsigned long once; + void *fill = ramfs_fill_super; if (test_and_set_bit(0, &once)) return ERR_PTR(-ENODEV); - return mount_nodev(fs_type, flags, data, - IS_ENABLED(CONFIG_TMPFS) ? shmem_fill_super : ramfs_fill_super); + if (IS_ENABLED(CONFIG_TMPFS) && is_tmpfs) + fill = shmem_fill_super; + + return mount_nodev(fs_type, flags, data, fill); } static struct file_system_type rootfs_fs_type = { @@ -616,10 +620,13 @@ int __init init_rootfs(void) if (err) return err; - if (IS_ENABLED(CONFIG_TMPFS)) + if (IS_ENABLED(CONFIG_TMPFS) && !saved_root_name[0] && + (!root_fs_names || strstr(root_fs_names, "tmpfs"))) { err = shmem_init(); - else + is_tmpfs = true; + } else { err = init_ramfs_fs(); + } if (err) unregister_filesystem(&rootfs_fs_type); From 8b8d52ac382b17a19906b930cd69e2edb0aca8ba Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:15 -0700 Subject: [PATCH 288/303] ipc,shm: introduce lockless functions to obtain the ipc object This is the third and final patchset that deals with reducing the amount of contention we impose on the ipc lock (kern_ipc_perm.lock). These changes mostly deal with shared memory, previous work has already been done for semaphores and message queues: http://lkml.org/lkml/2013/3/20/546 (sems) http://lkml.org/lkml/2013/5/15/584 (mqueues) With these patches applied, a custom shm microbenchmark stressing shmctl doing IPC_STAT with 4 threads a million times, reduces the execution time by 50%. A similar run, this time with IPC_SET, reduces the execution time from 3 mins and 35 secs to 27 seconds. Patches 1-8: replaces blindly taking the ipc lock for a smarter combination of rcu and ipc_obtain_object, only acquiring the spinlock when updating. Patch 9: renames the ids rw_mutex to rwsem, which is what it already was. Patch 10: is a trivial mqueue leftover cleanup Patch 11: adds a brief lock scheme description, requested by Andrew. This patch: Add shm_obtain_object() and shm_obtain_object_check(), which will allow us to get the ipc object without acquiring the lock. Just as with other forms of ipc, these functions are basically wrappers around ipc_obtain_object*(). 
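[Illustration, not part of the patch: a sketch of the read-side pattern
these wrappers enable. The caller below is hypothetical, but the later
patches in this series use essentially this shape:

	rcu_read_lock();
	shp = shm_obtain_object_check(ns, shmid);
	if (IS_ERR(shp)) {
		err = PTR_ERR(shp);
		goto out_unlock;
	}
	/* read-mostly fields can be inspected under RCU alone; the
	 * fine-grained ipc lock is taken only to update the object */
	ipc_lock_object(&shp->shm_perm);
	/* ... modify shp ... */
	ipc_unlock_object(&shp->shm_perm);
out_unlock:
	rcu_read_unlock();
]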
Signed-off-by: Davidlohr Bueso Tested-by: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/ipc/shm.c b/ipc/shm.c index c6b4ad5ce3b7..216ae727a936 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -124,6 +124,26 @@ void __init shm_init (void) IPC_SHM_IDS, sysvipc_shm_proc_show); } +static inline struct shmid_kernel *shm_obtain_object(struct ipc_namespace *ns, int id) +{ + struct kern_ipc_perm *ipcp = ipc_obtain_object(&shm_ids(ns), id); + + if (IS_ERR(ipcp)) + return ERR_CAST(ipcp); + + return container_of(ipcp, struct shmid_kernel, shm_perm); +} + +static inline struct shmid_kernel *shm_obtain_object_check(struct ipc_namespace *ns, int id) +{ + struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&shm_ids(ns), id); + + if (IS_ERR(ipcp)) + return ERR_CAST(ipcp); + + return container_of(ipcp, struct shmid_kernel, shm_perm); +} + /* * shm_lock_(check_) routines are called in the paths where the rw_mutex * is not necessarily held. From 79ccf0f8c8e04e8b9eda6645ba0f63b0915a3075 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:16 -0700 Subject: [PATCH 289/303] ipc,shm: shorten critical region in shmctl_down Instead of holding the ipc lock for the entire function, use ipcctl_pre_down_nolock() and only acquire the lock for specific commands: RMID and SET. Signed-off-by: Davidlohr Bueso Tested-by: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/ipc/shm.c b/ipc/shm.c index 216ae727a936..22cffd78dbb1 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -780,11 +780,10 @@ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, down_write(&shm_ids(ns).rw_mutex); rcu_read_lock(); - ipcp = ipcctl_pre_down(ns, &shm_ids(ns), shmid, cmd, - &shmid64.shm_perm, 0); + ipcp = ipcctl_pre_down_nolock(ns, &shm_ids(ns), shmid, cmd, + &shmid64.shm_perm, 0); if (IS_ERR(ipcp)) { err = PTR_ERR(ipcp); - /* the ipc lock is not held upon failure */ goto out_unlock1; } @@ -792,14 +791,16 @@ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, err = security_shm_shmctl(shp, cmd); if (err) - goto out_unlock0; + goto out_unlock1; switch (cmd) { case IPC_RMID: + ipc_lock_object(&shp->shm_perm); /* do_shm_rmid unlocks the ipc object and rcu */ do_shm_rmid(ns, ipcp); goto out_up; case IPC_SET: + ipc_lock_object(&shp->shm_perm); err = ipc_update_perm(&shmid64.shm_perm, ipcp); if (err) goto out_unlock0; @@ -807,6 +808,7 @@ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, break; default: err = -EINVAL; + goto out_unlock1; } out_unlock0: From 3b1c4ad37741e53804ffe0a30dd01e08b2ab6241 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:17 -0700 Subject: [PATCH 290/303] ipc: drop ipcctl_pre_down Now that sem, msgque and shm, through *_down(), all use the lockless variant of ipcctl_pre_down(), go ahead and delete it.
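For reference, the caller-side pattern that replaces ipcctl_pre_down() looks roughly like this (a condensed sketch of the *_down() functions converted above; ids, perm64 and the labels stand in for each caller's actual variables):

	down_write(&ids->rw_mutex);
	rcu_read_lock();

	ipcp = ipcctl_pre_down_nolock(ns, ids, id, cmd, &perm64, 0);
	if (IS_ERR(ipcp)) {
		err = PTR_ERR(ipcp);
		goto out_unlock1;	/* no ipc lock to undo on failure */
	}

	/* audit/permission/security checks run under rcu only */

	ipc_lock_object(ipcp);	/* taken just for the commands that update */
	/* ... IPC_RMID / IPC_SET style updates ... */
	ipc_unlock_object(ipcp);
out_unlock1:
	rcu_read_unlock();
	up_write(&ids->rw_mutex);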
[akpm@linux-foundation.org: fix function name in kerneldoc, cleanups] Signed-off-by: Davidlohr Bueso Tested-by: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/util.c | 24 ++++-------------------- ipc/util.h | 3 --- 2 files changed, 4 insertions(+), 23 deletions(-) diff --git a/ipc/util.c b/ipc/util.c index 4704223bfad4..2c8a93b380ba 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -733,7 +733,7 @@ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out) } /** - * ipcctl_pre_down - retrieve an ipc and check permissions for some IPC_XXX cmd + * ipcctl_pre_down_nolock - retrieve an ipc and check permissions for some IPC_XXX cmd * @ns: the ipc namespace * @ids: the table of ids where to look for the ipc * @id: the id of the ipc to retrieve @@ -746,29 +746,13 @@ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out) * It must be called without any lock held and * - retrieves the ipc with the given id in the given table. * - performs some audit and permission check, depending on the given cmd - * - returns the ipc with the ipc lock held in case of success - * or an err-code without any lock held otherwise. + * - returns a pointer to the ipc object or otherwise, the corresponding error. * * Call holding the both the rw_mutex and the rcu read lock. */ -struct kern_ipc_perm *ipcctl_pre_down(struct ipc_namespace *ns, - struct ipc_ids *ids, int id, int cmd, - struct ipc64_perm *perm, int extra_perm) -{ - struct kern_ipc_perm *ipcp; - - ipcp = ipcctl_pre_down_nolock(ns, ids, id, cmd, perm, extra_perm); - if (IS_ERR(ipcp)) - goto out; - - spin_lock(&ipcp->lock); -out: - return ipcp; -} - struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns, - struct ipc_ids *ids, int id, int cmd, - struct ipc64_perm *perm, int extra_perm) + struct ipc_ids *ids, int id, int cmd, + struct ipc64_perm *perm, int extra_perm) { kuid_t euid; int err = -EPERM; diff --git a/ipc/util.h b/ipc/util.h index b6a6a88f3002..41a6c4d26399 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -131,9 +131,6 @@ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out); struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns, struct ipc_ids *ids, int id, int cmd, struct ipc64_perm *perm, int extra_perm); -struct kern_ipc_perm *ipcctl_pre_down(struct ipc_namespace *ns, - struct ipc_ids *ids, int id, int cmd, - struct ipc64_perm *perm, int extra_perm); #ifndef CONFIG_ARCH_WANT_IPC_PARSE_VERSION /* On IA-64, we always use the "64-bit version" of the IPC structures. */ From 68eccc1dc345539d589ae78ee43b835c1a06a134 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:18 -0700 Subject: [PATCH 291/303] ipc,shm: introduce shmctl_nolock Similar to semctl and msgctl, when calling shmctl, the *_INFO and *_STAT commands can be performed without acquiring the ipc object lock. Add a shmctl_nolock() function and move the logic of *_INFO and *_STAT out of shmctl(). Since we are just moving functionality, this change still takes the lock and it will be properly lockless in the next patch.
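Abridged, the dispatch that shmctl() converges on once this and the following patches are applied looks like the sketch below (the real code is in the subsequent diffs):

	SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf)
	{
		...
		switch (cmd) {
		case IPC_INFO:
		case SHM_INFO:
		case SHM_STAT:
		case IPC_STAT:
			/* read-only: no ipc spinlock needed */
			return shmctl_nolock(ns, shmid, cmd, version, buf);
		case IPC_RMID:
		case IPC_SET:
			/* updates: rw_mutex writer plus the ipc lock */
			return shmctl_down(ns, shmid, cmd, buf, version);
		case SHM_LOCK:
		case SHM_UNLOCK:
			/* handled inline: rcu lookup and checks, then the ipc lock */
			...
		}
	}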
Signed-off-by: Davidlohr Bueso Tested-by: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 61 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 20 deletions(-) diff --git a/ipc/shm.c b/ipc/shm.c index 22cffd78dbb1..3e123987f054 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -820,28 +820,23 @@ out_up: return err; } -SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) +static int shmctl_nolock(struct ipc_namespace *ns, int shmid, + int cmd, int version, void __user *buf) { + int err; struct shmid_kernel *shp; - int err, version; - struct ipc_namespace *ns; - - if (cmd < 0 || shmid < 0) { - err = -EINVAL; - goto out; - } - - version = ipc_parse_version(&cmd); - ns = current->nsproxy->ipc_ns; - - switch (cmd) { /* replace with proc interface ? */ - case IPC_INFO: - { - struct shminfo64 shminfo; + /* preliminary security checks for *_INFO */ + if (cmd == IPC_INFO || cmd == SHM_INFO) { err = security_shm_shmctl(NULL, cmd); if (err) return err; + } + + switch (cmd) { + case IPC_INFO: + { + struct shminfo64 shminfo; memset(&shminfo, 0, sizeof(shminfo)); shminfo.shmmni = shminfo.shmseg = ns->shm_ctlmni; @@ -864,10 +859,6 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) { struct shm_info shm_info; - err = security_shm_shmctl(NULL, cmd); - if (err) - return err; - memset(&shm_info, 0, sizeof(shm_info)); down_read(&shm_ids(ns).rw_mutex); shm_info.used_ids = shm_ids(ns).in_use; @@ -928,6 +919,36 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) err = result; goto out; } + default: + return -EINVAL; + } + +out_unlock: + shm_unlock(shp); +out: + return err; +} + +SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) +{ + struct shmid_kernel *shp; + int err, version; + struct ipc_namespace *ns; + + if (cmd < 0 || shmid < 0) { + err = -EINVAL; + goto out; + } + + version = ipc_parse_version(&cmd); + ns = current->nsproxy->ipc_ns; + + switch (cmd) { + case IPC_INFO: + case SHM_INFO: + case SHM_STAT: + case IPC_STAT: + return shmctl_nolock(ns, shmid, cmd, version, buf); case SHM_LOCK: case SHM_UNLOCK: { From c97cb9ccab8c85428ec21eff690642ad2ce1fa8a Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:20 -0700 Subject: [PATCH 292/303] ipc,shm: make shmctl_nolock lockless While the INFO cmd doesn't take the ipc lock, the STAT commands do acquire it unnecessarily. We can do the permissions and security checks only holding the rcu lock. 
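The key to the conversion, condensed from the diff below: the relevant fields are snapshotted into a local shmid64_ds while under rcu, and the possibly-sleeping copy_to_user() only runs after rcu_read_unlock() (a sketch with error paths trimmed):

	rcu_read_lock();
	shp = shm_obtain_object_check(ns, shmid);	/* no spinlock taken */

	/* permission and security checks only read the object */
	err = -EACCES;
	if (ipcperms(ns, &shp->shm_perm, S_IRUGO))
		goto out_unlock;

	memset(&tbuf, 0, sizeof(tbuf));
	kernel_to_ipc64_perm(&shp->shm_perm, &tbuf.shm_perm);	/* snapshot */
	/* ... the remaining shm_* fields are copied the same way ... */
	rcu_read_unlock();

	/* copy_to_user() may sleep, so it happens outside the rcu section */
	if (copy_shmid_to_user(buf, &tbuf, version))
		err = -EFAULT;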
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Davidlohr Bueso Tested-by: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/ipc/shm.c b/ipc/shm.c index 3e123987f054..a493639550d9 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -882,27 +882,31 @@ static int shmctl_nolock(struct ipc_namespace *ns, int shmid, struct shmid64_ds tbuf; int result; + rcu_read_lock(); if (cmd == SHM_STAT) { - shp = shm_lock(ns, shmid); + shp = shm_obtain_object(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); - goto out; + goto out_unlock; } result = shp->shm_perm.id; } else { - shp = shm_lock_check(ns, shmid); + shp = shm_obtain_object_check(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); - goto out; + goto out_unlock; } result = 0; } + err = -EACCES; if (ipcperms(ns, &shp->shm_perm, S_IRUGO)) goto out_unlock; + err = security_shm_shmctl(shp, cmd); if (err) goto out_unlock; + memset(&tbuf, 0, sizeof(tbuf)); kernel_to_ipc64_perm(&shp->shm_perm, &tbuf.shm_perm); tbuf.shm_segsz = shp->shm_segsz; @@ -912,8 +916,9 @@ static int shmctl_nolock(struct ipc_namespace *ns, int shmid, tbuf.shm_cpid = shp->shm_cprid; tbuf.shm_lpid = shp->shm_lprid; tbuf.shm_nattch = shp->shm_nattch; - shm_unlock(shp); - if(copy_shmid_to_user (buf, &tbuf, version)) + rcu_read_unlock(); + + if (copy_shmid_to_user(buf, &tbuf, version)) err = -EFAULT; else err = result; @@ -924,7 +929,7 @@ static int shmctl_nolock(struct ipc_namespace *ns, int shmid, } out_unlock: - shm_unlock(shp); + rcu_read_unlock(); out: return err; } From 2caacaa82a51b78fc0c800e206473874094287ed Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:21 -0700 Subject: [PATCH 293/303] ipc,shm: shorten critical region for shmctl With the *_INFO, *_STAT, IPC_RMID and IPC_SET commands already optimized, deal with the remaining SHM_LOCK and SHM_UNLOCK commands. Take the shm_perm lock after doing the initial auditing and security checks. The rest of the logic remains unchanged. 
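Abridged, the new SHM_LOCK/SHM_UNLOCK ordering is (a sketch of the diff below, error paths trimmed):

	rcu_read_lock();
	shp = shm_obtain_object_check(ns, shmid);

	audit_ipc_obj(&shp->shm_perm);
	err = security_shm_shmctl(shp, cmd);	/* potentially expensive; no spinlock held */

	ipc_lock_object(&shp->shm_perm);	/* only now take kern_ipc_perm.lock */
	/* ... permission re-checks and SHM_LOCK/SHM_UNLOCK state updates ... */
	ipc_unlock_object(&shp->shm_perm);
	rcu_read_unlock();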
Signed-off-by: Davidlohr Bueso Tested-by: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 49 +++++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/ipc/shm.c b/ipc/shm.c index a493639550d9..8ec381085dec 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -940,10 +940,8 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) int err, version; struct ipc_namespace *ns; - if (cmd < 0 || shmid < 0) { - err = -EINVAL; - goto out; - } + if (cmd < 0 || shmid < 0) + return -EINVAL; version = ipc_parse_version(&cmd); ns = current->nsproxy->ipc_ns; @@ -954,36 +952,40 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) case SHM_STAT: case IPC_STAT: return shmctl_nolock(ns, shmid, cmd, version, buf); + case IPC_RMID: + case IPC_SET: + return shmctl_down(ns, shmid, cmd, buf, version); case SHM_LOCK: case SHM_UNLOCK: { struct file *shm_file; - shp = shm_lock_check(ns, shmid); + rcu_read_lock(); + shp = shm_obtain_object_check(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); - goto out; + goto out_unlock1; } audit_ipc_obj(&(shp->shm_perm)); + err = security_shm_shmctl(shp, cmd); + if (err) + goto out_unlock1; + ipc_lock_object(&shp->shm_perm); if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) { kuid_t euid = current_euid(); err = -EPERM; if (!uid_eq(euid, shp->shm_perm.uid) && !uid_eq(euid, shp->shm_perm.cuid)) - goto out_unlock; + goto out_unlock0; if (cmd == SHM_LOCK && !rlimit(RLIMIT_MEMLOCK)) - goto out_unlock; + goto out_unlock0; } - err = security_shm_shmctl(shp, cmd); - if (err) - goto out_unlock; - shm_file = shp->shm_file; if (is_file_hugepages(shm_file)) - goto out_unlock; + goto out_unlock0; if (cmd == SHM_LOCK) { struct user_struct *user = current_user(); @@ -992,32 +994,31 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) shp->shm_perm.mode |= SHM_LOCKED; shp->mlock_user = user; } - goto out_unlock; + goto out_unlock0; } /* SHM_UNLOCK */ if (!(shp->shm_perm.mode & SHM_LOCKED)) - goto out_unlock; + goto out_unlock0; shmem_lock(shm_file, 0, shp->mlock_user); shp->shm_perm.mode &= ~SHM_LOCKED; shp->mlock_user = NULL; get_file(shm_file); - shm_unlock(shp); + ipc_unlock_object(&shp->shm_perm); + rcu_read_unlock(); shmem_unlock_mapping(shm_file->f_mapping); + fput(shm_file); - goto out; - } - case IPC_RMID: - case IPC_SET: - err = shmctl_down(ns, shmid, cmd, buf, version); return err; + } default: return -EINVAL; } -out_unlock: - shm_unlock(shp); -out: +out_unlock0: + ipc_unlock_object(&shp->shm_perm); +out_unlock1: + rcu_read_unlock(); return err; } From f42569b1388b1408b574a5e93a23a663647d4181 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:22 -0700 Subject: [PATCH 294/303] ipc,shm: cleanup do_shmat pasta Clean up some of the messy do_shmat() spaghetti code, getting rid of out_free and out_put_dentry labels. This makes shortening the critical region of this function in the next patch a little easier to do and read. 
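Reduced to its essence, the cleanup turns shared tail labels into failure sites that release exactly what they themselves acquired (an illustrative reduction of the hunks below):

	/* before: failure paths funnel through chained cleanup labels */
	if (!sfd)
		goto out_put_dentry;
	...
out_free:
	kfree(sfd);
out_put_dentry:
	path_put(&path);
	goto out_nattch;

	/* after: each failure site cleans up inline and jumps straight out */
	if (!sfd) {
		path_put(&path);
		goto out_nattch;
	}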
Signed-off-by: Davidlohr Bueso Tested-by: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/ipc/shm.c b/ipc/shm.c index 8ec381085dec..115dccebc63e 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -1108,16 +1108,21 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, err = -ENOMEM; sfd = kzalloc(sizeof(*sfd), GFP_KERNEL); - if (!sfd) - goto out_put_dentry; + if (!sfd) { + path_put(&path); + goto out_nattch; + } file = alloc_file(&path, f_mode, is_file_hugepages(shp->shm_file) ? &shm_file_operations_huge : &shm_file_operations); err = PTR_ERR(file); - if (IS_ERR(file)) - goto out_free; + if (IS_ERR(file)) { + kfree(sfd); + path_put(&path); + goto out_nattch; + } file->private_data = sfd; file->f_mapping = shp->shm_file->f_mapping; @@ -1143,7 +1148,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, addr > current->mm->start_stack - size - PAGE_SIZE * 5) goto invalid; } - + addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate); *raddr = addr; err = 0; @@ -1167,19 +1172,12 @@ out_nattch: else shm_unlock(shp); up_write(&shm_ids(ns).rw_mutex); - -out: return err; out_unlock: shm_unlock(shp); - goto out; - -out_free: - kfree(sfd); -out_put_dentry: - path_put(&path); - goto out_nattch; +out: return err; } SYSCALL_DEFINE3(shmat, int, shmid, char __user *, shmaddr, int, shmflg) From c2c737a0461e61a34676bd0bd1bc1a70a1b4e396 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:23 -0700 Subject: [PATCH 295/303] ipc,shm: shorten critical region for shmat Similar to other system calls, acquire the kern_ipc_perm lock after doing the initial permission and security checks. [sasha.levin@oracle.com: don't leave do_shmat with rcu lock held] Signed-off-by: Davidlohr Bueso Tested-by: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/ipc/shm.c b/ipc/shm.c index 115dccebc63e..28d19f4ece4b 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -19,6 +19,9 @@ * namespaces support * OpenVZ, SWsoft Inc. * Pavel Emelianov + * + * Better ipc lock (kern_ipc_perm.lock) handling + * Davidlohr Bueso , June 2013. */ #include @@ -1086,10 +1089,11 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, * additional creator id...
*/ ns = current->nsproxy->ipc_ns; - shp = shm_lock_check(ns, shmid); + rcu_read_lock(); + shp = shm_obtain_object_check(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); - goto out; + goto out_unlock; } err = -EACCES; @@ -1100,11 +1104,13 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, if (err) goto out_unlock; + ipc_lock_object(&shp->shm_perm); path = shp->shm_file->f_path; path_get(&path); shp->shm_nattch++; size = i_size_read(path.dentry->d_inode); - shm_unlock(shp); + ipc_unlock_object(&shp->shm_perm); + rcu_read_unlock(); err = -ENOMEM; sfd = kzalloc(sizeof(*sfd), GFP_KERNEL); @@ -1175,7 +1181,7 @@ out_nattch: return err; out_unlock: - shm_unlock(shp); + rcu_read_unlock(); out: return err; } From d9a605e40b1376eb02b067d7690580255a0df68f Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:24 -0700 Subject: [PATCH 296/303] ipc: rename ids->rw_mutex Since in some situations the lock can be shared for readers, we shouldn't be calling it a mutex, rename it to rwsem. Signed-off-by: Davidlohr Bueso Tested-by: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 2 +- ipc/msg.c | 20 ++++++------- ipc/namespace.c | 4 +-- ipc/sem.c | 24 +++++++-------- ipc/shm.c | 56 +++++++++++++++++------------------ ipc/util.c | 28 +++++++++--------- ipc/util.h | 4 +-- 7 files changed, 69 insertions(+), 69 deletions(-) diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index c4d870b0d5e6..19c19a5eee29 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -22,7 +22,7 @@ struct ipc_ids { int in_use; unsigned short seq; unsigned short seq_max; - struct rw_semaphore rw_mutex; + struct rw_semaphore rwsem; struct idr ipcs_idr; int next_id; }; diff --git a/ipc/msg.c b/ipc/msg.c index b65fdf1a09dd..8203e71bcfbc 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -172,7 +172,7 @@ static inline void msg_rmid(struct ipc_namespace *ns, struct msg_queue *s) * @ns: namespace * @params: ptr to the structure that contains the key and msgflg * - * Called with msg_ids.rw_mutex held (writer) + * Called with msg_ids.rwsem held (writer) */ static int newque(struct ipc_namespace *ns, struct ipc_params *params) { @@ -259,8 +259,8 @@ static void expunge_all(struct msg_queue *msq, int res) * removes the message queue from message queue ID IDR, and cleans up all the * messages associated with this queue. * - * msg_ids.rw_mutex (writer) and the spinlock for this message queue are held - * before freeque() is called. msg_ids.rw_mutex remains locked on exit. + * msg_ids.rwsem (writer) and the spinlock for this message queue are held + * before freeque() is called. msg_ids.rwsem remains locked on exit. */ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) { @@ -282,7 +282,7 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) } /* - * Called with msg_ids.rw_mutex and ipcp locked. + * Called with msg_ids.rwsem and ipcp locked. */ static inline int msg_security(struct kern_ipc_perm *ipcp, int msgflg) { @@ -386,9 +386,9 @@ copy_msqid_from_user(struct msqid64_ds *out, void __user *buf, int version) } /* - * This function handles some msgctl commands which require the rw_mutex + * This function handles some msgctl commands which require the rwsem * to be held in write mode. - * NOTE: no locks must be held, the rw_mutex is taken inside this function. 
+ * NOTE: no locks must be held, the rwsem is taken inside this function. */ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, struct msqid_ds __user *buf, int version) @@ -403,7 +403,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, return -EFAULT; } - down_write(&msg_ids(ns).rw_mutex); + down_write(&msg_ids(ns).rwsem); rcu_read_lock(); ipcp = ipcctl_pre_down_nolock(ns, &msg_ids(ns), msqid, cmd, @@ -459,7 +459,7 @@ out_unlock0: out_unlock1: rcu_read_unlock(); out_up: - up_write(&msg_ids(ns).rw_mutex); + up_write(&msg_ids(ns).rwsem); return err; } @@ -494,7 +494,7 @@ static int msgctl_nolock(struct ipc_namespace *ns, int msqid, msginfo.msgmnb = ns->msg_ctlmnb; msginfo.msgssz = MSGSSZ; msginfo.msgseg = MSGSEG; - down_read(&msg_ids(ns).rw_mutex); + down_read(&msg_ids(ns).rwsem); if (cmd == MSG_INFO) { msginfo.msgpool = msg_ids(ns).in_use; msginfo.msgmap = atomic_read(&ns->msg_hdrs); @@ -505,7 +505,7 @@ static int msgctl_nolock(struct ipc_namespace *ns, int msqid, msginfo.msgtql = MSGTQL; } max_id = ipc_get_maxid(&msg_ids(ns)); - up_read(&msg_ids(ns).rw_mutex); + up_read(&msg_ids(ns).rwsem); if (copy_to_user(buf, &msginfo, sizeof(struct msginfo))) return -EFAULT; return (max_id < 0) ? 0 : max_id; diff --git a/ipc/namespace.c b/ipc/namespace.c index 4be6581d3b7f..d43d9384bb2d 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -81,7 +81,7 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, int next_id; int total, in_use; - down_write(&ids->rw_mutex); + down_write(&ids->rwsem); in_use = ids->in_use; @@ -93,7 +93,7 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, free(ns, perm); total++; } - up_write(&ids->rw_mutex); + up_write(&ids->rwsem); } static void free_ipc_ns(struct ipc_namespace *ns) diff --git a/ipc/sem.c b/ipc/sem.c index 41088899783d..69b6a21f3844 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -322,7 +322,7 @@ static inline void sem_unlock(struct sem_array *sma, int locknum) } /* - * sem_lock_(check_) routines are called in the paths where the rw_mutex + * sem_lock_(check_) routines are called in the paths where the rwsem * is not held. * * The caller holds the RCU read lock. @@ -426,7 +426,7 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s) * @ns: namespace * @params: ptr to the structure that contains key, semflg and nsems * - * Called with sem_ids.rw_mutex held (as a writer) + * Called with sem_ids.rwsem held (as a writer) */ static int newary(struct ipc_namespace *ns, struct ipc_params *params) @@ -492,7 +492,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params) /* - * Called with sem_ids.rw_mutex and ipcp locked. + * Called with sem_ids.rwsem and ipcp locked. */ static inline int sem_security(struct kern_ipc_perm *ipcp, int semflg) { @@ -503,7 +503,7 @@ static inline int sem_security(struct kern_ipc_perm *ipcp, int semflg) } /* - * Called with sem_ids.rw_mutex and ipcp locked. + * Called with sem_ids.rwsem and ipcp locked. */ static inline int sem_more_checks(struct kern_ipc_perm *ipcp, struct ipc_params *params) @@ -994,8 +994,8 @@ static int count_semzcnt (struct sem_array * sma, ushort semnum) return semzcnt; } -/* Free a semaphore set. freeary() is called with sem_ids.rw_mutex locked - * as a writer and the spinlock for this semaphore set hold. sem_ids.rw_mutex +/* Free a semaphore set. freeary() is called with sem_ids.rwsem locked + * as a writer and the spinlock for this semaphore set hold. sem_ids.rwsem * remains locked on exit. 
*/ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) @@ -1116,7 +1116,7 @@ static int semctl_nolock(struct ipc_namespace *ns, int semid, seminfo.semmnu = SEMMNU; seminfo.semmap = SEMMAP; seminfo.semume = SEMUME; - down_read(&sem_ids(ns).rw_mutex); + down_read(&sem_ids(ns).rwsem); if (cmd == SEM_INFO) { seminfo.semusz = sem_ids(ns).in_use; seminfo.semaem = ns->used_sems; @@ -1125,7 +1125,7 @@ static int semctl_nolock(struct ipc_namespace *ns, int semid, seminfo.semaem = SEMAEM; } max_id = ipc_get_maxid(&sem_ids(ns)); - up_read(&sem_ids(ns).rw_mutex); + up_read(&sem_ids(ns).rwsem); if (copy_to_user(p, &seminfo, sizeof(struct seminfo))) return -EFAULT; return (max_id < 0) ? 0: max_id; @@ -1431,9 +1431,9 @@ copy_semid_from_user(struct semid64_ds *out, void __user *buf, int version) } /* - * This function handles some semctl commands which require the rw_mutex + * This function handles some semctl commands which require the rwsem * to be held in write mode. - * NOTE: no locks must be held, the rw_mutex is taken inside this function. + * NOTE: no locks must be held, the rwsem is taken inside this function. */ static int semctl_down(struct ipc_namespace *ns, int semid, int cmd, int version, void __user *p) @@ -1448,7 +1448,7 @@ static int semctl_down(struct ipc_namespace *ns, int semid, return -EFAULT; } - down_write(&sem_ids(ns).rw_mutex); + down_write(&sem_ids(ns).rwsem); rcu_read_lock(); ipcp = ipcctl_pre_down_nolock(ns, &sem_ids(ns), semid, cmd, @@ -1487,7 +1487,7 @@ out_unlock0: out_unlock1: rcu_read_unlock(); out_up: - up_write(&sem_ids(ns).rw_mutex); + up_write(&sem_ids(ns).rwsem); return err; } diff --git a/ipc/shm.c b/ipc/shm.c index 28d19f4ece4b..cb2cedaa8808 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -83,8 +83,8 @@ void shm_init_ns(struct ipc_namespace *ns) } /* - * Called with shm_ids.rw_mutex (writer) and the shp structure locked. - * Only shm_ids.rw_mutex remains locked on exit. + * Called with shm_ids.rwsem (writer) and the shp structure locked. + * Only shm_ids.rwsem remains locked on exit. */ static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) { @@ -148,7 +148,7 @@ static inline struct shmid_kernel *shm_obtain_object_check(struct ipc_namespace } /* - * shm_lock_(check_) routines are called in the paths where the rw_mutex + * shm_lock_(check_) routines are called in the paths where the rwsem * is not necessarily held. */ static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id) @@ -205,7 +205,7 @@ static void shm_open(struct vm_area_struct *vma) * @ns: namespace * @shp: struct to free * - * It has to be called with shp and shm_ids.rw_mutex (writer) locked, + * It has to be called with shp and shm_ids.rwsem (writer) locked, * but returns with shp unlocked and freed. 
*/ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) @@ -253,7 +253,7 @@ static void shm_close(struct vm_area_struct *vma) struct shmid_kernel *shp; struct ipc_namespace *ns = sfd->ns; - down_write(&shm_ids(ns).rw_mutex); + down_write(&shm_ids(ns).rwsem); /* remove from the list of attaches of the shm segment */ shp = shm_lock(ns, sfd->id); BUG_ON(IS_ERR(shp)); @@ -264,10 +264,10 @@ static void shm_close(struct vm_area_struct *vma) shm_destroy(ns, shp); else shm_unlock(shp); - up_write(&shm_ids(ns).rw_mutex); + up_write(&shm_ids(ns).rwsem); } -/* Called with ns->shm_ids(ns).rw_mutex locked */ +/* Called with ns->shm_ids(ns).rwsem locked */ static int shm_try_destroy_current(int id, void *p, void *data) { struct ipc_namespace *ns = data; @@ -298,7 +298,7 @@ static int shm_try_destroy_current(int id, void *p, void *data) return 0; } -/* Called with ns->shm_ids(ns).rw_mutex locked */ +/* Called with ns->shm_ids(ns).rwsem locked */ static int shm_try_destroy_orphaned(int id, void *p, void *data) { struct ipc_namespace *ns = data; @@ -309,7 +309,7 @@ static int shm_try_destroy_orphaned(int id, void *p, void *data) * We want to destroy segments without users and with already * exit'ed originating process. * - * As shp->* are changed under rw_mutex, it's safe to skip shp locking. + * As shp->* are changed under rwsem, it's safe to skip shp locking. */ if (shp->shm_creator != NULL) return 0; @@ -323,10 +323,10 @@ static int shm_try_destroy_orphaned(int id, void *p, void *data) void shm_destroy_orphaned(struct ipc_namespace *ns) { - down_write(&shm_ids(ns).rw_mutex); + down_write(&shm_ids(ns).rwsem); if (shm_ids(ns).in_use) idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns); - up_write(&shm_ids(ns).rw_mutex); + up_write(&shm_ids(ns).rwsem); } @@ -338,10 +338,10 @@ void exit_shm(struct task_struct *task) return; /* Destroy all already created segments, but not mapped yet */ - down_write(&shm_ids(ns).rw_mutex); + down_write(&shm_ids(ns).rwsem); if (shm_ids(ns).in_use) idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_current, ns); - up_write(&shm_ids(ns).rw_mutex); + up_write(&shm_ids(ns).rwsem); } static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) @@ -475,7 +475,7 @@ static const struct vm_operations_struct shm_vm_ops = { * @ns: namespace * @params: ptr to the structure that contains key, size and shmflg * - * Called with shm_ids.rw_mutex held as a writer. + * Called with shm_ids.rwsem held as a writer. */ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) @@ -583,7 +583,7 @@ no_file: } /* - * Called with shm_ids.rw_mutex and ipcp locked. + * Called with shm_ids.rwsem and ipcp locked. */ static inline int shm_security(struct kern_ipc_perm *ipcp, int shmflg) { @@ -594,7 +594,7 @@ static inline int shm_security(struct kern_ipc_perm *ipcp, int shmflg) } /* - * Called with shm_ids.rw_mutex and ipcp locked. + * Called with shm_ids.rwsem and ipcp locked. */ static inline int shm_more_checks(struct kern_ipc_perm *ipcp, struct ipc_params *params) @@ -707,7 +707,7 @@ static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminf /* * Calculate and add used RSS and swap pages of a shm. 
- * Called with shm_ids.rw_mutex held as a reader + * Called with shm_ids.rwsem held as a reader */ static void shm_add_rss_swap(struct shmid_kernel *shp, unsigned long *rss_add, unsigned long *swp_add) @@ -734,7 +734,7 @@ static void shm_add_rss_swap(struct shmid_kernel *shp, } /* - * Called with shm_ids.rw_mutex held as a reader + * Called with shm_ids.rwsem held as a reader */ static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss, unsigned long *swp) @@ -763,9 +763,9 @@ static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss, } /* - * This function handles some shmctl commands which require the rw_mutex + * This function handles some shmctl commands which require the rwsem * to be held in write mode. - * NOTE: no locks must be held, the rw_mutex is taken inside this function. + * NOTE: no locks must be held, the rwsem is taken inside this function. */ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, struct shmid_ds __user *buf, int version) @@ -780,7 +780,7 @@ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, return -EFAULT; } - down_write(&shm_ids(ns).rw_mutex); + down_write(&shm_ids(ns).rwsem); rcu_read_lock(); ipcp = ipcctl_pre_down_nolock(ns, &shm_ids(ns), shmid, cmd, @@ -819,7 +819,7 @@ out_unlock0: out_unlock1: rcu_read_unlock(); out_up: - up_write(&shm_ids(ns).rw_mutex); + up_write(&shm_ids(ns).rwsem); return err; } @@ -850,9 +850,9 @@ static int shmctl_nolock(struct ipc_namespace *ns, int shmid, if(copy_shminfo_to_user (buf, &shminfo, version)) return -EFAULT; - down_read(&shm_ids(ns).rw_mutex); + down_read(&shm_ids(ns).rwsem); err = ipc_get_maxid(&shm_ids(ns)); - up_read(&shm_ids(ns).rw_mutex); + up_read(&shm_ids(ns).rwsem); if(err<0) err = 0; @@ -863,14 +863,14 @@ static int shmctl_nolock(struct ipc_namespace *ns, int shmid, struct shm_info shm_info; memset(&shm_info, 0, sizeof(shm_info)); - down_read(&shm_ids(ns).rw_mutex); + down_read(&shm_ids(ns).rwsem); shm_info.used_ids = shm_ids(ns).in_use; shm_get_stat (ns, &shm_info.shm_rss, &shm_info.shm_swp); shm_info.shm_tot = ns->shm_tot; shm_info.swap_attempts = 0; shm_info.swap_successes = 0; err = ipc_get_maxid(&shm_ids(ns)); - up_read(&shm_ids(ns).rw_mutex); + up_read(&shm_ids(ns).rwsem); if (copy_to_user(buf, &shm_info, sizeof(shm_info))) { err = -EFAULT; goto out; @@ -1169,7 +1169,7 @@ out_fput: fput(file); out_nattch: - down_write(&shm_ids(ns).rw_mutex); + down_write(&shm_ids(ns).rwsem); shp = shm_lock(ns, shmid); BUG_ON(IS_ERR(shp)); shp->shm_nattch--; @@ -1177,7 +1177,7 @@ out_nattch: shm_destroy(ns, shp); else shm_unlock(shp); - up_write(&shm_ids(ns).rw_mutex); + up_write(&shm_ids(ns).rwsem); return err; out_unlock: diff --git a/ipc/util.c b/ipc/util.c index 2c8a93b380ba..9a1d779a20e2 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -119,7 +119,7 @@ __initcall(ipc_init); void ipc_init_ids(struct ipc_ids *ids) { - init_rwsem(&ids->rw_mutex); + init_rwsem(&ids->rwsem); ids->in_use = 0; ids->seq = 0; @@ -174,7 +174,7 @@ void __init ipc_init_proc_interface(const char *path, const char *header, * @ids: Identifier set * @key: The key to find * - * Requires ipc_ids.rw_mutex locked. + * Requires ipc_ids.rwsem locked. * Returns the LOCKED pointer to the ipc structure if found or NULL * if not. * If key is found ipc points to the owning ipc structure @@ -208,7 +208,7 @@ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key) * ipc_get_maxid - get the last assigned id * @ids: IPC identifier set * - * Called with ipc_ids.rw_mutex held. 
+ * Called with ipc_ids.rwsem held. */ int ipc_get_maxid(struct ipc_ids *ids) @@ -246,7 +246,7 @@ int ipc_get_maxid(struct ipc_ids *ids) * is returned. The 'new' entry is returned in a locked state on success. * On failure the entry is not locked and a negative err-code is returned. * - * Called with writer ipc_ids.rw_mutex held. + * Called with writer ipc_ids.rwsem held. */ int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size) { @@ -312,9 +312,9 @@ static int ipcget_new(struct ipc_namespace *ns, struct ipc_ids *ids, { int err; - down_write(&ids->rw_mutex); + down_write(&ids->rwsem); err = ops->getnew(ns, params); - up_write(&ids->rw_mutex); + up_write(&ids->rwsem); return err; } @@ -331,7 +331,7 @@ static int ipcget_new(struct ipc_namespace *ns, struct ipc_ids *ids, * * On success, the IPC id is returned. * - * It is called with ipc_ids.rw_mutex and ipcp->lock held. + * It is called with ipc_ids.rwsem and ipcp->lock held. */ static int ipc_check_perms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, @@ -376,7 +376,7 @@ static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids, * Take the lock as a writer since we are potentially going to add * a new entry + read locks are not "upgradable" */ - down_write(&ids->rw_mutex); + down_write(&ids->rwsem); ipcp = ipc_findkey(ids, params->key); if (ipcp == NULL) { /* key not used */ @@ -402,7 +402,7 @@ static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids, } ipc_unlock(ipcp); } - up_write(&ids->rw_mutex); + up_write(&ids->rwsem); return err; } @@ -413,7 +413,7 @@ static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids, * @ids: IPC identifier set * @ipcp: ipc perm structure containing the identifier to remove * - * ipc_ids.rw_mutex (as a writer) and the spinlock for this ID are held + * ipc_ids.rwsem (as a writer) and the spinlock for this ID are held * before this function is called, and remain locked on the exit. */ @@ -621,7 +621,7 @@ struct kern_ipc_perm *ipc_obtain_object(struct ipc_ids *ids, int id) } /** - * ipc_lock - Lock an ipc structure without rw_mutex held + * ipc_lock - Lock an ipc structure without rwsem held * @ids: IPC identifier set * @id: ipc id to look for * @@ -748,7 +748,7 @@ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out) * - performs some audit and permission check, depending on the given cmd * - returns a pointer to the ipc object or otherwise, the corresponding error. * - * Call holding the both the rw_mutex and the rcu read lock. + * Call holding the both the rwsem and the rcu read lock. */ struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns, struct ipc_ids *ids, int id, int cmd, @@ -868,7 +868,7 @@ static void *sysvipc_proc_start(struct seq_file *s, loff_t *pos) * Take the lock - this will be released by the corresponding * call to stop(). 
*/ - down_read(&ids->rw_mutex); + down_read(&ids->rwsem); /* pos < 0 is invalid */ if (*pos < 0) @@ -895,7 +895,7 @@ static void sysvipc_proc_stop(struct seq_file *s, void *it) ids = &iter->ns->ids[iface->ids]; /* Release the lock we took in start() */ - up_read(&ids->rw_mutex); + up_read(&ids->rwsem); } static int sysvipc_proc_show(struct seq_file *s, void *it) diff --git a/ipc/util.h b/ipc/util.h index 41a6c4d26399..0a362ffca972 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -94,10 +94,10 @@ void __init ipc_init_proc_interface(const char *path, const char *header, #define ipcid_to_idx(id) ((id) % SEQ_MULTIPLIER) #define ipcid_to_seqx(id) ((id) / SEQ_MULTIPLIER) -/* must be called with ids->rw_mutex acquired for writing */ +/* must be called with ids->rwsem acquired for writing */ int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int); -/* must be called with ids->rw_mutex acquired for reading */ +/* must be called with ids->rwsem acquired for reading */ int ipc_get_maxid(struct ipc_ids *); /* must be called with both locks acquired. */ From 4718787d1f626f45ddb239912bc07266b9880044 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:25 -0700 Subject: [PATCH 297/303] ipc,msg: drop msg_unlock There is only one user left, drop this function and just call ipc_unlock_object() and rcu_read_unlock(). Signed-off-by: Davidlohr Bueso Tested-by: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/msg.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ipc/msg.c b/ipc/msg.c index 8203e71bcfbc..b0d541d42677 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -70,8 +70,6 @@ struct msg_sender { #define msg_ids(ns) ((ns)->ids[IPC_MSG_IDS]) -#define msg_unlock(msq) ipc_unlock(&(msq)->q_perm) - static void freeque(struct ipc_namespace *, struct kern_ipc_perm *); static int newque(struct ipc_namespace *, struct ipc_params *); #ifdef CONFIG_PROC_FS @@ -270,7 +268,8 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) expunge_all(msq, -EIDRM); ss_wakeup(&msq->q_senders, 1); msg_rmid(ns, msq); - msg_unlock(msq); + ipc_unlock_object(&msq->q_perm); + rcu_read_unlock(); list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) { atomic_dec(&ns->msg_hdrs); From 05603c44a7627793219b0bd9a7b236099dc9cd9d Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:26 -0700 Subject: [PATCH 298/303] ipc: document general ipc locking scheme As suggested by Andrew, add a generic initial locking scheme used throughout all sysv ipc mechanisms. Documenting the ids rwsem, how rcu can be enough to do the initial checks and when to actually acquire the kern_ipc_perm.lock spinlock. I found that adding it to util.c was generic enough. Signed-off-by: Davidlohr Bueso Tested-by: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/util.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ipc/util.c b/ipc/util.c index 9a1d779a20e2..1ddadcf9a2ab 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -15,6 +15,14 @@ * Jun 2006 - namespaces ssupport * OpenVZ, SWsoft Inc. * Pavel Emelianov + * + * General sysv ipc locking scheme: + * when doing ipc id lookups, take the ids->rwsem + * rcu_read_lock() + * obtain the ipc object (kern_ipc_perm) + * perform security, capabilities, auditing and permission checks, etc. 
+ * acquire the ipc lock (kern_ipc_perm.lock) through ipc_lock_object() + * perform data updates (ie: SET, RMID, LOCK/UNLOCK commands) */ #include From 530fcd16d87cd2417c472a581ba5a1e501556c86 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:28 -0700 Subject: [PATCH 299/303] ipc, shm: guard against non-existent vma in shmdt(2) When !CONFIG_MMU there's a chance we can dereference a NULL pointer when the VM area isn't found - check the return value of find_vma(). Also, remove the redundant -EINVAL return: retval is set to the proper return code and *only* changed to 0 when we actually unmap the segments. Signed-off-by: Davidlohr Bueso Cc: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ipc/shm.c b/ipc/shm.c index cb2cedaa8808..a0ed957cefc9 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -1288,8 +1288,7 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) #else /* CONFIG_MMU */ /* under NOMMU conditions, the exact address to be destroyed must be * given */ - retval = -EINVAL; - if (vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) { + if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) { do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); retval = 0; } From 32a2750010981216fb788c5190fb0e646abfab30 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:29 -0700 Subject: [PATCH 300/303] ipc: drop ipc_lock_by_ptr After previous cleanups and optimizations, this function is no longer heavily used and we don't have a good reason to keep it. Update the few remaining callers and get rid of it. Signed-off-by: Davidlohr Bueso Cc: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/namespace.c | 3 ++- ipc/util.c | 6 ++++-- ipc/util.h | 6 ------ 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/ipc/namespace.c b/ipc/namespace.c index d43d9384bb2d..59451c1e214d 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -89,7 +89,8 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, perm = idr_find(&ids->ipcs_idr, next_id); if (perm == NULL) continue; - ipc_lock_by_ptr(perm); + rcu_read_lock(); + ipc_lock_object(perm); free(ns, perm); total++; } diff --git a/ipc/util.c b/ipc/util.c index 1ddadcf9a2ab..9f6aa30d2e0f 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -205,7 +205,8 @@ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key) continue; } - ipc_lock_by_ptr(ipc); + rcu_read_lock(); + ipc_lock_object(ipc); return ipc; } @@ -838,7 +839,8 @@ static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos, ipc = idr_find(&ids->ipcs_idr, pos); if (ipc != NULL) { *new_pos = pos + 1; - ipc_lock_by_ptr(ipc); + rcu_read_lock(); + ipc_lock_object(ipc); return ipc; } } diff --git a/ipc/util.h b/ipc/util.h index 0a362ffca972..14b0a2adba08 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -171,12 +171,6 @@ static inline void ipc_assert_locked_object(struct kern_ipc_perm *perm) assert_spin_locked(&perm->lock); } -static inline void ipc_lock_by_ptr(struct kern_ipc_perm *perm) -{ - rcu_read_lock(); - ipc_lock_object(perm); -} - static inline void ipc_unlock(struct kern_ipc_perm *perm) { ipc_unlock_object(perm); From 7a25dd9e042b2b94202a67e5551112f4ac87285a Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:30 -0700 Subject: [PATCH 301/303] ipc, shm: drop shm_lock_check This function was
replaced by the lockless shm_obtain_object_check(), and no longer has any users. Signed-off-by: Davidlohr Bueso Cc: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/ipc/shm.c b/ipc/shm.c index a0ed957cefc9..2821cdf93adb 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -167,17 +167,6 @@ static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp) ipc_lock_object(&ipcp->shm_perm); } -static inline struct shmid_kernel *shm_lock_check(struct ipc_namespace *ns, - int id) -{ - struct kern_ipc_perm *ipcp = ipc_lock_check(&shm_ids(ns), id); - - if (IS_ERR(ipcp)) - return (struct shmid_kernel *)ipcp; - - return container_of(ipcp, struct shmid_kernel, shm_perm); -} - static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s) { ipc_rmid(&shm_ids(ns), &s->shm_perm); From 20b8875abcf2daa1dda5cf70bd6369df5e85d4c1 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:31 -0700 Subject: [PATCH 302/303] ipc: drop ipc_lock_check No remaining users; we now use ipc_obtain_object_check(). Signed-off-by: Davidlohr Bueso Cc: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/util.c | 16 ---------------- ipc/util.h | 1 - 2 files changed, 17 deletions(-) diff --git a/ipc/util.c b/ipc/util.c index 9f6aa30d2e0f..e829da9ed01f 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -686,22 +686,6 @@ out: return out; } -struct kern_ipc_perm *ipc_lock_check(struct ipc_ids *ids, int id) -{ - struct kern_ipc_perm *out; - - out = ipc_lock(ids, id); - if (IS_ERR(out)) - return out; - - if (ipc_checkid(out, id)) { - ipc_unlock(out); - return ERR_PTR(-EIDRM); - } - - return out; -} - /** * ipcget - Common sys_*get() code * @ns : namsepace diff --git a/ipc/util.h b/ipc/util.h index 14b0a2adba08..c5f3338ba1fa 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -177,7 +177,6 @@ static inline void ipc_unlock(struct kern_ipc_perm *perm) rcu_read_unlock(); } -struct kern_ipc_perm *ipc_lock_check(struct ipc_ids *ids, int id); struct kern_ipc_perm *ipc_obtain_object_check(struct ipc_ids *ids, int id); int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids, struct ipc_ops *ops, struct ipc_params *params); From b34081f1cd59585451efaa69e1dff1b9507e6c89 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 11 Sep 2013 14:26:32 -0700 Subject: [PATCH 303/303] lz4: fix compression/decompression signedness mismatch LZ4 compression and decompression functions require input/output parameters that differ in signedness: unsigned char for compression and signed char for decompression. Change the decompression API to require "(const) unsigned char *". Signed-off-by: Sergey Senozhatsky Cc: Kyungsik Lee Cc: Geert Uytterhoeven Cc: Yann Collet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/lz4.h | 8 ++++---- lib/lz4/lz4_decompress.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/lz4.h b/include/linux/lz4.h index d21c13f10a64..4356686b0a39 100644 --- a/include/linux/lz4.h +++ b/include/linux/lz4.h @@ -67,8 +67,8 @@ int lz4hc_compress(const unsigned char *src, size_t src_len, * note : Destination buffer must be already allocated.
* slightly faster than lz4_decompress_unknownoutputsize() */ -int lz4_decompress(const char *src, size_t *src_len, char *dest, - size_t actual_dest_len); +int lz4_decompress(const unsigned char *src, size_t *src_len, + unsigned char *dest, size_t actual_dest_len); /* * lz4_decompress_unknownoutputsize() @@ -82,6 +82,6 @@ int lz4_decompress(const char *src, size_t *src_len, char *dest, * Error if return (< 0) * note : Destination buffer must be already allocated. */ -int lz4_decompress_unknownoutputsize(const char *src, size_t src_len, - char *dest, size_t *dest_len); +int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len, + unsigned char *dest, size_t *dest_len); #endif diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c index 411be80ddb46..df6839e3ce08 100644 --- a/lib/lz4/lz4_decompress.c +++ b/lib/lz4/lz4_decompress.c @@ -283,8 +283,8 @@ _output_error: return (int) (-(((char *) ip) - source)); } -int lz4_decompress(const char *src, size_t *src_len, char *dest, - size_t actual_dest_len) +int lz4_decompress(const unsigned char *src, size_t *src_len, + unsigned char *dest, size_t actual_dest_len) { int ret = -1; int input_len = 0; @@ -302,8 +302,8 @@ exit_0: EXPORT_SYMBOL(lz4_decompress); #endif -int lz4_decompress_unknownoutputsize(const char *src, size_t src_len, - char *dest, size_t *dest_len) +int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len, + unsigned char *dest, size_t *dest_len) { int ret = -1; int out_len = 0;
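With the prototypes above fixed, in-kernel callers can feed ordinary byte buffers to the decompressor without casts. A minimal sketch of a hypothetical user (example_unpack() and its parameter names are illustrative, not part of the kernel API):

	#include <linux/lz4.h>

	/* decompress comp[] into out[]; the original size out_len is known */
	static int example_unpack(const unsigned char *comp, size_t comp_len,
				  unsigned char *out, size_t out_len)
	{
		size_t src_len = comp_len;

		/* both buffers are unsigned char now, matching lz4_compress() */
		return lz4_decompress(comp, &src_len, out, out_len);
	}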