From 3b8967d713d7426e9dd107d065208b84adface91 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Wed, 11 Sep 2013 14:19:37 -0700
Subject: [PATCH 001/303] include/linux/smp.h:on_each_cpu(): switch back to a C function

Revert commit c846ef7deba2 ("include/linux/smp.h:on_each_cpu(): switch
back to a macro").  It turns out that the problematic linux/irqflags.h
include was fixed within ia64 and mn10300.

Cc: Geert Uytterhoeven
Cc: David Daney
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/smp.h | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/include/linux/smp.h b/include/linux/smp.h
index c181399f2c20..c8488763277f 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include

 extern void cpu_idle(void);

@@ -139,14 +140,17 @@ static inline int up_smp_call_function(smp_call_func_t func, void *info)
 }
 #define smp_call_function(func, info, wait) \
			(up_smp_call_function(func, info))
-#define on_each_cpu(func, info, wait) \
-	({ \
-		unsigned long __flags; \
-		local_irq_save(__flags); \
-		func(info); \
-		local_irq_restore(__flags); \
-		0; \
-	})
+
+static inline int on_each_cpu(smp_call_func_t func, void *info, int wait)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	func(info);
+	local_irq_restore(flags);
+	return 0;
+}
+
 /*
  * Note we still need to test the mask even for UP
  * because we actually can get an empty mask from

From e79f525e99b04390ca4d2366309545a836c03bf1 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Wed, 11 Sep 2013 14:19:38 -0700
Subject: [PATCH 002/303] pidns: fix vfork() after unshare(CLONE_NEWPID)

Commit 8382fcac1b81 ("pidns: Outlaw thread creation after
unshare(CLONE_NEWPID)") nacks CLONE_VM if the forking process unshared
pid_ns, which obviously breaks vfork:

	int main(void)
	{
		assert(unshare(CLONE_NEWUSER | CLONE_NEWPID) == 0);
		assert(vfork() >= 0);
		_exit(0);
		return 0;
	}

fails without this patch.

Change this check to use CLONE_SIGHAND instead.  This also forbids
CLONE_THREAD automatically, and this is what the comment implies.

We could probably even drop CLONE_SIGHAND and use CLONE_THREAD, but it
would be safer to not do this.  The current check denies CLONE_SIGHAND
implicitly and there is no reason to change this.

Eric said "CLONE_SIGHAND is fine.  CLONE_THREAD would be even better.
Having shared signal handling between two different pid namespaces is
the case that we are fundamentally guarding against."

Signed-off-by: Oleg Nesterov
Reported-by: Colin Walters
Acked-by: Andy Lutomirski
Reviewed-by: "Eric W. Biederman"
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/fork.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index c9eaf2013002..3561391ca450 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1173,10 +1173,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
		return ERR_PTR(-EINVAL);

	/*
-	 * If the new process will be in a different pid namespace
-	 * don't allow the creation of threads.
+	 * If the new process will be in a different pid namespace don't
+	 * allow it to share a thread group or signal handlers with the
+	 * forking task.
	 */
-	if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) &&
+	if ((clone_flags & (CLONE_SIGHAND | CLONE_NEWPID)) &&
	    (task_active_pid_ns(current) !=
	     current->nsproxy->pid_ns_for_children))
		return ERR_PTR(-EINVAL);

From 5167246a8ad617df55717c2d901da5e2aedffcfa Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Wed, 11 Sep 2013 14:19:40 -0700
Subject: [PATCH 003/303] pidns: kill the unnecessary CLONE_NEWPID in copy_process()

Commit 8382fcac1b81 ("pidns: Outlaw thread creation after
unshare(CLONE_NEWPID)") nacks CLONE_NEWPID if the forking process
unshared pid_ns.  This is correct but unnecessary: copy_pid_ns() does
the same check.

Remove the CLONE_NEWPID check to clean up the code and prepare for the
next change.

Test-case:

	static int child(void *arg)
	{
		return 0;
	}

	static char stack[16 * 1024];

	int main(void)
	{
		pid_t pid;

		assert(unshare(CLONE_NEWUSER | CLONE_NEWPID) == 0);
		pid = clone(child, stack + sizeof(stack) / 2,
			    CLONE_NEWPID | SIGCHLD, NULL);
		assert(pid < 0 && errno == EINVAL);
		return 0;
	}

clone(CLONE_NEWPID) correctly fails with or without this change.

Signed-off-by: Oleg Nesterov
Acked-by: Andy Lutomirski
Cc: "Eric W. Biederman"
Cc: Colin Walters
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/fork.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 3561391ca450..68d508f2bfba 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1177,9 +1177,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
	 * allow it to share a thread group or signal handlers with the
	 * forking task.
	 */
-	if ((clone_flags & (CLONE_SIGHAND | CLONE_NEWPID)) &&
-	    (task_active_pid_ns(current) !=
-	     current->nsproxy->pid_ns_for_children))
+	if ((clone_flags & CLONE_SIGHAND) && (task_active_pid_ns(current) !=
+			current->nsproxy->pid_ns_for_children))
		return ERR_PTR(-EINVAL);

	retval = security_task_create(clone_flags);

From 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Wed, 11 Sep 2013 14:19:41 -0700
Subject: [PATCH 004/303] fork: unify and tighten up CLONE_NEWUSER/CLONE_NEWPID checks

do_fork() denies CLONE_THREAD | CLONE_PARENT if NEWUSER | NEWPID.

Then later copy_process() denies CLONE_SIGHAND if the new process will
be in a different pid namespace (task_active_pid_ns() doesn't match
current->nsproxy->pid_ns).

This looks confusing and inconsistent.  CLONE_NEWPID is very similar to
the case when ->pid_ns was already unshared; we want the same
restrictions, so copy_process() should also nack CLONE_PARENT.  And it
would be better to deny CLONE_NEWUSER && CLONE_SIGHAND as well just for
consistency.

Kill the "CLONE_NEWUSER | CLONE_NEWPID" check in do_fork() and change
copy_process() to do the same check along with the ->pid_ns check we
already have.

Signed-off-by: Oleg Nesterov
Acked-by: Andy Lutomirski
Cc: "Eric W. Biederman"
Cc: Colin Walters
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 kernel/fork.c | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 68d508f2bfba..84703db06cf3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1173,13 +1173,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
		return ERR_PTR(-EINVAL);

	/*
-	 * If the new process will be in a different pid namespace don't
-	 * allow it to share a thread group or signal handlers with the
-	 * forking task.
+ * If the new process will be in a different pid or user namespace + * do not allow it to share a thread group or signal handlers or + * parent with the forking task. */ - if ((clone_flags & CLONE_SIGHAND) && (task_active_pid_ns(current) != - current->nsproxy->pid_ns_for_children)) - return ERR_PTR(-EINVAL); + if (clone_flags & (CLONE_SIGHAND | CLONE_PARENT)) { + if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || + (task_active_pid_ns(current) != + current->nsproxy->pid_ns_for_children)) + return ERR_PTR(-EINVAL); + } retval = security_task_create(clone_flags); if (retval) @@ -1575,15 +1578,6 @@ long do_fork(unsigned long clone_flags, int trace = 0; long nr; - /* - * Do some preliminary argument and permissions checking before we - * actually start allocating stuff - */ - if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) { - if (clone_flags & (CLONE_THREAD|CLONE_PARENT)) - return -EINVAL; - } - /* * Determine whether and which event to report to ptracer. When * called from kernel_thread or CLONE_UNTRACED is explicitly From ffd29195ed720188789a6c7ada5e212b9c59e1af Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Wed, 11 Sep 2013 14:19:42 -0700 Subject: [PATCH 005/303] drivers/video/acornfb.c: remove dead code acornfb checks for HAS_VIDC while support for that macro was removed in v2.6.23 (when the arm26 port was removed). So we can remove a bit of dead code. Signed-off-by: Paul Bolle Cc: Florian Tobias Schandinat Cc: Laurent Pinchart Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/acornfb.c | 266 +--------------------------------------- drivers/video/acornfb.h | 29 ----- 2 files changed, 1 insertion(+), 294 deletions(-) diff --git a/drivers/video/acornfb.c b/drivers/video/acornfb.c index 6488a7351a60..7e8346ec9cdc 100644 --- a/drivers/video/acornfb.c +++ b/drivers/video/acornfb.c @@ -37,14 +37,6 @@ #include "acornfb.h" -/* - * VIDC machines can't do 16 or 32BPP modes. - */ -#ifdef HAS_VIDC -#undef FBCON_HAS_CFB16 -#undef FBCON_HAS_CFB32 -#endif - /* * Default resolution. 
* NOTE that it has to be supported in the table towards @@ -106,238 +98,6 @@ static struct vidc_timing current_vidc; extern unsigned int vram_size; /* set by setup.c */ -#ifdef HAS_VIDC - -#define MAX_SIZE 480*1024 - -/* CTL VIDC Actual - * 24.000 0 8.000 - * 25.175 0 8.392 - * 36.000 0 12.000 - * 24.000 1 12.000 - * 25.175 1 12.588 - * 24.000 2 16.000 - * 25.175 2 16.783 - * 36.000 1 18.000 - * 24.000 3 24.000 - * 36.000 2 24.000 - * 25.175 3 25.175 - * 36.000 3 36.000 - */ -struct pixclock { - u_long min_clock; - u_long max_clock; - u_int vidc_ctl; - u_int vid_ctl; -}; - -static struct pixclock arc_clocks[] = { - /* we allow +/-1% on these */ - { 123750, 126250, VIDC_CTRL_DIV3, VID_CTL_24MHz }, /* 8.000MHz */ - { 82500, 84167, VIDC_CTRL_DIV2, VID_CTL_24MHz }, /* 12.000MHz */ - { 61875, 63125, VIDC_CTRL_DIV1_5, VID_CTL_24MHz }, /* 16.000MHz */ - { 41250, 42083, VIDC_CTRL_DIV1, VID_CTL_24MHz }, /* 24.000MHz */ -}; - -static struct pixclock * -acornfb_valid_pixrate(struct fb_var_screeninfo *var) -{ - u_long pixclock = var->pixclock; - u_int i; - - if (!var->pixclock) - return NULL; - - for (i = 0; i < ARRAY_SIZE(arc_clocks); i++) - if (pixclock > arc_clocks[i].min_clock && - pixclock < arc_clocks[i].max_clock) - return arc_clocks + i; - - return NULL; -} - -/* VIDC Rules: - * hcr : must be even (interlace, hcr/2 must be even) - * hswr : must be even - * hdsr : must be odd - * hder : must be odd - * - * vcr : must be odd - * vswr : >= 1 - * vdsr : >= 1 - * vder : >= vdsr - * if interlaced, then hcr/2 must be even - */ -static void -acornfb_set_timing(struct fb_var_screeninfo *var) -{ - struct pixclock *pclk; - struct vidc_timing vidc; - u_int horiz_correction; - u_int sync_len, display_start, display_end, cycle; - u_int is_interlaced; - u_int vid_ctl, vidc_ctl; - u_int bandwidth; - - memset(&vidc, 0, sizeof(vidc)); - - pclk = acornfb_valid_pixrate(var); - vidc_ctl = pclk->vidc_ctl; - vid_ctl = pclk->vid_ctl; - - bandwidth = var->pixclock * 8 / var->bits_per_pixel; - /* 25.175, 4bpp = 79.444ns per byte, 317.776ns per word: fifo = 2,6 */ - if (bandwidth > 143500) - vidc_ctl |= VIDC_CTRL_FIFO_3_7; - else if (bandwidth > 71750) - vidc_ctl |= VIDC_CTRL_FIFO_2_6; - else if (bandwidth > 35875) - vidc_ctl |= VIDC_CTRL_FIFO_1_5; - else - vidc_ctl |= VIDC_CTRL_FIFO_0_4; - - switch (var->bits_per_pixel) { - case 1: - horiz_correction = 19; - vidc_ctl |= VIDC_CTRL_1BPP; - break; - - case 2: - horiz_correction = 11; - vidc_ctl |= VIDC_CTRL_2BPP; - break; - - case 4: - horiz_correction = 7; - vidc_ctl |= VIDC_CTRL_4BPP; - break; - - default: - case 8: - horiz_correction = 5; - vidc_ctl |= VIDC_CTRL_8BPP; - break; - } - - if (var->sync & FB_SYNC_COMP_HIGH_ACT) /* should be FB_SYNC_COMP */ - vidc_ctl |= VIDC_CTRL_CSYNC; - else { - if (!(var->sync & FB_SYNC_HOR_HIGH_ACT)) - vid_ctl |= VID_CTL_HS_NHSYNC; - - if (!(var->sync & FB_SYNC_VERT_HIGH_ACT)) - vid_ctl |= VID_CTL_VS_NVSYNC; - } - - sync_len = var->hsync_len; - display_start = sync_len + var->left_margin; - display_end = display_start + var->xres; - cycle = display_end + var->right_margin; - - /* if interlaced, then hcr/2 must be even */ - is_interlaced = (var->vmode & FB_VMODE_MASK) == FB_VMODE_INTERLACED; - - if (is_interlaced) { - vidc_ctl |= VIDC_CTRL_INTERLACE; - if (cycle & 2) { - cycle += 2; - var->right_margin += 2; - } - } - - vidc.h_cycle = (cycle - 2) / 2; - vidc.h_sync_width = (sync_len - 2) / 2; - vidc.h_border_start = (display_start - 1) / 2; - vidc.h_display_start = (display_start - horiz_correction) / 2; - vidc.h_display_end = 
(display_end - horiz_correction) / 2; - vidc.h_border_end = (display_end - 1) / 2; - vidc.h_interlace = (vidc.h_cycle + 1) / 2; - - sync_len = var->vsync_len; - display_start = sync_len + var->upper_margin; - display_end = display_start + var->yres; - cycle = display_end + var->lower_margin; - - if (is_interlaced) - cycle = (cycle - 3) / 2; - else - cycle = cycle - 1; - - vidc.v_cycle = cycle; - vidc.v_sync_width = sync_len - 1; - vidc.v_border_start = display_start - 1; - vidc.v_display_start = vidc.v_border_start; - vidc.v_display_end = display_end - 1; - vidc.v_border_end = vidc.v_display_end; - - if (machine_is_a5k()) - __raw_writeb(vid_ctl, IOEB_VID_CTL); - - if (memcmp(¤t_vidc, &vidc, sizeof(vidc))) { - current_vidc = vidc; - - vidc_writel(0xe0000000 | vidc_ctl); - vidc_writel(0x80000000 | (vidc.h_cycle << 14)); - vidc_writel(0x84000000 | (vidc.h_sync_width << 14)); - vidc_writel(0x88000000 | (vidc.h_border_start << 14)); - vidc_writel(0x8c000000 | (vidc.h_display_start << 14)); - vidc_writel(0x90000000 | (vidc.h_display_end << 14)); - vidc_writel(0x94000000 | (vidc.h_border_end << 14)); - vidc_writel(0x98000000); - vidc_writel(0x9c000000 | (vidc.h_interlace << 14)); - vidc_writel(0xa0000000 | (vidc.v_cycle << 14)); - vidc_writel(0xa4000000 | (vidc.v_sync_width << 14)); - vidc_writel(0xa8000000 | (vidc.v_border_start << 14)); - vidc_writel(0xac000000 | (vidc.v_display_start << 14)); - vidc_writel(0xb0000000 | (vidc.v_display_end << 14)); - vidc_writel(0xb4000000 | (vidc.v_border_end << 14)); - vidc_writel(0xb8000000); - vidc_writel(0xbc000000); - } -#ifdef DEBUG_MODE_SELECTION - printk(KERN_DEBUG "VIDC registers for %dx%dx%d:\n", var->xres, - var->yres, var->bits_per_pixel); - printk(KERN_DEBUG " H-cycle : %d\n", vidc.h_cycle); - printk(KERN_DEBUG " H-sync-width : %d\n", vidc.h_sync_width); - printk(KERN_DEBUG " H-border-start : %d\n", vidc.h_border_start); - printk(KERN_DEBUG " H-display-start : %d\n", vidc.h_display_start); - printk(KERN_DEBUG " H-display-end : %d\n", vidc.h_display_end); - printk(KERN_DEBUG " H-border-end : %d\n", vidc.h_border_end); - printk(KERN_DEBUG " H-interlace : %d\n", vidc.h_interlace); - printk(KERN_DEBUG " V-cycle : %d\n", vidc.v_cycle); - printk(KERN_DEBUG " V-sync-width : %d\n", vidc.v_sync_width); - printk(KERN_DEBUG " V-border-start : %d\n", vidc.v_border_start); - printk(KERN_DEBUG " V-display-start : %d\n", vidc.v_display_start); - printk(KERN_DEBUG " V-display-end : %d\n", vidc.v_display_end); - printk(KERN_DEBUG " V-border-end : %d\n", vidc.v_border_end); - printk(KERN_DEBUG " VIDC Ctrl (E) : 0x%08X\n", vidc_ctl); - printk(KERN_DEBUG " IOEB Ctrl : 0x%08X\n", vid_ctl); -#endif -} - -static int -acornfb_setcolreg(u_int regno, u_int red, u_int green, u_int blue, - u_int trans, struct fb_info *info) -{ - union palette pal; - - if (regno >= current_par.palette_size) - return 1; - - pal.p = 0; - pal.vidc.reg = regno; - pal.vidc.red = red >> 12; - pal.vidc.green = green >> 12; - pal.vidc.blue = blue >> 12; - - current_par.palette[regno] = pal; - - vidc_writel(pal.p); - - return 0; -} -#endif - #ifdef HAS_VIDC20 #include @@ -634,16 +394,7 @@ acornfb_adjust_timing(struct fb_info *info, struct fb_var_screeninfo *var, u_int /* hsync_len must be even */ var->hsync_len = (var->hsync_len + 1) & ~1; -#ifdef HAS_VIDC - /* left_margin must be odd */ - if ((var->left_margin & 1) == 0) { - var->left_margin -= 1; - var->right_margin += 1; - } - - /* right_margin must be odd */ - var->right_margin |= 1; -#elif defined(HAS_VIDC20) +#if defined(HAS_VIDC20) /* 
left_margin must be even */ if (var->left_margin & 1) { var->left_margin += 1; @@ -787,11 +538,7 @@ static int acornfb_set_par(struct fb_info *info) break; case 8: current_par.palette_size = VIDC_PALETTE_SIZE; -#ifdef HAS_VIDC - info->fix.visual = FB_VISUAL_STATIC_PSEUDOCOLOR; -#else info->fix.visual = FB_VISUAL_PSEUDOCOLOR; -#endif break; #ifdef HAS_VIDC20 case 16: @@ -971,9 +718,6 @@ static void acornfb_init_fbinfo(void) #if defined(HAS_VIDC20) fb_info.var.red.length = 8; fb_info.var.transp.length = 4; -#elif defined(HAS_VIDC) - fb_info.var.red.length = 4; - fb_info.var.transp.length = 1; #endif fb_info.var.green = fb_info.var.red; fb_info.var.blue = fb_info.var.red; @@ -1310,14 +1054,6 @@ static int acornfb_probe(struct platform_device *dev) fb_info.fix.smem_start = handle; } #endif -#if defined(HAS_VIDC) - /* - * Archimedes/A5000 machines use a fixed address for their - * framebuffers. Free unused pages - */ - free_unused_pages(PAGE_OFFSET + size, PAGE_OFFSET + MAX_SIZE); -#endif - fb_info.fix.smem_len = size; current_par.palette_size = VIDC_PALETTE_SIZE; diff --git a/drivers/video/acornfb.h b/drivers/video/acornfb.h index fb2a7fffe506..175c8ff3367c 100644 --- a/drivers/video/acornfb.h +++ b/drivers/video/acornfb.h @@ -13,10 +13,6 @@ #include #define VIDC_PALETTE_SIZE 256 #define VIDC_NAME "VIDC20" -#elif defined(HAS_VIDC) -#include -#define VIDC_PALETTE_SIZE 16 -#define VIDC_NAME "VIDC" #endif #define EXTEND8(x) ((x)|(x)<<8) @@ -101,31 +97,6 @@ struct modex_params { const struct modey_params *modey; }; -#ifdef HAS_VIDC - -#define VID_CTL_VS_NVSYNC (1 << 3) -#define VID_CTL_HS_NHSYNC (1 << 2) -#define VID_CTL_24MHz (0) -#define VID_CTL_25MHz (1) -#define VID_CTL_36MHz (2) - -#define VIDC_CTRL_CSYNC (1 << 7) -#define VIDC_CTRL_INTERLACE (1 << 6) -#define VIDC_CTRL_FIFO_0_4 (0 << 4) -#define VIDC_CTRL_FIFO_1_5 (1 << 4) -#define VIDC_CTRL_FIFO_2_6 (2 << 4) -#define VIDC_CTRL_FIFO_3_7 (3 << 4) -#define VIDC_CTRL_1BPP (0 << 2) -#define VIDC_CTRL_2BPP (1 << 2) -#define VIDC_CTRL_4BPP (2 << 2) -#define VIDC_CTRL_8BPP (3 << 2) -#define VIDC_CTRL_DIV3 (0 << 0) -#define VIDC_CTRL_DIV2 (1 << 0) -#define VIDC_CTRL_DIV1_5 (2 << 0) -#define VIDC_CTRL_DIV1 (3 << 0) - -#endif - #ifdef HAS_VIDC20 /* * VIDC20 registers From 5e42781caf6f5f1c77e842d6dcbbf5c51b8b2c29 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Wed, 11 Sep 2013 14:19:43 -0700 Subject: [PATCH 006/303] drivers/iommu: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure. Thus, it is not needed to manually clear the device driver data to NULL. 
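For illustration, a minimal sketch of the now-redundant pattern, using
made-up "foo" names rather than the drivers touched here; the driver
core performs the equivalent of dev_set_drvdata(dev, NULL) itself once
the driver is unbound:

	struct foo { int dummy; };

	static int foo_probe(struct platform_device *pdev)
	{
		struct foo *priv = devm_kzalloc(&pdev->dev, sizeof(*priv),
						GFP_KERNEL);
		if (!priv)
			return -ENOMEM;
		platform_set_drvdata(pdev, priv);
		return 0;
	}

	static int foo_remove(struct platform_device *pdev)
	{
		struct foo *priv = platform_get_drvdata(pdev);

		/* ... tear down priv as needed ... */
		/* platform_set_drvdata(pdev, NULL);  <- redundant: the
		 * driver core clears drvdata after remove() and on
		 * probe failure */
		return 0;
	}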
Signed-off-by: Jingoo Han
Cc: David Brown
Cc: Stephen Boyd
Cc: Joerg Roedel
Cc: Suman Anna
Acked-by: Libo Chen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/iommu/msm_iommu_dev.c | 2 --
 drivers/iommu/omap-iommu.c    | 2 --
 2 files changed, 4 deletions(-)

diff --git a/drivers/iommu/msm_iommu_dev.c b/drivers/iommu/msm_iommu_dev.c
index 0a1c9626aa9e..08ba4972da9d 100644
--- a/drivers/iommu/msm_iommu_dev.c
+++ b/drivers/iommu/msm_iommu_dev.c
@@ -282,7 +282,6 @@ static int msm_iommu_remove(struct platform_device *pdev)
		clk_put(drv->pclk);
		memset(drv, 0, sizeof(*drv));
		kfree(drv);
-		platform_set_drvdata(pdev, NULL);
	}
	return 0;
 }
@@ -366,7 +365,6 @@ static int msm_iommu_ctx_remove(struct platform_device *pdev)
	if (drv) {
		memset(drv, 0, sizeof(struct msm_iommu_ctx_drvdata));
		kfree(drv);
-		platform_set_drvdata(pdev, NULL);
	}
	return 0;
 }
diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c
index 0ba3766240d5..bcd78a720630 100644
--- a/drivers/iommu/omap-iommu.c
+++ b/drivers/iommu/omap-iommu.c
@@ -1008,8 +1008,6 @@ static int omap_iommu_remove(struct platform_device *pdev)
	struct resource *res;
	struct omap_iommu *obj = platform_get_drvdata(pdev);

-	platform_set_drvdata(pdev, NULL);
-
	iopgtable_clear_entry_all(obj);

	irq = platform_get_irq(pdev, 0);

From 2b1e55c389105b722cccadfa47f5615f57d8887f Mon Sep 17 00:00:00 2001
From: Younger Liu
Date: Wed, 11 Sep 2013 14:19:44 -0700
Subject: [PATCH 007/303] ocfs2: lighten up allocate transaction

The issue scenario is as follows: when fallocating a very large amount
of disk space for a small file, __ocfs2_extend_allocation attempts to
get a very large transaction.  For some journal sizes, there may not be
enough room for this transaction, and the fallocate will fail.

The patch below extends & restarts the transaction as necessary while
allocating space, and should work with even the smallest journal.  This
patch refers to ext4 resize.

Test:
# mkfs.ocfs2 -b 4K -C 32K -T datafiles /dev/sdc
...(journal size is 32M)
# mount.ocfs2 /dev/sdc /mnt/ocfs2/
# touch /mnt/ocfs2/1.log
# fallocate -o 0 -l 400G /mnt/ocfs2/1.log
fallocate: /mnt/ocfs2/1.log: fallocate failed: Cannot allocate memory
# tail -f /var/log/messages
[ 7372.278591] JBD: fallocate wants too many credits (2051 > 2048)
[ 7372.278597] (fallocate,6438,0):__ocfs2_extend_allocation:709 ERROR: status = -12
[ 7372.278603] (fallocate,6438,0):ocfs2_allocate_unwritten_extents:1504 ERROR: status = -12
[ 7372.278607] (fallocate,6438,0):__ocfs2_change_file_space:1955 ERROR: status = -12
^C

With this patch, the test works well.

Signed-off-by: Younger Liu
Cc: Jie Liu
Cc: Joel Becker
Cc: Mark Fasheh
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/file.c        |  6 +-----
 fs/ocfs2/journal.c     | 35 +++++++++++++++++++++++++++++++++++
 fs/ocfs2/journal.h     | 11 +++++++++++
 fs/ocfs2/ocfs2_trace.h |  2 ++
 4 files changed, 49 insertions(+), 5 deletions(-)

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 3261d71319ee..409c549ae02a 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -671,11 +671,7 @@ restarted_transaction:
	} else {
		BUG_ON(why != RESTART_TRANS);

-		/* TODO: This can be more intelligent. */
-		credits = ocfs2_calc_extend_credits(osb->sb,
-						    &fe->id2.i_list,
-						    clusters_to_add);
-		status = ocfs2_extend_trans(handle, credits);
+		status = ocfs2_allocate_extend_trans(handle, 1);
		if (status < 0) {
			/* handle still has to be committed at
			 * this point.
*/ diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 242170d83971..a126cb37ca4d 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -455,6 +455,41 @@ bail: return status; } +/* + * If we have fewer than thresh credits, extend by OCFS2_MAX_TRANS_DATA. + * If that fails, restart the transaction & regain write access for the + * buffer head which is used for metadata modifications. + * Taken from Ext4: extend_or_restart_transaction() + */ +int ocfs2_allocate_extend_trans(handle_t *handle, int thresh) +{ + int status, old_nblks; + + BUG_ON(!handle); + + old_nblks = handle->h_buffer_credits; + trace_ocfs2_allocate_extend_trans(old_nblks, thresh); + + if (old_nblks < thresh) + return 0; + + status = jbd2_journal_extend(handle, OCFS2_MAX_TRANS_DATA); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + if (status > 0) { + status = jbd2_journal_restart(handle, OCFS2_MAX_TRANS_DATA); + if (status < 0) + mlog_errno(status); + } + +bail: + return status; +} + + struct ocfs2_triggers { struct jbd2_buffer_trigger_type ot_triggers; int ot_offset; diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 0a992737dcaf..0b479bab3671 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -258,6 +258,17 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int ocfs2_commit_trans(struct ocfs2_super *osb, handle_t *handle); int ocfs2_extend_trans(handle_t *handle, int nblocks); +int ocfs2_allocate_extend_trans(handle_t *handle, + int thresh); + +/* + * Define an arbitrary limit for the amount of data we will anticipate + * writing to any given transaction. For unbounded transactions such as + * fallocate(2) we can write more than this, but we always + * start off at the maximum transaction size and grow the transaction + * optimistically as we go. + */ +#define OCFS2_MAX_TRANS_DATA 64U /* * Create access is for when we get a newly created buffer and we're diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index 3b481f490633..1b60c62aa9d6 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -2579,6 +2579,8 @@ DEFINE_OCFS2_INT_INT_EVENT(ocfs2_extend_trans); DEFINE_OCFS2_INT_EVENT(ocfs2_extend_trans_restart); +DEFINE_OCFS2_INT_INT_EVENT(ocfs2_allocate_extend_trans); + DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_journal_access); DEFINE_OCFS2_ULL_EVENT(ocfs2_journal_dirty); From f17c20dd2ec81e8ff328b81bc847da9429d0975b Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Wed, 11 Sep 2013 14:19:45 -0700 Subject: [PATCH 008/303] ocfs2: use i_size_read() to access i_size Though ocfs2 uses inode->i_mutex to protect i_size, there are both i_size_read/write() and direct accesses. Clean up all direct access to eliminate confusion. 
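One note on why the accessors matter: on 32-bit SMP kernels loff_t is
64 bits wide, so a bare load of inode->i_size can tear.  i_size_read()
retries the read under the inode's seqcount and i_size_write() bumps
that seqcount, which is also why writers must still be serialized
externally (by i_mutex here).  A minimal sketch of the pattern this
converts to, with invented variable names:

	loff_t end = pos + copied;

	if (end > i_size_read(inode)) {		/* torn-read safe */
		i_size_write(inode, end);	/* caller holds i_mutex */
		mark_inode_dirty(inode);
	}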
Signed-off-by: Junxiao Bi Cc: Jie Liu Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.c | 2 +- fs/ocfs2/extent_map.c | 10 +++++----- fs/ocfs2/ioctl.c | 2 +- fs/ocfs2/journal.c | 8 ++++---- fs/ocfs2/move_extents.c | 2 +- fs/ocfs2/quota_global.c | 6 +++--- fs/ocfs2/quota_local.c | 12 ++++++------ 7 files changed, 21 insertions(+), 21 deletions(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 94417a85ce6e..f37d3c0e2053 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2044,7 +2044,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping, out_write_size: pos += copied; - if (pos > inode->i_size) { + if (pos > i_size_read(inode)) { i_size_write(inode, pos); mark_inode_dirty(inode); } diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 2487116d0d33..4bf2b763467f 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -852,20 +852,20 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence) down_read(&OCFS2_I(inode)->ip_alloc_sem); - if (*offset >= inode->i_size) { + if (*offset >= i_size_read(inode)) { ret = -ENXIO; goto out_unlock; } if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { if (whence == SEEK_HOLE) - *offset = inode->i_size; + *offset = i_size_read(inode); goto out_unlock; } clen = 0; cpos = *offset >> cs_bits; - cend = ocfs2_clusters_for_bytes(inode->i_sb, inode->i_size); + cend = ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)); while (cpos < cend && !is_last) { ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, &hole_size, @@ -904,8 +904,8 @@ int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence) extlen = clen; extlen <<= cs_bits; - if ((extoff + extlen) > inode->i_size) - extlen = inode->i_size - extoff; + if ((extoff + extlen) > i_size_read(inode)) + extlen = i_size_read(inode) - extoff; extoff += extlen; if (extoff > *offset) *offset = extoff; diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 0c60ef2d8056..fa32ce9b455d 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -303,7 +303,7 @@ int ocfs2_info_handle_journal_size(struct inode *inode, if (o2info_from_user(oij, req)) goto bail; - oij.ij_journal_size = osb->journal->j_inode->i_size; + oij.ij_journal_size = i_size_read(osb->journal->j_inode); o2info_set_request_filled(&oij.ij_req); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index a126cb37ca4d..44fc3e530c3d 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -836,14 +836,14 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) inode_lock = 1; di = (struct ocfs2_dinode *)bh->b_data; - if (inode->i_size < OCFS2_MIN_JOURNAL_SIZE) { + if (i_size_read(inode) < OCFS2_MIN_JOURNAL_SIZE) { mlog(ML_ERROR, "Journal file size (%lld) is too small!\n", - inode->i_size); + i_size_read(inode)); status = -EINVAL; goto done; } - trace_ocfs2_journal_init(inode->i_size, + trace_ocfs2_journal_init(i_size_read(inode), (unsigned long long)inode->i_blocks, OCFS2_I(inode)->ip_clusters); @@ -1131,7 +1131,7 @@ static int ocfs2_force_read_journal(struct inode *inode) memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL); - num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, inode->i_size); + num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); v_blkno = 0; while (v_blkno < num_blocks) { status = ocfs2_extent_map_get_blocks(inode, v_blkno, diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 452068b45749..415928536c5e 100644 --- 
a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -845,7 +845,7 @@ static int __ocfs2_move_extents_range(struct buffer_head *di_bh, struct ocfs2_move_extents *range = context->range; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - if ((inode->i_size == 0) || (range->me_len == 0)) + if ((i_size_read(inode) == 0) || (range->me_len == 0)) return 0; if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 332a281f217e..aaa50611ec66 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -234,7 +234,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type, len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset; } - if (gqinode->i_size < off + len) { + if (i_size_read(gqinode) < off + len) { loff_t rounded_end = ocfs2_align_bytes_to_blocks(sb, off + len); @@ -778,8 +778,8 @@ static int ocfs2_acquire_dquot(struct dquot *dquot) */ WARN_ON(journal_current_handle()); status = ocfs2_extend_no_holes(gqinode, NULL, - gqinode->i_size + (need_alloc << sb->s_blocksize_bits), - gqinode->i_size); + i_size_read(gqinode) + (need_alloc << sb->s_blocksize_bits), + i_size_read(gqinode)); if (status < 0) goto out_dq; } diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 27fe7ee4874c..2e4344be3b96 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -982,14 +982,14 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( /* We are protected by dqio_sem so no locking needed */ status = ocfs2_extend_no_holes(lqinode, NULL, - lqinode->i_size + 2 * sb->s_blocksize, - lqinode->i_size); + i_size_read(lqinode) + 2 * sb->s_blocksize, + i_size_read(lqinode)); if (status < 0) { mlog_errno(status); goto out; } status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh, - lqinode->i_size + 2 * sb->s_blocksize); + i_size_read(lqinode) + 2 * sb->s_blocksize); if (status < 0) { mlog_errno(status); goto out; @@ -1125,14 +1125,14 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( /* We are protected by dqio_sem so no locking needed */ status = ocfs2_extend_no_holes(lqinode, NULL, - lqinode->i_size + sb->s_blocksize, - lqinode->i_size); + i_size_read(lqinode) + sb->s_blocksize, + i_size_read(lqinode)); if (status < 0) { mlog_errno(status); goto out; } status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh, - lqinode->i_size + sb->s_blocksize); + i_size_read(lqinode) + sb->s_blocksize); if (status < 0) { mlog_errno(status); goto out; From 98ac9125c5afed8c5d2e4c5824988f8ad51814e1 Mon Sep 17 00:00:00 2001 From: Xue jiufei Date: Wed, 11 Sep 2013 14:19:46 -0700 Subject: [PATCH 009/303] ocfs2: dlm_request_all_locks() should deal with the status sent from target node dlm_request_all_locks() should deal with the status sent from target node if DLM_LOCK_REQUEST_MSG is sent successfully, or recovery master will fall into endless loop, waiting for other nodes to send locks and DLM_RECO_DATA_DONE_MSG to me. NodeA NodeB selected as recovery master dlm_remaster_locks() ->dlm_request_all_locks() send DLM_LOCK_REQUEST_MSG to nodeA It happened that NodeA cannot alloc memory when it processes this message. dlm_request_all_locks_handler() do not queue dlm_request_all_locks_worker and returns -ENOMEM. It will never send locks and DLM_RECO_DATA_DONE_MSG to NodeB. NodeB do not deal with the status sent from nodeA, and will fall in endless loop waiting for the recovery state of NodeA to be changed. 
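A minimal sketch of the convention at work, assuming o2net's usual
semantics (send_and_check is a made-up helper, not part of this patch):
o2net_send_message() returns a negative errno only for transport
failures, while the remote handler's own return code comes back through
the final status argument, so both must be folded into the result:

	static int send_and_check(struct dlm_ctxt *dlm, u8 node,
				  struct dlm_lock_request *lr)
	{
		int status = 0;
		int ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key,
					     lr, sizeof(*lr), node, &status);

		/* transport error first, then the handler's verdict */
		return ret < 0 ? ret : status;
	}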
Signed-off-by: joyce
Cc: Mark Fasheh
Cc: Jeff Liu
Cc: Joel Becker
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/dlm/dlmrecovery.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 773bd32bfd8c..f94550218152 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -787,6 +787,7 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
 {
	struct dlm_lock_request lr;
	int ret;
+	int status;

	mlog(0, "\n");

@@ -800,13 +801,15 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,

	// send message
	ret = o2net_send_message(DLM_LOCK_REQUEST_MSG, dlm->key,
-				 &lr, sizeof(lr), request_from, NULL);
+				 &lr, sizeof(lr), request_from, &status);

	/* negative status is handled by caller */
	if (ret < 0)
		mlog(ML_ERROR, "%s: Error %d send LOCK_REQUEST to node %u "
		     "to recover dead node %u\n", dlm->name, ret,
		     request_from, dead_node);
+	else
+		ret = status;
	// return from here, then
	// sleep until all received or error
	return ret;

From 7e9b79370733945b25c24e09d663b07c3936d10c Mon Sep 17 00:00:00 2001
From: Younger Liu
Date: Wed, 11 Sep 2013 14:19:47 -0700
Subject: [PATCH 010/303] ocfs2: ac_bits_wanted should be local_alloc_bits when returns -ENOSPC

There is an issue in reserving and claiming space for localalloc.  When
localalloc space is not enough, it claims space from global_bitmap.
And if there is not enough free space in global_bitmap, the claimed
size is set to half of the original size and the claim is retried.

The issue is as follows: osb->local_alloc_bits is set to half of the
original size in ocfs2_recalc_la_window(), but ac->ac_bits_wanted is
set to osb->local_alloc_default_bits, which is not changed.  So
localalloc always reserves and claims local_alloc_default_bits space
and returns ENOSPC.

So, ac->ac_bits_wanted should be osb->local_alloc_bits, which does get
updated.

Signed-off-by: Younger Liu
Cc: Joel Becker
Cc: Mark Fasheh
Cc: Jeff Liu
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/localalloc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index aebeacd807c3..cd5496b7a0a3 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -1082,7 +1082,7 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
	}

retry_enospc:
-	(*ac)->ac_bits_wanted = osb->local_alloc_default_bits;
+	(*ac)->ac_bits_wanted = osb->local_alloc_bits;
	status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
	if (status == -ENOSPC) {
		if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) ==
@@ -1154,7 +1154,7 @@ retry_enospc:
		    OCFS2_LA_DISABLED)
			goto bail;

-		ac->ac_bits_wanted = osb->local_alloc_default_bits;
+		ac->ac_bits_wanted = osb->local_alloc_bits;
		status = ocfs2_claim_clusters(handle, ac,
					      osb->local_alloc_bits,
					      &cluster_off,

From 8dd7903e48df3779bc424196c22dc73b66d0643e Mon Sep 17 00:00:00 2001
From: Sunil Mushran
Date: Wed, 11 Sep 2013 14:19:49 -0700
Subject: [PATCH 011/303] fs/ocfs2/cluster/tcp.c: fix possible null pointer dereferences

Fix some possible null pointer dereferences that were detected by the
static code analyser, smatch.
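The hazard smatch flags, in miniature (made-up names, a sketch rather
than a quote of the code below): the NULL test must dominate every
dereference, instead of coming after the pointer has already been used:

	/* buggy: dereferenced in the message, checked afterwards */
	if (err) {
		pr_err("widget %s failed\n", w->name);
		if (w)
			widget_shutdown(w);
	}

	/* fixed: the check guards the dereference */
	if (err && w) {
		pr_err("widget %s failed\n", w->name);
		widget_shutdown(w);
	}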
Signed-off-by: Sunil Mushran Reported-by: Dan Carpenter Reported-by: Guozhonghua Cc: Sunil Mushran Cc: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/tcp.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index d644dc611425..d04a3c2fad3c 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -543,8 +543,9 @@ static void o2net_set_nn_state(struct o2net_node *nn, } if (was_valid && !valid) { - printk(KERN_NOTICE "o2net: No longer connected to " - SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc)); + if (old_sc) + printk(KERN_NOTICE "o2net: No longer connected to " + SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc)); o2net_complete_nodes_nsw(nn); } @@ -1695,13 +1696,12 @@ static void o2net_start_connect(struct work_struct *work) ret = 0; out: - if (ret) { + if (ret && sc) { printk(KERN_NOTICE "o2net: Connect attempt to " SC_NODEF_FMT " failed with errno %d\n", SC_NODEF_ARGS(sc), ret); /* 0 err so that another will be queued and attempted * from set_nn_state */ - if (sc) - o2net_ensure_shutdown(nn, sc, 0); + o2net_ensure_shutdown(nn, sc, 0); } if (sc) sc_put(sc); From df53cd3b70712cd136f10ef79457623c5c3764a4 Mon Sep 17 00:00:00 2001 From: Dong Fang Date: Wed, 11 Sep 2013 14:19:50 -0700 Subject: [PATCH 012/303] ocfs2: use list_for_each_entry() instead of list_for_each() [dan.carpenter@oracle.com: fix up some NULL dereference bugs] Signed-off-by: Dong Fang Cc: Mark Fasheh Cc: Joel Becker Cc: Jeff Liu Signed-off-by: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/heartbeat.c | 14 +++++--------- fs/ocfs2/dlm/dlmast.c | 8 +++----- fs/ocfs2/dlm/dlmcommon.h | 4 +--- fs/ocfs2/dlm/dlmconvert.c | 18 +++++++----------- fs/ocfs2/dlm/dlmdebug.c | 15 ++++----------- fs/ocfs2/dlm/dlmdomain.c | 35 ++++++++++++----------------------- fs/ocfs2/dlm/dlmlock.c | 9 ++------- fs/ocfs2/dlm/dlmmaster.c | 18 +++++------------- fs/ocfs2/dlm/dlmthread.c | 19 +++++-------------- fs/ocfs2/dlm/dlmunlock.c | 4 +--- 10 files changed, 45 insertions(+), 99 deletions(-) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 5c1c864e81cc..25b72e82b8fa 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -628,11 +628,9 @@ static void o2hb_fire_callbacks(struct o2hb_callback *hbcall, struct o2nm_node *node, int idx) { - struct list_head *iter; struct o2hb_callback_func *f; - list_for_each(iter, &hbcall->list) { - f = list_entry(iter, struct o2hb_callback_func, hc_item); + list_for_each_entry(f, &hbcall->list, hc_item) { mlog(ML_HEARTBEAT, "calling funcs %p\n", f); (f->hc_func)(node, idx, f->hc_data); } @@ -2516,8 +2514,7 @@ unlock: int o2hb_register_callback(const char *region_uuid, struct o2hb_callback_func *hc) { - struct o2hb_callback_func *tmp; - struct list_head *iter; + struct o2hb_callback_func *f; struct o2hb_callback *hbcall; int ret; @@ -2540,10 +2537,9 @@ int o2hb_register_callback(const char *region_uuid, down_write(&o2hb_callback_sem); - list_for_each(iter, &hbcall->list) { - tmp = list_entry(iter, struct o2hb_callback_func, hc_item); - if (hc->hc_priority < tmp->hc_priority) { - list_add_tail(&hc->hc_item, iter); + list_for_each_entry(f, &hbcall->list, hc_item) { + if (hc->hc_priority < f->hc_priority) { + list_add_tail(&hc->hc_item, &f->hc_item); break; } } diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index fbec0be62326..b46278f9ae44 100644 --- 
a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -292,7 +292,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, struct dlm_lock *lock = NULL; struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf; char *name; - struct list_head *iter, *head=NULL; + struct list_head *head = NULL; __be64 cookie; u32 flags; u8 node; @@ -373,8 +373,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, /* try convert queue for both ast/bast */ head = &res->converting; lock = NULL; - list_for_each(iter, head) { - lock = list_entry (iter, struct dlm_lock, list); + list_for_each_entry(lock, head, list) { if (lock->ml.cookie == cookie) goto do_ast; } @@ -385,8 +384,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, else head = &res->granted; - list_for_each(iter, head) { - lock = list_entry (iter, struct dlm_lock, list); + list_for_each_entry(lock, head, list) { if (lock->ml.cookie == cookie) goto do_ast; } diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index de854cca12a2..e0517762fcc0 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -1079,11 +1079,9 @@ static inline int dlm_lock_compatible(int existing, int request) static inline int dlm_lock_on_list(struct list_head *head, struct dlm_lock *lock) { - struct list_head *iter; struct dlm_lock *tmplock; - list_for_each(iter, head) { - tmplock = list_entry(iter, struct dlm_lock, list); + list_for_each_entry(tmplock, head, list) { if (tmplock == lock) return 1; } diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index 29a886d1e82c..e36d63ff1783 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -123,7 +123,6 @@ static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm, int *kick_thread) { enum dlm_status status = DLM_NORMAL; - struct list_head *iter; struct dlm_lock *tmplock=NULL; assert_spin_locked(&res->spinlock); @@ -185,16 +184,14 @@ static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm, /* upconvert from here on */ status = DLM_NORMAL; - list_for_each(iter, &res->granted) { - tmplock = list_entry(iter, struct dlm_lock, list); + list_for_each_entry(tmplock, &res->granted, list) { if (tmplock == lock) continue; if (!dlm_lock_compatible(tmplock->ml.type, type)) goto switch_queues; } - list_for_each(iter, &res->converting) { - tmplock = list_entry(iter, struct dlm_lock, list); + list_for_each_entry(tmplock, &res->converting, list) { if (!dlm_lock_compatible(tmplock->ml.type, type)) goto switch_queues; /* existing conversion requests take precedence */ @@ -424,8 +421,8 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data, struct dlm_ctxt *dlm = data; struct dlm_convert_lock *cnv = (struct dlm_convert_lock *)msg->buf; struct dlm_lock_resource *res = NULL; - struct list_head *iter; struct dlm_lock *lock = NULL; + struct dlm_lock *tmp_lock; struct dlm_lockstatus *lksb; enum dlm_status status = DLM_NORMAL; u32 flags; @@ -471,14 +468,13 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data, dlm_error(status); goto leave; } - list_for_each(iter, &res->granted) { - lock = list_entry(iter, struct dlm_lock, list); - if (lock->ml.cookie == cnv->cookie && - lock->ml.node == cnv->node_idx) { + list_for_each_entry(tmp_lock, &res->granted, list) { + if (tmp_lock->ml.cookie == cnv->cookie && + tmp_lock->ml.node == cnv->node_idx) { + lock = tmp_lock; dlm_lock_get(lock); break; } - lock = NULL; } spin_unlock(&res->spinlock); if (!lock) { diff --git 
a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 0e28e242226d..e33cd7a3c582 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -96,7 +96,6 @@ static void __dlm_print_lock(struct dlm_lock *lock) void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) { - struct list_head *iter2; struct dlm_lock *lock; char buf[DLM_LOCKID_NAME_MAX]; @@ -118,18 +117,15 @@ void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) res->inflight_locks, atomic_read(&res->asts_reserved)); dlm_print_lockres_refmap(res); printk(" granted queue:\n"); - list_for_each(iter2, &res->granted) { - lock = list_entry(iter2, struct dlm_lock, list); + list_for_each_entry(lock, &res->granted, list) { __dlm_print_lock(lock); } printk(" converting queue:\n"); - list_for_each(iter2, &res->converting) { - lock = list_entry(iter2, struct dlm_lock, list); + list_for_each_entry(lock, &res->converting, list) { __dlm_print_lock(lock); } printk(" blocked queue:\n"); - list_for_each(iter2, &res->blocked) { - lock = list_entry(iter2, struct dlm_lock, list); + list_for_each_entry(lock, &res->blocked, list) { __dlm_print_lock(lock); } } @@ -446,7 +442,6 @@ static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len) { struct dlm_master_list_entry *mle; struct hlist_head *bucket; - struct hlist_node *list; int i, out = 0; unsigned long total = 0, longest = 0, bucket_count = 0; @@ -456,9 +451,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len) spin_lock(&dlm->master_lock); for (i = 0; i < DLM_HASH_BUCKETS; i++) { bucket = dlm_master_hash(dlm, i); - hlist_for_each(list, bucket) { - mle = hlist_entry(list, struct dlm_master_list_entry, - master_hash_node); + hlist_for_each_entry(mle, bucket, master_hash_node) { ++total; ++bucket_count; if (len - out < 200) diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index dbb17c07656a..8b3382abf840 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -193,7 +193,7 @@ struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, unsigned int hash) { struct hlist_head *bucket; - struct hlist_node *list; + struct dlm_lock_resource *res; mlog(0, "%.*s\n", len, name); @@ -201,9 +201,7 @@ struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, bucket = dlm_lockres_hash(dlm, hash); - hlist_for_each(list, bucket) { - struct dlm_lock_resource *res = hlist_entry(list, - struct dlm_lock_resource, hash_node); + hlist_for_each_entry(res, bucket, hash_node) { if (res->lockname.name[0] != name[0]) continue; if (unlikely(res->lockname.len != len)) @@ -262,22 +260,19 @@ struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, static struct dlm_ctxt * __dlm_lookup_domain_full(const char *domain, int len) { - struct dlm_ctxt *tmp = NULL; - struct list_head *iter; + struct dlm_ctxt *tmp; assert_spin_locked(&dlm_domain_lock); /* tmp->name here is always NULL terminated, * but domain may not be! */ - list_for_each(iter, &dlm_domains) { - tmp = list_entry (iter, struct dlm_ctxt, list); + list_for_each_entry(tmp, &dlm_domains, list) { if (strlen(tmp->name) == len && memcmp(tmp->name, domain, len)==0) - break; - tmp = NULL; + return tmp; } - return tmp; + return NULL; } /* For null terminated domain strings ONLY */ @@ -366,25 +361,22 @@ static void __dlm_get(struct dlm_ctxt *dlm) * you shouldn't trust your pointer. 
*/ struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm) { - struct list_head *iter; - struct dlm_ctxt *target = NULL; + struct dlm_ctxt *target; + struct dlm_ctxt *ret = NULL; spin_lock(&dlm_domain_lock); - list_for_each(iter, &dlm_domains) { - target = list_entry (iter, struct dlm_ctxt, list); - + list_for_each_entry(target, &dlm_domains, list) { if (target == dlm) { __dlm_get(target); + ret = target; break; } - - target = NULL; } spin_unlock(&dlm_domain_lock); - return target; + return ret; } int dlm_domain_fully_joined(struct dlm_ctxt *dlm) @@ -2296,13 +2288,10 @@ static DECLARE_RWSEM(dlm_callback_sem); void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm, int node_num) { - struct list_head *iter; struct dlm_eviction_cb *cb; down_read(&dlm_callback_sem); - list_for_each(iter, &dlm->dlm_eviction_callbacks) { - cb = list_entry(iter, struct dlm_eviction_cb, ec_item); - + list_for_each_entry(cb, &dlm->dlm_eviction_callbacks, ec_item) { cb->ec_func(node_num, cb->ec_data); } up_read(&dlm_callback_sem); diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 47e67c2d228f..5d32f7511f74 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -91,19 +91,14 @@ void dlm_destroy_lock_cache(void) static int dlm_can_grant_new_lock(struct dlm_lock_resource *res, struct dlm_lock *lock) { - struct list_head *iter; struct dlm_lock *tmplock; - list_for_each(iter, &res->granted) { - tmplock = list_entry(iter, struct dlm_lock, list); - + list_for_each_entry(tmplock, &res->granted, list) { if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type)) return 0; } - list_for_each(iter, &res->converting) { - tmplock = list_entry(iter, struct dlm_lock, list); - + list_for_each_entry(tmplock, &res->converting, list) { if (!dlm_lock_compatible(tmplock->ml.type, lock->ml.type)) return 0; if (!dlm_lock_compatible(tmplock->ml.convert_type, diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 33ecbe0e6734..cf0f103963b1 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -342,16 +342,13 @@ static int dlm_find_mle(struct dlm_ctxt *dlm, { struct dlm_master_list_entry *tmpmle; struct hlist_head *bucket; - struct hlist_node *list; unsigned int hash; assert_spin_locked(&dlm->master_lock); hash = dlm_lockid_hash(name, namelen); bucket = dlm_master_hash(dlm, hash); - hlist_for_each(list, bucket) { - tmpmle = hlist_entry(list, struct dlm_master_list_entry, - master_hash_node); + hlist_for_each_entry(tmpmle, bucket, master_hash_node) { if (!dlm_mle_equal(dlm, tmpmle, name, namelen)) continue; dlm_get_mle(tmpmle); @@ -3183,7 +3180,7 @@ void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) struct dlm_master_list_entry *mle; struct dlm_lock_resource *res; struct hlist_head *bucket; - struct hlist_node *list; + struct hlist_node *tmp; unsigned int i; mlog(0, "dlm=%s, dead node=%u\n", dlm->name, dead_node); @@ -3194,10 +3191,7 @@ top: spin_lock(&dlm->master_lock); for (i = 0; i < DLM_HASH_BUCKETS; i++) { bucket = dlm_master_hash(dlm, i); - hlist_for_each(list, bucket) { - mle = hlist_entry(list, struct dlm_master_list_entry, - master_hash_node); - + hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) { BUG_ON(mle->type != DLM_MLE_BLOCK && mle->type != DLM_MLE_MASTER && mle->type != DLM_MLE_MIGRATION); @@ -3378,7 +3372,7 @@ void dlm_force_free_mles(struct dlm_ctxt *dlm) int i; struct hlist_head *bucket; struct dlm_master_list_entry *mle; - struct hlist_node *tmp, *list; + struct hlist_node *tmp; /* * We notified all other nodes that we are exiting the 
domain and @@ -3394,9 +3388,7 @@ void dlm_force_free_mles(struct dlm_ctxt *dlm) for (i = 0; i < DLM_HASH_BUCKETS; i++) { bucket = dlm_master_hash(dlm, i); - hlist_for_each_safe(list, tmp, bucket) { - mle = hlist_entry(list, struct dlm_master_list_entry, - master_hash_node); + hlist_for_each_entry_safe(mle, tmp, bucket, master_hash_node) { if (mle->type != DLM_MLE_BLOCK) { mlog(ML_ERROR, "bad mle: %p\n", mle); dlm_print_one_mle(mle); diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index e73c833fc2a1..9db869de829d 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -286,8 +286,6 @@ static void dlm_shuffle_lists(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { struct dlm_lock *lock, *target; - struct list_head *iter; - struct list_head *head; int can_grant = 1; /* @@ -314,9 +312,7 @@ converting: dlm->name, res->lockname.len, res->lockname.name); BUG(); } - head = &res->granted; - list_for_each(iter, head) { - lock = list_entry(iter, struct dlm_lock, list); + list_for_each_entry(lock, &res->granted, list) { if (lock==target) continue; if (!dlm_lock_compatible(lock->ml.type, @@ -333,9 +329,8 @@ converting: target->ml.convert_type; } } - head = &res->converting; - list_for_each(iter, head) { - lock = list_entry(iter, struct dlm_lock, list); + + list_for_each_entry(lock, &res->converting, list) { if (lock==target) continue; if (!dlm_lock_compatible(lock->ml.type, @@ -384,9 +379,7 @@ blocked: goto leave; target = list_entry(res->blocked.next, struct dlm_lock, list); - head = &res->granted; - list_for_each(iter, head) { - lock = list_entry(iter, struct dlm_lock, list); + list_for_each_entry(lock, &res->granted, list) { if (lock==target) continue; if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) { @@ -400,9 +393,7 @@ blocked: } } - head = &res->converting; - list_for_each(iter, head) { - lock = list_entry(iter, struct dlm_lock, list); + list_for_each_entry(lock, &res->converting, list) { if (lock==target) continue; if (!dlm_lock_compatible(lock->ml.type, target->ml.type)) { diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 850aa7e87537..5698b52cf5c9 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -388,7 +388,6 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, struct dlm_ctxt *dlm = data; struct dlm_unlock_lock *unlock = (struct dlm_unlock_lock *)msg->buf; struct dlm_lock_resource *res = NULL; - struct list_head *iter; struct dlm_lock *lock = NULL; enum dlm_status status = DLM_NORMAL; int found = 0, i; @@ -458,8 +457,7 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, } for (i=0; i<3; i++) { - list_for_each(iter, queue) { - lock = list_entry(iter, struct dlm_lock, list); + list_for_each_entry(lock, queue, list) { if (lock->ml.cookie == unlock->cookie && lock->ml.node == unlock->node_idx) { dlm_lock_get(lock); From 3d94ea51c1d8db6f41268a9d2aea5f5771e9a8d3 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Wed, 11 Sep 2013 14:19:51 -0700 Subject: [PATCH 013/303] ocfs2: clean up dead code in ocfs2_acl_from_xattr() In ocfs2_acl_from_xattr(), if size is less than sizeof(struct posix_acl_entry), it returns ERR_PTR(-EINVAL) directly. Then assign (size / sizeof(struct posix_acl_entry)) to count which will be at least 1, that means the following branch (count < 0) and (count == 0) will never be true. 
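Spelled out, assuming only the guard that already precedes the division
(a sketch of the surviving code path, not a further change):

	if (size < sizeof(struct posix_acl_entry))
		return ERR_PTR(-EINVAL);
	/* from here on, size >= sizeof(struct posix_acl_entry) */

	count = size / sizeof(struct posix_acl_entry);	/* always >= 1 */
	/* so "count < 0" and "count == 0" can never be taken */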
Signed-off-by: Joseph Qi Cc: Mark Fasheh Acked-by: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/acl.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 8a404576fb26..b4f788e0ca31 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -51,10 +51,6 @@ static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size) return ERR_PTR(-EINVAL); count = size / sizeof(struct posix_acl_entry); - if (count < 0) - return ERR_PTR(-EINVAL); - if (count == 0) - return NULL; acl = posix_acl_alloc(count, GFP_NOFS); if (!acl) From 2b0f6eae2dd2f7f21dbf93241938a687f6757dea Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Wed, 11 Sep 2013 14:19:52 -0700 Subject: [PATCH 014/303] ocfs2: add missing return value check of ocfs2_get_clusters() In ocfs2_attach_refcount_tree() and ocfs2_duplicate_extent_list(), if error occurs when calling ocfs2_get_clusters(), it will go with unexpected behavior as local variables p_cluster, num_clusters and ext_flags are declared without initialization. Signed-off-by: Joseph Qi Reviewed-by: Jie Liu Cc: Joel Becker Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/refcounttree.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index a70d604593b6..bf4dfc14bb2c 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -3854,7 +3854,10 @@ static int ocfs2_attach_refcount_tree(struct inode *inode, while (cpos < clusters) { ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters, &ext_flags); - + if (ret) { + mlog_errno(ret); + goto unlock; + } if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) { ret = ocfs2_add_refcount_flag(inode, &di_et, &ref_tree->rf_ci, @@ -4025,7 +4028,10 @@ static int ocfs2_duplicate_extent_list(struct inode *s_inode, while (cpos < clusters) { ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster, &num_clusters, &ext_flags); - + if (ret) { + mlog_errno(ret); + goto out; + } if (p_cluster) { ret = ocfs2_add_refcounted_extent(t_inode, &et, ref_ci, ref_root_bh, From 4704aa30fc35010dd9c3ce1d9d2e77af09c2c081 Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Wed, 11 Sep 2013 14:19:53 -0700 Subject: [PATCH 015/303] ocfs2: fix a memory leak in __ocfs2_move_extents() The ocfs2 path is not properly freed which leads to a memory leak at __ocfs2_move_extents(). This patch stops the leaks of the ocfs2_path structure. Signed-off-by: Jie Liu Reviewed-by: Younger Liu Cc: Joel Becker Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/move_extents.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 415928536c5e..3d3f3c83065c 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -152,6 +152,7 @@ static int __ocfs2_move_extent(handle_t *handle, } out: + ocfs2_free_path(path); return ret; } From 17caf9555edc27a0c6df512de0879b357ebacae4 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Wed, 11 Sep 2013 14:19:55 -0700 Subject: [PATCH 016/303] ocfs2: add the missing return value check of ocfs2_xattr_get_clusters In ocfs2_xattr_value_attach_refcount(), if error occurs when calling ocfs2_xattr_get_clusters(), it will go with unexpected behavior since local variables p_cluster, num_clusters and ext_flags are declared without initialization. 
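A minimal sketch of the failure mode, assuming the usual contract that
the out-parameters are only stored to on success:

	u32 p_cluster, num_clusters;	/* stack garbage until filled in */
	unsigned int ext_flags;

	ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
				       &num_clusters, el, &ext_flags);
	if (ret) {
		mlog_errno(ret);
		break;		/* bail out instead of consuming garbage */
	}
	cpos += num_clusters;	/* safe: the call succeeded */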
Signed-off-by: Joseph Qi
Cc: Joel Becker
Cc: Mark Fasheh
Acked-by: Jie Liu
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/xattr.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 317ef0abccbb..1cbc2231a9f2 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -5881,6 +5881,10 @@ static int ocfs2_xattr_value_attach_refcount(struct inode *inode,
	while (cpos < clusters) {
		ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
					       &num_clusters, el, &ext_flags);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}

		cpos += num_clusters;
		if ((ext_flags & OCFS2_EXT_REFCOUNTED))

From 6ea437a3639b15e312f81819bb20f737ff596194 Mon Sep 17 00:00:00 2001
From: Younger Liu
Date: Wed, 11 Sep 2013 14:19:56 -0700
Subject: [PATCH 017/303] ocfs2: free meta_ac and data_ac when ocfs2_start_trans fails in ocfs2_xattr_set()

In ocfs2_xattr_set(), if ocfs2_start_trans() fails, meta_ac and data_ac
should be freed.  Otherwise, it would lead to a memory leak.

Signed-off-by: Younger Liu
Cc: Joseph Qi
Reviewed-by: Jie Liu
Cc: Mark Fasheh
Cc: Joel Becker
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/xattr.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 1cbc2231a9f2..18330f5b57be 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -3505,7 +3505,7 @@ int ocfs2_xattr_set(struct inode *inode,
	int ret, credits, ref_meta = 0, ref_credits = 0;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct inode *tl_inode = osb->osb_tl_inode;
-	struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
+	struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, NULL, };
	struct ocfs2_refcount_tree *ref_tree = NULL;

	struct ocfs2_xattr_info xi = {
@@ -3609,13 +3609,14 @@ int ocfs2_xattr_set(struct inode *inode,
	if (IS_ERR(ctxt.handle)) {
		ret = PTR_ERR(ctxt.handle);
		mlog_errno(ret);
-		goto cleanup;
+		goto out_free_ac;
	}

	ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt);

	ocfs2_commit_trans(osb, ctxt.handle);

+out_free_ac:
	if (ctxt.data_ac)
		ocfs2_free_alloc_context(ctxt.data_ac);
	if (ctxt.meta_ac)

From 69b2bd16d9792085d57865fcaac55753803a4f5d Mon Sep 17 00:00:00 2001
From: Xue jiufei
Date: Wed, 11 Sep 2013 14:19:57 -0700
Subject: [PATCH 018/303] ocfs2/dlm: force clean refmap when doing local cleanup

dlm_do_local_recovery_cleanup() should force clean the refmap if the
owner of a lockres is UNKNOWN.  Otherwise a node may hang when
unmounting filesystems.  Here's the situation:

	Node1                                   Node2
	dlmlock()
	  -> dlm_get_lock_resource()
	send DLM_MASTER_REQUEST_MSG to
	other nodes.
	                                        trying to master this
	                                        lockres, return MAYBE.
	selected as the master of lockresA,
	set mle->master to Node1, and do
	assert_master, send
	DLM_ASSERT_MASTER_MSG to Node2.
	                                        Node2 has interest in
	                                        lockresA and returns
	                                        DLM_ASSERT_RESPONSE_MASTERY_REF;
	                                        then something happened
	                                        and Node2 crashed.
	Receiving DLM_ASSERT_RESPONSE_MASTERY_REF,
	set Node2 into refmap, and keep sending
	DLM_ASSERT_MASTER_MSG to other nodes.
	o2hb found Node2 down, calling
	dlm_hb_node_down() -->
	dlm_do_local_recovery_cleanup().
	The master of lockresA is still UNKNOWN,
	so no need to call dlm_free_dead_locks().
	Set the master of lockresA to Node1, but
	Node2 still remains in the refmap.

When Node1 umounts, it finds that the refmap of lockresA is not empty
and attempts to migrate it to Node2.  But Node2 is already down, so
umount hangs, trying to migrate lockresA again and again.
Signed-off-by: joyce Cc: Mark Fasheh Cc: Joel Becker Cc: Jie Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmrecovery.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index f94550218152..0b5adca1b178 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2331,6 +2331,14 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) } else if (res->owner == dlm->node_num) { dlm_free_dead_locks(dlm, res, dead_node); __dlm_lockres_calc_usage(dlm, res); + } else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { + if (test_bit(dead_node, res->refmap)) { + mlog(0, "%s:%.*s: dead node %u had a ref, but had " + "no locks and had not purged before dying\n", + dlm->name, res->lockname.len, + res->lockname.name, dead_node); + dlm_lockres_clear_refmap_bit(dlm, res, dead_node); + } } spin_unlock(&res->spinlock); } From 6cae6d3189ef34647bca9b9b1d240ebd760e5dea Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Wed, 11 Sep 2013 14:19:58 -0700 Subject: [PATCH 019/303] ocfs2: fix possible double free in ocfs2_reflink_xattr_rec In ocfs2_reflink_xattr_rec(), meta_ac and data_ac are allocated by calling ocfs2_lock_reflink_xattr_rec_allocators(). If an error occurs while allocating *data_ac, that function frees *meta_ac, which was allocated earlier, but it mistakenly sets the local meta_ac, rather than *meta_ac, to NULL. ocfs2_reflink_xattr_rec() will then try to free *meta_ac again, even though it is already invalid (a stand-alone sketch of this pattern follows after the next two patches). Signed-off-by: Joseph Qi Reviewed-by: Jie Liu Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 18330f5b57be..6ce0686eab72 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -6802,7 +6802,7 @@ out: if (ret) { if (*meta_ac) { ocfs2_free_alloc_context(*meta_ac); - meta_ac = NULL; + *meta_ac = NULL; } } From 7aebff18b91ebdefe15bb7d3f5d711df8312a7fb Mon Sep 17 00:00:00 2001 From: Younger Liu Date: Wed, 11 Sep 2013 14:19:59 -0700 Subject: [PATCH 020/303] ocfs2: free path in ocfs2_remove_inode_range() In ocfs2_remove_inode_range(), there is a memory leak: the variable path is allocated with ocfs2_new_path_from_et(), but it is never freed. Signed-off-by: Younger Liu Reviewed-by: Jie Liu Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 409c549ae02a..4f8197caa487 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1796,6 +1796,7 @@ static int ocfs2_remove_inode_range(struct inode *inode, ocfs2_truncate_cluster_pages(inode, byte_start, byte_len); out: + ocfs2_free_path(path); ocfs2_schedule_truncate_log_flush(osb, 1); ocfs2_run_deallocs(osb, &dealloc); From 9a239e4c68df78888f67b1d4e7d507e24ac6764f Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Wed, 11 Sep 2013 14:20:00 -0700 Subject: [PATCH 021/303] ocfs2: adjust code style for o2net_handler_tree_lookup() The indentation in o2net_handler_tree_lookup() appears to have been corrupted by mistake, so re-indent it to improve readability.
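A stand-alone C sketch of the double free fixed in ocfs2_reflink_xattr_rec() above; the allocator here is a stand-in, and only the pointer handling mirrors the commit:

#include <stdio.h>
#include <stdlib.h>

static int lock_allocators(int fail_data, void **meta_ac)
{
	*meta_ac = malloc(16);
	if (!fail_data)
		return 0;
	free(*meta_ac);		/* error path frees the earlier allocation... */
	*meta_ac = NULL;	/* ...and must clear *meta_ac, not a local copy */
	return -1;
}

int main(void)
{
	void *meta_ac = NULL;

	if (lock_allocators(1, &meta_ac) && meta_ac)
		free(meta_ac);	/* reached only if *meta_ac was left dangling */
	printf("meta_ac=%p (NULL means no double free)\n", meta_ac);
	return 0;
}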
Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/tcp.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index d04a3c2fad3c..8c50c238577a 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -766,32 +766,32 @@ static struct o2net_msg_handler * o2net_handler_tree_lookup(u32 msg_type, u32 key, struct rb_node ***ret_p, struct rb_node **ret_parent) { - struct rb_node **p = &o2net_handler_tree.rb_node; - struct rb_node *parent = NULL; + struct rb_node **p = &o2net_handler_tree.rb_node; + struct rb_node *parent = NULL; struct o2net_msg_handler *nmh, *ret = NULL; int cmp; - while (*p) { - parent = *p; - nmh = rb_entry(parent, struct o2net_msg_handler, nh_node); + while (*p) { + parent = *p; + nmh = rb_entry(parent, struct o2net_msg_handler, nh_node); cmp = o2net_handler_cmp(nmh, msg_type, key); - if (cmp < 0) - p = &(*p)->rb_left; - else if (cmp > 0) - p = &(*p)->rb_right; - else { + if (cmp < 0) + p = &(*p)->rb_left; + else if (cmp > 0) + p = &(*p)->rb_right; + else { ret = nmh; - break; + break; } - } + } - if (ret_p != NULL) - *ret_p = p; - if (ret_parent != NULL) - *ret_parent = parent; + if (ret_p != NULL) + *ret_p = p; + if (ret_parent != NULL) + *ret_parent = parent; - return ret; + return ret; } static void o2net_handler_kref_release(struct kref *kref) From 03dbe88aa9cd0d7b0a876b38bd75ce73b4522454 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Wed, 11 Sep 2013 14:20:01 -0700 Subject: [PATCH 022/303] ocfs2: avoid possible NULL pointer dereference in o2net_accept_one() Since o2nm_get_node_by_num() may return NULL, we add this check in o2net_accept_one() to avoid possible NULL pointer dereference. Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/tcp.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 8c50c238577a..2cd2406b4140 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1873,12 +1873,16 @@ static int o2net_accept_one(struct socket *sock) if (o2nm_this_node() >= node->nd_num) { local_node = o2nm_get_node_by_num(o2nm_this_node()); - printk(KERN_NOTICE "o2net: Unexpected connect attempt seen " - "at node '%s' (%u, %pI4:%d) from node '%s' (%u, " - "%pI4:%d)\n", local_node->nd_name, local_node->nd_num, - &(local_node->nd_ipv4_address), - ntohs(local_node->nd_ipv4_port), node->nd_name, - node->nd_num, &sin.sin_addr.s_addr, ntohs(sin.sin_port)); + if (local_node) + printk(KERN_NOTICE "o2net: Unexpected connect attempt " + "seen at node '%s' (%u, %pI4:%d) from " + "node '%s' (%u, %pI4:%d)\n", + local_node->nd_name, local_node->nd_num, + &(local_node->nd_ipv4_address), + ntohs(local_node->nd_ipv4_port), + node->nd_name, + node->nd_num, &sin.sin_addr.s_addr, + ntohs(sin.sin_port)); ret = -EINVAL; goto out; } From 6f8648e894498f769832b79399b1cfabd2973ea9 Mon Sep 17 00:00:00 2001 From: Joyce Date: Wed, 11 Sep 2013 14:20:03 -0700 Subject: [PATCH 023/303] ocfs2: fix a tiny race case when firing callbacks In o2hb_shutdown_slot() and o2hb_check_slot(), since event is defined as local, it is only valid during the call stack. 
So the following tiny race case may happen in an environment with multiple volumes mounted:

1) o2hb-vol1: o2hb_shutdown_slot allocates local event1
2) o2hb-vol1: queue_node_event adds event1 to the global o2hb_node_events
3) o2hb-vol2: o2hb_shutdown_slot allocates local event2
4) o2hb-vol2: queue_node_event adds event2 to the global o2hb_node_events
5) o2hb-vol2: o2hb_run_event_list deletes event1 from o2hb_node_events
6) o2hb-vol1: o2hb_run_event_list sees event1 already unlinked (empty) and returns
7) o2hb-vol1: o2hb_shutdown_slot returns, so event1's lifecycle ends
8) o2hb-vol2: o2hb_fire_callbacks uses event1, which is already *invalid*

This patch makes the queuing thread wait on o2hb_callback_sem while another thread is firing callbacks. As a performance consideration, o2hb_run_event_list is now called only when an event was actually queued. Signed-off-by: Joyce Signed-off-by: Joseph Qi Cc: Joel Becker Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/heartbeat.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 25b72e82b8fa..363f0dcc924f 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -639,16 +639,9 @@ static void o2hb_fire_callbacks(struct o2hb_callback *hbcall, /* Will run the list in order until we process the passed event */ static void o2hb_run_event_list(struct o2hb_node_event *queued_event) { - int empty; struct o2hb_callback *hbcall; struct o2hb_node_event *event; - spin_lock(&o2hb_live_lock); - empty = list_empty(&queued_event->hn_item); - spin_unlock(&o2hb_live_lock); - if (empty) - return; - /* Holding callback sem assures we don't alter the callback * lists when doing this, and serializes ourselves with other * processes wanting callbacks. */ @@ -707,6 +700,7 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot) struct o2hb_node_event event = { .hn_item = LIST_HEAD_INIT(event.hn_item), }; struct o2nm_node *node; + int queued = 0; node = o2nm_get_node_by_num(slot->ds_node_num); if (!node) @@ -724,11 +718,13 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot) o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node, slot->ds_node_num); + queued = 1; } } spin_unlock(&o2hb_live_lock); - o2hb_run_event_list(&event); + if (queued) + o2hb_run_event_list(&event); o2nm_node_put(node); } @@ -788,6 +784,7 @@ static int o2hb_check_slot(struct o2hb_region *reg, unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS; unsigned int slot_dead_ms; int tmp; + int queued = 0; memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes); @@ -881,6 +878,7 @@ fire_callbacks: slot->ds_node_num); changed = 1; + queued = 1; } list_add_tail(&slot->ds_live_item, @@ -932,6 +930,7 @@ fire_callbacks: node, slot->ds_node_num); changed = 1; + queued = 1; } /* We don't clear this because the node is still @@ -947,7 +946,8 @@ fire_callbacks: out: spin_unlock(&o2hb_live_lock); - o2hb_run_event_list(&event); + if (queued) + o2hb_run_event_list(&event); if (node) o2nm_node_put(node); From a72e27d3727b383be39498f8b5c9b944d30e0f9b Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Wed, 11 Sep 2013 14:20:04 -0700 Subject: [PATCH 024/303] ocfs2: remove unused variable ip in dlmfs_get_root_inode() Variable ip in dlmfs_get_root_inode() is defined but not used. So clean it up.
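A minimal user-space sketch of the stack-lifetime hazard behind the o2hb race fixed above; the structures are illustrative:

#include <stdio.h>

struct event { int node; };

static struct event *queued;	/* stands in for the global o2hb_node_events */

static void shutdown_slot(int node)
{
	struct event ev = { .node = node };	/* stack-local, like the o2hb events */

	queued = &ev;
	/* Returning ends ev's lifetime while the queue still points at it. */
}

int main(void)
{
	shutdown_slot(7);
	/* Dereferencing 'queued' here would be undefined behavior: the object
	 * died when shutdown_slot() returned. Printing the pointer is safe. */
	printf("dangling event pointer: %p\n", (void *)queued);
	return 0;
}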
Signed-off-by: Joseph Qi Reviewed-by: Jie Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmfs/dlmfs.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 12bafb7265ce..efa2b3d339e3 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -401,11 +401,8 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb) { struct inode *inode = new_inode(sb); umode_t mode = S_IFDIR | 0755; - struct dlmfs_inode_private *ip; if (inode) { - ip = DLMFS_I(inode); - inode->i_ino = get_next_ino(); inode_init_owner(inode, NULL, mode); inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; From 28e8be31803b19d0d8f76216cb11b480b8a98bec Mon Sep 17 00:00:00 2001 From: Jie Liu Date: Wed, 11 Sep 2013 14:20:05 -0700 Subject: [PATCH 025/303] ocfs2: fix the end cluster offset of FIEMAP Calling the fiemap ioctl(2) with a given start offset and a desired mapping range should show extents if possible. However, we adjust the end offset of the mapping via 'mapping_end -= cpos' before iterating the extent records, which causes problems if the given fiemap length is smaller than a cluster size, e.g., Cluster size 4096: debugfs.ocfs2 1.6.3 Block Size Bits: 12 Cluster Size Bits: 12 The extended fiemap test utility from David: https://gist.github.com/anonymous/6172331 # dd if=/dev/urandom of=/ocfs2/test_file bs=1M count=1000 # ./fiemap /ocfs2/test_file 4096 10 start: 4096, length: 10 File /ocfs2/test_file has 0 extents: # Logical Physical Length Flags ^^^^^ <-- No extent is shown In this case, at ocfs2_fiemap(): cpos == mapping_end == 1. Hence the loop of searching extent records was not executed at all. This patch removes the 'mapping_end -= cpos' in question and loops until cpos is larger than mapping_end, as usual. # ./fiemap /ocfs2/test_file 4096 10 start: 4096, length: 10 File /ocfs2/test_file has 1 extents: # Logical Physical Length Flags 0: 0000000000000000 0000000056a01000 0000000006a00000 0000 Signed-off-by: Jie Liu Reported-by: David Weber Tested-by: David Weber Cc: Sunil Mushran Cc: Mark Fasheh Cc: Joel Becker Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/extent_map.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 4bf2b763467f..767370b656ca 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -781,7 +781,6 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, cpos = map_start >> osb->s_clustersize_bits; mapping_end = ocfs2_clusters_for_bytes(inode->i_sb, map_start + map_len); - mapping_end -= cpos; is_last = 0; while (cpos < mapping_end && !is_last) { u32 fe_flags; From e1403b8edf669ff49bbdf602cc97fefa2760cb15 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:20:06 -0700 Subject: [PATCH 026/303] include/linux/sched.h: don't use task->pid/tgid in same_thread_group/has_group_leader_pid task_struct->pid/tgid should go away. 1. Change same_thread_group() to use task->signal for comparison. 2. Change has_group_leader_pid(task) to compare task_pid(task) with signal->leader_pid. Signed-off-by: Oleg Nesterov Cc: Michal Hocko Cc: Sergey Dyasly Reviewed-by: "Eric W.
Biederman" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index ce1e1c0aaa33..45f254dddafc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2169,15 +2169,15 @@ static inline bool thread_group_leader(struct task_struct *p) * all we care about is that we have a task with the appropriate * pid, we don't actually care if we have the right task. */ -static inline int has_group_leader_pid(struct task_struct *p) +static inline bool has_group_leader_pid(struct task_struct *p) { - return p->pid == p->tgid; + return task_pid(p) == p->signal->leader_pid; } static inline -int same_thread_group(struct task_struct *p1, struct task_struct *p2) +bool same_thread_group(struct task_struct *p1, struct task_struct *p2) { - return p1->tgid == p2->tgid; + return p1->signal == p2->signal; } static inline struct task_struct *next_thread(const struct task_struct *p) From bb8e0e84b30afc9827931c9773d75d5c99fcddff Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Wed, 11 Sep 2013 14:20:07 -0700 Subject: [PATCH 027/303] block: replace strict_strtoul() with kstrtoul() The use of strict_strtoul() is not preferred, because strict_strtoul() is obsolete. Thus, kstrtoul() should be used. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/osdblk.c | 2 +- drivers/block/rbd.c | 2 +- drivers/block/xen-blkback/xenbus.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c index 1bbc681688e4..79aa179305b5 100644 --- a/drivers/block/osdblk.c +++ b/drivers/block/osdblk.c @@ -598,7 +598,7 @@ static ssize_t class_osdblk_remove(struct class *c, unsigned long ul; struct list_head *tmp; - rc = strict_strtoul(buf, 10, &ul); + rc = kstrtoul(buf, 10, &ul); if (rc) return rc; diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 39c51cc7fabc..b22a7d0fe5b7 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -5132,7 +5132,7 @@ static ssize_t rbd_remove(struct bus_type *bus, bool already = false; int ret; - ret = strict_strtoul(buf, 10, &ul); + ret = kstrtoul(buf, 10, &ul); if (ret) return ret; diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index fe5c3cd10c34..c2014a0aa206 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -620,7 +620,7 @@ static void backend_changed(struct xenbus_watch *watch, } /* Front end dir is a number, which is used as the handle. */ - err = strict_strtoul(strrchr(dev->otherend, '/') + 1, 0, &handle); + err = kstrtoul(strrchr(dev->otherend, '/') + 1, 0, &handle); if (err) return; From ed751e683c563be64322b9bfa0f0f7e5da9bd37c Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Wed, 11 Sep 2013 14:20:08 -0700 Subject: [PATCH 028/303] block/blk-sysfs.c: replace strict_strtoul() with kstrtoul() The usage of strict_strtoul() is not preferred, because strict_strtoul() is obsolete. Thus, kstrtoul() should be used. 
Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/blk-sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 5efc5a647183..3aa5b195f4dd 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -29,7 +29,7 @@ queue_var_store(unsigned long *var, const char *page, size_t count) int err; unsigned long v; - err = strict_strtoul(page, 10, &v); + err = kstrtoul(page, 10, &v); if (err || v > UINT_MAX) return -EINVAL; From bab55417b10c95e6bff8cea315c315adfa009487 Mon Sep 17 00:00:00 2001 From: Cai Zhiyong Date: Wed, 11 Sep 2013 14:20:09 -0700 Subject: [PATCH 029/303] block: support embedded device command line partition Read the block device partition table from the command line. The partition table is used for fixed block devices (eMMC) in embedded devices. There is no MBR, which saves storage space, and the bootloader can easily access data on the block device by absolute address. Users can easily change the partitioning. This code references the MTD partition parser, "drivers/mtd/cmdlinepart.c". For details on the partition format, see "Documentation/block/cmdline-partition.txt". [akpm@linux-foundation.org: fix printk text] [yongjun_wei@trendmicro.com.cn: fix error return code in parse_parts()] Signed-off-by: Cai Zhiyong Cc: Karel Zak Cc: "Wanglin (Albert)" Cc: Marius Groeger Cc: David Woodhouse Cc: Jens Axboe Cc: Brian Norris Cc: Artem Bityutskiy Signed-off-by: Wei Yongjun Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/block/cmdline-partition.txt | 39 ++++ block/Kconfig | 6 + block/Makefile | 1 + block/cmdline-parser.c | 250 ++++++++++++++++++++++ block/partitions/Kconfig | 7 + block/partitions/Makefile | 1 + block/partitions/check.c | 4 + block/partitions/cmdline.c | 99 +++++++++ block/partitions/cmdline.h | 2 + include/linux/cmdline-parser.h | 43 ++++ 10 files changed, 452 insertions(+) create mode 100644 Documentation/block/cmdline-partition.txt create mode 100644 block/cmdline-parser.c create mode 100644 block/partitions/cmdline.c create mode 100644 block/partitions/cmdline.h create mode 100644 include/linux/cmdline-parser.h diff --git a/Documentation/block/cmdline-partition.txt b/Documentation/block/cmdline-partition.txt new file mode 100644 index 000000000000..2bbf4cc40c3f --- /dev/null +++ b/Documentation/block/cmdline-partition.txt @@ -0,0 +1,39 @@ +Embedded device command line partition +===================================================================== + +Read block device partition table from command line. +The partition used for fixed block device (eMMC) embedded device. +It is no MBR, save storage space. Bootloader can be easily accessed +by absolute address of data on the block device. +Users can easily change the partition. + +The format for the command line is just like mtdparts: + +blkdevparts=<blkdev-def>[;<blkdev-def>] + <blkdev-def> := <blkdev-id>:<partdef>[,<partdef>] + <partdef> := <size>[@<offset>](part-name) + +<blkdev-id> + block device disk name, embedded device used fixed block device, + it's disk name also fixed. such as: mmcblk0, mmcblk1, mmcblk0boot0. + +<size> + partition size, in bytes, such as: 512, 1m, 1G. + +<offset> + partition start address, in bytes. + +(part-name) + partition name, kernel send uevent with "PARTNAME". application can create + a link to block device partition with the name "PARTNAME". + user space application can access partition by partition name.
+ +Example: + eMMC disk name is "mmcblk0" and "mmcblk0boot0" + + bootargs: + 'blkdevparts=mmcblk0:1G(data0),1G(data1),-;mmcblk0boot0:1m(boot),-(kernel)' + + dmesg: + mmcblk0: p1(data0) p2(data1) p3() + mmcblk0boot0: p1(boot) p2(kernel) diff --git a/block/Kconfig b/block/Kconfig index a7e40a7c8214..7f38e40fee08 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -99,6 +99,12 @@ config BLK_DEV_THROTTLING See Documentation/cgroups/blkio-controller.txt for more information. +config CMDLINE_PARSER + bool "Block device command line partition parser" + default n + ---help--- + Parsing command line, get the partitions information. + menu "Partition Types" source "block/partitions/Kconfig" diff --git a/block/Makefile b/block/Makefile index 39b76ba66ffd..4fa4be544ece 100644 --- a/block/Makefile +++ b/block/Makefile @@ -18,3 +18,4 @@ obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o +obj-$(CONFIG_CMDLINE_PARSER) += cmdline-parser.o diff --git a/block/cmdline-parser.c b/block/cmdline-parser.c new file mode 100644 index 000000000000..cc2637f8674e --- /dev/null +++ b/block/cmdline-parser.c @@ -0,0 +1,250 @@ +/* + * Parse command line, get partition information + * + * Written by Cai Zhiyong + * + */ +#include +#include +#include + +static int parse_subpart(struct cmdline_subpart **subpart, char *partdef) +{ + int ret = 0; + struct cmdline_subpart *new_subpart; + + *subpart = NULL; + + new_subpart = kzalloc(sizeof(struct cmdline_subpart), GFP_KERNEL); + if (!new_subpart) + return -ENOMEM; + + if (*partdef == '-') { + new_subpart->size = (sector_t)(~0ULL); + partdef++; + } else { + new_subpart->size = (sector_t)memparse(partdef, &partdef); + if (new_subpart->size < (sector_t)PAGE_SIZE) { + pr_warn("cmdline partition size is invalid."); + ret = -EINVAL; + goto fail; + } + } + + if (*partdef == '@') { + partdef++; + new_subpart->from = (sector_t)memparse(partdef, &partdef); + } else { + new_subpart->from = (sector_t)(~0ULL); + } + + if (*partdef == '(') { + int length; + char *next = strchr(++partdef, ')'); + + if (!next) { + pr_warn("cmdline partition format is invalid."); + ret = -EINVAL; + goto fail; + } + + length = min_t(int, next - partdef, + sizeof(new_subpart->name) - 1); + strncpy(new_subpart->name, partdef, length); + new_subpart->name[length] = '\0'; + + partdef = ++next; + } else + new_subpart->name[0] = '\0'; + + new_subpart->flags = 0; + + if (!strncmp(partdef, "ro", 2)) { + new_subpart->flags |= PF_RDONLY; + partdef += 2; + } + + if (!strncmp(partdef, "lk", 2)) { + new_subpart->flags |= PF_POWERUP_LOCK; + partdef += 2; + } + + *subpart = new_subpart; + return 0; +fail: + kfree(new_subpart); + return ret; +} + +static void free_subpart(struct cmdline_parts *parts) +{ + struct cmdline_subpart *subpart; + + while (parts->subpart) { + subpart = parts->subpart; + parts->subpart = subpart->next_subpart; + kfree(subpart); + } +} + +static int parse_parts(struct cmdline_parts **parts, const char *bdevdef) +{ + int ret = -EINVAL; + char *next; + int length; + struct cmdline_subpart **next_subpart; + struct cmdline_parts *newparts; + char buf[BDEVNAME_SIZE + 32 + 4]; + + *parts = NULL; + + newparts = kzalloc(sizeof(struct cmdline_parts), GFP_KERNEL); + if (!newparts) + return -ENOMEM; + + next = strchr(bdevdef, ':'); + if (!next) { + pr_warn("cmdline partition has no block device."); + goto fail; + } + + length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1); + strncpy(newparts->name, bdevdef, length); + 
newparts->name[length] = '\0'; + newparts->nr_subparts = 0; + + next_subpart = &newparts->subpart; + + while (next && *(++next)) { + bdevdef = next; + next = strchr(bdevdef, ','); + + length = (!next) ? (sizeof(buf) - 1) : + min_t(int, next - bdevdef, sizeof(buf) - 1); + + strncpy(buf, bdevdef, length); + buf[length] = '\0'; + + ret = parse_subpart(next_subpart, buf); + if (ret) + goto fail; + + newparts->nr_subparts++; + next_subpart = &(*next_subpart)->next_subpart; + } + + if (!newparts->subpart) { + pr_warn("cmdline partition has no valid partition."); + ret = -EINVAL; + goto fail; + } + + *parts = newparts; + + return 0; +fail: + free_subpart(newparts); + kfree(newparts); + return ret; +} + +void cmdline_parts_free(struct cmdline_parts **parts) +{ + struct cmdline_parts *next_parts; + + while (*parts) { + next_parts = (*parts)->next_parts; + free_subpart(*parts); + kfree(*parts); + *parts = next_parts; + } +} + +int cmdline_parts_parse(struct cmdline_parts **parts, const char *cmdline) +{ + int ret; + char *buf; + char *pbuf; + char *next; + struct cmdline_parts **next_parts; + + *parts = NULL; + + next = pbuf = buf = kstrdup(cmdline, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + next_parts = parts; + + while (next && *pbuf) { + next = strchr(pbuf, ';'); + if (next) + *next = '\0'; + + ret = parse_parts(next_parts, pbuf); + if (ret) + goto fail; + + if (next) + pbuf = ++next; + + next_parts = &(*next_parts)->next_parts; + } + + if (!*parts) { + pr_warn("cmdline partition has no valid partition."); + ret = -EINVAL; + goto fail; + } + + ret = 0; +done: + kfree(buf); + return ret; + +fail: + cmdline_parts_free(parts); + goto done; +} + +struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts, + const char *bdev) +{ + while (parts && strncmp(bdev, parts->name, sizeof(parts->name))) + parts = parts->next_parts; + return parts; +} + +/* + * add_part() + * 0 success. + * 1 can not add so many partitions. + */ +void cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size, + int slot, + int (*add_part)(int, struct cmdline_subpart *, void *), + void *param) + +{ + sector_t from = 0; + struct cmdline_subpart *subpart; + + for (subpart = parts->subpart; subpart; + subpart = subpart->next_subpart, slot++) { + if (subpart->from == (sector_t)(~0ULL)) + subpart->from = from; + else + from = subpart->from; + + if (from >= disk_size) + break; + + if (subpart->size > (disk_size - from)) + subpart->size = disk_size - from; + + from += subpart->size; + + if (add_part(slot, subpart, param)) + break; + } +} diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig index 4cebb2f0d2f4..87a32086535d 100644 --- a/block/partitions/Kconfig +++ b/block/partitions/Kconfig @@ -260,3 +260,10 @@ config SYSV68_PARTITION partition table format used by Motorola Delta machines (using sysv68). Otherwise, say N. + +config CMDLINE_PARTITION + bool "Command line partition support" if PARTITION_ADVANCED + select CMDLINE_PARSER + help + Say Y here if you would read the partitions table from bootargs. + The format for the command line is just like mtdparts. 
diff --git a/block/partitions/Makefile b/block/partitions/Makefile index 2be4d7ba4e3a..37a95270503c 100644 --- a/block/partitions/Makefile +++ b/block/partitions/Makefile @@ -8,6 +8,7 @@ obj-$(CONFIG_ACORN_PARTITION) += acorn.o obj-$(CONFIG_AMIGA_PARTITION) += amiga.o obj-$(CONFIG_ATARI_PARTITION) += atari.o obj-$(CONFIG_AIX_PARTITION) += aix.o +obj-$(CONFIG_CMDLINE_PARTITION) += cmdline.o obj-$(CONFIG_MAC_PARTITION) += mac.o obj-$(CONFIG_LDM_PARTITION) += ldm.o obj-$(CONFIG_MSDOS_PARTITION) += msdos.o diff --git a/block/partitions/check.c b/block/partitions/check.c index 19ba207ea7d1..9ac1df74f699 100644 --- a/block/partitions/check.c +++ b/block/partitions/check.c @@ -34,6 +34,7 @@ #include "efi.h" #include "karma.h" #include "sysv68.h" +#include "cmdline.h" int warn_no_part = 1; /*This is ugly: should make genhd removable media aware*/ @@ -65,6 +66,9 @@ static int (*check_part[])(struct parsed_partitions *) = { adfspart_check_ADFS, #endif +#ifdef CONFIG_CMDLINE_PARTITION + cmdline_partition, +#endif #ifdef CONFIG_EFI_PARTITION efi_partition, /* this must come before msdos */ #endif diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c new file mode 100644 index 000000000000..56cf4ffad51e --- /dev/null +++ b/block/partitions/cmdline.c @@ -0,0 +1,99 @@ +/* + * Copyright (C) 2013 HUAWEI + * Author: Cai Zhiyong + * + * Read block device partition table from command line. + * The partition used for fixed block device (eMMC) embedded device. + * It is no MBR, save storage space. Bootloader can be easily accessed + * by absolute address of data on the block device. + * Users can easily change the partition. + * + * The format for the command line is just like mtdparts. + * + * Verbose config please reference "Documentation/block/cmdline-partition.txt" + * + */ + +#include + +#include "check.h" +#include "cmdline.h" + +static char *cmdline; +static struct cmdline_parts *bdev_parts; + +static int add_part(int slot, struct cmdline_subpart *subpart, void *param) +{ + int label_min; + struct partition_meta_info *info; + char tmp[sizeof(info->volname) + 4]; + struct parsed_partitions *state = (struct parsed_partitions *)param; + + if (slot >= state->limit) + return 1; + + put_partition(state, slot, subpart->from >> 9, + subpart->size >> 9); + + info = &state->parts[slot].info; + + label_min = min_t(int, sizeof(info->volname) - 1, + sizeof(subpart->name)); + strncpy(info->volname, subpart->name, label_min); + info->volname[label_min] = '\0'; + + snprintf(tmp, sizeof(tmp), "(%s)", info->volname); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + + state->parts[slot].has_info = true; + + return 0; +} + +static int __init cmdline_parts_setup(char *s) +{ + cmdline = s; + return 1; +} +__setup("blkdevparts=", cmdline_parts_setup); + +/* + * Purpose: allocate cmdline partitions. 
+ * Returns: + * -1 if unable to read the partition table + * 0 if this isn't our partition table + * 1 if successful + */ +int cmdline_partition(struct parsed_partitions *state) +{ + sector_t disk_size; + char bdev[BDEVNAME_SIZE]; + struct cmdline_parts *parts; + + if (cmdline) { + if (bdev_parts) + cmdline_parts_free(&bdev_parts); + + if (cmdline_parts_parse(&bdev_parts, cmdline)) { + cmdline = NULL; + return -1; + } + cmdline = NULL; + } + + if (!bdev_parts) + return 0; + + bdevname(state->bdev, bdev); + parts = cmdline_parts_find(bdev_parts, bdev); + if (!parts) + return 0; + + disk_size = get_capacity(state->bdev->bd_disk) << 9; + + cmdline_parts_set(parts, disk_size, 1, add_part, (void *)state); + + strlcat(state->pp_buf, "\n", PAGE_SIZE); + + return 1; +} diff --git a/block/partitions/cmdline.h b/block/partitions/cmdline.h new file mode 100644 index 000000000000..26e0f8da1414 --- /dev/null +++ b/block/partitions/cmdline.h @@ -0,0 +1,2 @@ + +int cmdline_partition(struct parsed_partitions *state); diff --git a/include/linux/cmdline-parser.h b/include/linux/cmdline-parser.h new file mode 100644 index 000000000000..98e892ef6d5a --- /dev/null +++ b/include/linux/cmdline-parser.h @@ -0,0 +1,43 @@ +/* + * Parsing command line, get the partitions information. + * + * Written by Cai Zhiyong + * + */ +#ifndef CMDLINEPARSEH +#define CMDLINEPARSEH + +#include + +/* partition flags */ +#define PF_RDONLY 0x01 /* Device is read only */ +#define PF_POWERUP_LOCK 0x02 /* Always locked after reset */ + +struct cmdline_subpart { + char name[BDEVNAME_SIZE]; /* partition name, such as 'rootfs' */ + sector_t from; + sector_t size; + int flags; + struct cmdline_subpart *next_subpart; +}; + +struct cmdline_parts { + char name[BDEVNAME_SIZE]; /* block device, such as 'mmcblk0' */ + unsigned int nr_subparts; + struct cmdline_subpart *subpart; + struct cmdline_parts *next_parts; +}; + +void cmdline_parts_free(struct cmdline_parts **parts); + +int cmdline_parts_parse(struct cmdline_parts **parts, const char *cmdline); + +struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts, + const char *bdev); + +void cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size, + int slot, + int (*add_part)(int, struct cmdline_subpart *, void *), + void *param); + +#endif /* CMDLINEPARSEH */ From c86db975c87976a234f13a0c2d7f931d8ede493b Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Wed, 11 Sep 2013 14:20:10 -0700 Subject: [PATCH 030/303] drivers/block/mg_disk.c: make mg_times_out() static mg_times_out() is used only in this file. Fix the following sparse warning: drivers/block/mg_disk.c:639:6: warning: symbol 'mg_times_out' was not declared. Should it be static? Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/mg_disk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index a56cfcd5d648..77a60bedd7a3 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -636,7 +636,7 @@ ok_to_write: mg_request(host->breq); } -void mg_times_out(unsigned long data) +static void mg_times_out(unsigned long data) { struct mg_host *host = (struct mg_host *)data; char *name; From e7b18ede4443c0207b9fd849cf604e67c6f38fc9 Mon Sep 17 00:00:00 2001 From: Mike Miller Date: Wed, 11 Sep 2013 14:20:11 -0700 Subject: [PATCH 031/303] cciss: set max scatter gather entries to 32 on P600 At one time we used to set the maximum number of scatter gather elements on all Smart Array controllers to 32. 
At some point in time the firmware began to write the "appropriate" value for each controller into the config table. The cciss driver would then read that and set h->maxsgentries: h->maxsgentries = readl(&(h->cfgtable->MaxSGElements)); On the P600 that value is 544. Under some workloads a significant performance reduction may result. This patch forces the P600 to use only 32 scatter gather elements. Other controllers are not affected. Signed-off-by: Mike Miller Signed-off-by: Dwight (Bud) Brown Signed-off-by: Tomas Henzl Acked-by: Stephen M. Cameron Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/cciss.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 62b6c2cc80b5..d2d95ff5353b 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -4257,6 +4257,13 @@ static void cciss_find_board_params(ctlr_info_t *h) cciss_get_max_perf_mode_cmds(h); h->nr_cmds = h->max_commands - 4 - cciss_tape_cmds; h->maxsgentries = readl(&(h->cfgtable->MaxSGElements)); + /* + * The P600 may exhibit poor performance under some workloads + * if we use the value in the configuration table. Limit this + * controller to MAXSGENTRIES (32) instead. + */ + if (h->board_id == 0x3225103C) + h->maxsgentries = MAXSGENTRIES; /* * Limit in-command s/g elements to 32 save dma'able memory. * Howvever spec says if 0, use 31 From c07303c0af38ffb1e5fd9b5ff37d0798298a7acf Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Wed, 11 Sep 2013 14:20:13 -0700 Subject: [PATCH 032/303] drivers/block/swim.c: remove unnecessary platform_set_drvdata() The driver core clears the driver data to NULL after device_release or on probe failure. Thus, there is no need to manually clear the device driver data to NULL. Signed-off-by: Jingoo Han Cc: Jean Delvare Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/swim.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/block/swim.c b/drivers/block/swim.c index 8ed6ccb748cf..b02d53a399f3 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -924,7 +924,6 @@ static int swim_probe(struct platform_device *dev) return 0; out_kfree: - platform_set_drvdata(dev, NULL); kfree(swd); out_iounmap: iounmap(swim_base); @@ -962,7 +961,6 @@ static int swim_remove(struct platform_device *dev) if (res) release_mem_region(res->start, resource_size(res)); - platform_set_drvdata(dev, NULL); kfree(swd); return 0; From ef0855d334e1e4af7c3e0c42146a8479ea14a5ab Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:20:14 -0700 Subject: [PATCH 033/303] mm: mempolicy: turn vma_set_policy() into vma_dup_policy() Simple cleanup. Every user of vma_set_policy() does the same work, which looks a bit annoying. Add a new trivial helper that does mpol_dup() + vma_set_policy() to simplify the callers.
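The shape of the cleanup, as a stand-alone C sketch; policy_dup()/area_dup_policy() are illustrative stand-ins for mpol_dup()/vma_dup_policy(), with malloc failure in place of IS_ERR():

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct policy { int mode; };
struct area { struct policy *pol; };

static struct policy *policy_dup(const struct policy *src)
{
	struct policy *p = malloc(sizeof(*p));

	if (p)
		*p = *src;
	return p;
}

/* Each caller used to open-code dup + error check + assignment;
 * the helper keeps that in one place. */
static int area_dup_policy(const struct area *src, struct area *dst)
{
	struct policy *pol = policy_dup(src->pol);

	if (!pol)
		return -ENOMEM;
	dst->pol = pol;
	return 0;
}

int main(void)
{
	struct policy base = { .mode = 1 };
	struct area src = { .pol = &base }, dst = { 0 };

	if (area_dup_policy(&src, &dst))
		return 1;
	printf("duplicated policy mode: %d\n", dst.pol->mode);
	free(dst.pol);
	return 0;
}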
Signed-off-by: Oleg Nesterov Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 9 +++++++-- kernel/fork.c | 9 +++------ mm/mempolicy.c | 10 ++++++++++ mm/mmap.c | 17 +++++------------ 4 files changed, 25 insertions(+), 20 deletions(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 0d7df39a5885..b2f897789838 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -91,7 +91,6 @@ static inline struct mempolicy *mpol_dup(struct mempolicy *pol) } #define vma_policy(vma) ((vma)->vm_policy) -#define vma_set_policy(vma, pol) ((vma)->vm_policy = (pol)) static inline void mpol_get(struct mempolicy *pol) { @@ -126,6 +125,7 @@ struct shared_policy { spinlock_t lock; }; +int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst); void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol); int mpol_set_shared_policy(struct shared_policy *info, struct vm_area_struct *vma, @@ -240,7 +240,12 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) } #define vma_policy(vma) NULL -#define vma_set_policy(vma, pol) do {} while(0) + +static inline int +vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) +{ + return 0; +} static inline void numa_policy_init(void) { diff --git a/kernel/fork.c b/kernel/fork.c index 84703db06cf3..81ccb4f010c2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -351,7 +351,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) struct rb_node **rb_link, *rb_parent; int retval; unsigned long charge; - struct mempolicy *pol; uprobe_start_dup_mmap(); down_write(&oldmm->mmap_sem); @@ -400,11 +399,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) goto fail_nomem; *tmp = *mpnt; INIT_LIST_HEAD(&tmp->anon_vma_chain); - pol = mpol_dup(vma_policy(mpnt)); - retval = PTR_ERR(pol); - if (IS_ERR(pol)) + retval = vma_dup_policy(mpnt, tmp); + if (retval) goto fail_nomem_policy; - vma_set_policy(tmp, pol); tmp->vm_mm = mm; if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; @@ -472,7 +469,7 @@ out: uprobe_end_dup_mmap(); return retval; fail_nomem_anon_vma_fork: - mpol_put(pol); + mpol_put(vma_policy(tmp)); fail_nomem_policy: kmem_cache_free(vm_area_cachep, tmp); fail_nomem: diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4baf12e534d1..6b1d426731ae 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2065,6 +2065,16 @@ retry_cpuset: } EXPORT_SYMBOL(alloc_pages_current); +int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) +{ + struct mempolicy *pol = mpol_dup(vma_policy(src)); + + if (IS_ERR(pol)) + return PTR_ERR(pol); + dst->vm_policy = pol; + return 0; +} + /* * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it * rebinds the mempolicy its copying by calling mpol_rebind_policy() diff --git a/mm/mmap.c b/mm/mmap.c index f9c97d10b873..14f6bb4830f7 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2380,7 +2380,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long addr, int new_below) { - struct mempolicy *pol; struct vm_area_struct *new; int err = -ENOMEM; @@ -2404,12 +2403,9 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); } - pol = mpol_dup(vma_policy(vma)); - if (IS_ERR(pol)) { - 
err = PTR_ERR(pol); + err = vma_dup_policy(vma, new); + if (err) goto out_free_vma; - } - vma_set_policy(new, pol); if (anon_vma_clone(new, vma)) goto out_free_mpol; @@ -2437,7 +2433,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, fput(new->vm_file); unlink_anon_vmas(new); out_free_mpol: - mpol_put(pol); + mpol_put(vma_policy(new)); out_free_vma: kmem_cache_free(vm_area_cachep, new); out_err: @@ -2780,7 +2776,6 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma, *prev; struct rb_node **rb_link, *rb_parent; - struct mempolicy *pol; bool faulted_in_anon_vma = true; /* @@ -2825,10 +2820,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, new_vma->vm_start = addr; new_vma->vm_end = addr + len; new_vma->vm_pgoff = pgoff; - pol = mpol_dup(vma_policy(vma)); - if (IS_ERR(pol)) + if (vma_dup_policy(vma, new_vma)) goto out_free_vma; - vma_set_policy(new_vma, pol); INIT_LIST_HEAD(&new_vma->anon_vma_chain); if (anon_vma_clone(new_vma, vma)) goto out_free_mempol; @@ -2843,7 +2836,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, return new_vma; out_free_mempol: - mpol_put(pol); + mpol_put(vma_policy(new_vma)); out_free_vma: kmem_cache_free(vm_area_cachep, new_vma); return NULL; From ec9bed9d385fd094b20fa0809c50741710afdc74 Mon Sep 17 00:00:00 2001 From: Vladimir Cernov Date: Wed, 11 Sep 2013 14:20:15 -0700 Subject: [PATCH 034/303] mm/madvise.c: fix coding-style errors This fixes following errors: - ERROR: "(foo*)" should be "(foo *)" - ERROR: "foo ** bar" should be "foo **bar" Signed-off-by: Vladimir Cernov Reviewed-by: Pekka Enberg Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 7055883e6e25..936799f042cc 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -42,11 +42,11 @@ static int madvise_need_mmap_write(int behavior) * We can potentially split a vm area into separate * areas, each area with its own behavior. */ -static long madvise_behavior(struct vm_area_struct * vma, +static long madvise_behavior(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, int behavior) { - struct mm_struct * mm = vma->vm_mm; + struct mm_struct *mm = vma->vm_mm; int error = 0; pgoff_t pgoff; unsigned long new_flags = vma->vm_flags; @@ -215,8 +215,8 @@ static void force_shm_swapin_readahead(struct vm_area_struct *vma, /* * Schedule all required I/O operations. Do not wait for completion. */ -static long madvise_willneed(struct vm_area_struct * vma, - struct vm_area_struct ** prev, +static long madvise_willneed(struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, unsigned long end) { struct file *file = vma->vm_file; @@ -270,8 +270,8 @@ static long madvise_willneed(struct vm_area_struct * vma, * An interface that causes the system to free clean pages and flush * dirty pages is already available as msync(MS_INVALIDATE). 
*/ -static long madvise_dontneed(struct vm_area_struct * vma, - struct vm_area_struct ** prev, +static long madvise_dontneed(struct vm_area_struct *vma, + struct vm_area_struct **prev, unsigned long start, unsigned long end) { *prev = vma; @@ -459,7 +459,7 @@ madvise_behavior_valid(int behavior) SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) { unsigned long end, tmp; - struct vm_area_struct * vma, *prev; + struct vm_area_struct *vma, *prev; int unmapped_error = 0; int error = -EINVAL; int write; From d6bbbd29b1de2da807753be8a3a992a72aef42de Mon Sep 17 00:00:00 2001 From: Raymond Jennings Date: Wed, 11 Sep 2013 14:20:16 -0700 Subject: [PATCH 035/303] swap: warn when a swap area overflows the maximum size It is possible to swapon a swap area that is too big for the pte width to handle. Presently this failure happens silently. Instead, emit a diagnostic to warn the user. Testing results, root prompt commands and kernel log messages: # lvresize /dev/system/swap --size 16G # mkswap /dev/system/swap # swapon /dev/system/swap Jul 7 04:27:22 warfang kernel: Adding 16777212k swap on /dev/mapper/system-swap. Priority:-1 extents:1 across:16777212k # lvresize /dev/system/swap --size 64G # mkswap /dev/system/swap # swapon /dev/system/swap Jul 7 04:27:22 warfang kernel: Truncating oversized swap area, only using 33554432k out of 67108860k Jul 7 04:27:22 warfang kernel: Adding 33554428k swap on /dev/mapper/system-swap. Priority:-1 extents:1 across:33554428k [akpm@linux-foundation.org: fix warning] Signed-off-by: Raymond Jennings Acked-by: Valdis Kletnieks Reviewed-by: Rik van Riel Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 6cf2e60983b7..b5212eea6c3c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1926,6 +1926,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p, int i; unsigned long maxpages; unsigned long swapfilepages; + unsigned long last_page; if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { printk(KERN_ERR "Unable to find swap-space signature\n"); @@ -1968,8 +1969,15 @@ static unsigned long read_swap_header(struct swap_info_struct *p, */ maxpages = swp_offset(pte_to_swp_entry( swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; - if (maxpages > swap_header->info.last_page) { - maxpages = swap_header->info.last_page + 1; + last_page = swap_header->info.last_page; + if (last_page > maxpages) { + printk(KERN_WARNING + "Truncating oversized swap area, only using %luk out of %luk\n", + maxpages << (PAGE_SHIFT - 10), + last_page << (PAGE_SHIFT - 10)); + } + if (maxpages > last_page) { + maxpages = last_page + 1; /* p->max is an unsigned int: don't overflow it */ if ((unsigned int)maxpages == 0) maxpages = UINT_MAX; From 465c47fd8dc44302fed6c4eab8927464744ce08c Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 11 Sep 2013 14:20:17 -0700 Subject: [PATCH 036/303] mm/swapfile.c: convert to pr_foo() A few 80-col gymnastics were cleaned up as a result. 
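A user-space sketch of what the pr_*() family buys over raw printk(KERN_...): the level prefix is attached in one place, so call sites get shorter. These macros only approximate the kernel's, which expand to printk(KERN_ERR pr_fmt(fmt), ...):

#include <stdio.h>

#define pr_fmt(fmt) fmt
/* "<3>" and "<4>" mirror the KERN_ERR and KERN_WARNING level prefixes. */
#define pr_err(fmt, ...)  fprintf(stderr, "<3>" pr_fmt(fmt), ##__VA_ARGS__)
#define pr_warn(fmt, ...) fprintf(stderr, "<4>" pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
	pr_err("swap_free: %s%08lx\n", "Bad_file ", 0x1234UL);
	pr_warn("Empty swap-file\n");
	return 0;
}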
Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index b5212eea6c3c..6ef2d15c5fe3 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -527,16 +527,16 @@ static struct swap_info_struct *swap_info_get(swp_entry_t entry) return p; bad_free: - printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val); + pr_err("swap_free: %s%08lx\n", Unused_offset, entry.val); goto out; bad_offset: - printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val); + pr_err("swap_free: %s%08lx\n", Bad_offset, entry.val); goto out; bad_device: - printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val); + pr_err("swap_free: %s%08lx\n", Unused_file, entry.val); goto out; bad_nofile: - printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); + pr_err("swap_free: %s%08lx\n", Bad_file, entry.val); out: return NULL; } @@ -1929,7 +1929,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p, unsigned long last_page; if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) { - printk(KERN_ERR "Unable to find swap-space signature\n"); + pr_err("Unable to find swap-space signature\n"); return 0; } @@ -1943,9 +1943,8 @@ static unsigned long read_swap_header(struct swap_info_struct *p, } /* Check the swap header's sub-version */ if (swap_header->info.version != 1) { - printk(KERN_WARNING - "Unable to handle swap header version %d\n", - swap_header->info.version); + pr_warn("Unable to handle swap header version %d\n", + swap_header->info.version); return 0; } @@ -1971,8 +1970,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p, swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; last_page = swap_header->info.last_page; if (last_page > maxpages) { - printk(KERN_WARNING - "Truncating oversized swap area, only using %luk out of %luk\n", + pr_warn("Truncating oversized swap area, only using %luk out of %luk\n", maxpages << (PAGE_SHIFT - 10), last_page << (PAGE_SHIFT - 10)); } @@ -1988,8 +1986,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p, return 0; swapfilepages = i_size_read(inode) >> PAGE_SHIFT; if (swapfilepages && maxpages > swapfilepages) { - printk(KERN_WARNING - "Swap area shorter than signature indicates\n"); + pr_warn("Swap area shorter than signature indicates\n"); return 0; } if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) @@ -2032,7 +2029,7 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, nr_good_pages = p->pages; } if (!nr_good_pages) { - printk(KERN_WARNING "Empty swap-file\n"); + pr_warn("Empty swap-file\n"); return -EINVAL; } @@ -2186,8 +2183,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (p->flags & SWP_AREA_DISCARD) { int err = discard_swap(p); if (unlikely(err)) - printk(KERN_ERR - "swapon: discard_swap(%p): %d\n", + pr_err("swapon: discard_swap(%p): %d\n", p, err); } } @@ -2200,7 +2196,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; enable_swap_info(p, prio, swap_map, frontswap_map); - printk(KERN_INFO "Adding %uk swap on %s. " + pr_info("Adding %uk swap on %s. 
" "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", p->pages<<(PAGE_SHIFT-10), name->name, p->prio, nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), @@ -2334,7 +2330,7 @@ out: return err; bad_file: - printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); + pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val); goto out; } From b2c56e4f7d93be3f33a82ec66f0d0f46713ff5f1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:20:18 -0700 Subject: [PATCH 037/303] mm: shift VM_GROWS* check from mmap_region() to do_mmap_pgoff() mmap() doesn't allow the non-anonymous mappings with VM_GROWS* bit set. In particular this means that mmap_region()->vma_merge(file, vm_flags) must always fail if "vm_flags & VM_GROWS" is set incorrectly. So it does not make sense to check VM_GROWS* after we already allocated the new vma, the only caller, do_mmap_pgoff(), which can pass this flag can do the check itself. And this looks a bit more correct, mmap_region() already unmapped the old mapping at this stage. But if mmap() is going to fail, it should avoid do_munmap() if possible. Note: we check VM_GROWS at the end to ensure that do_mmap_pgoff() won't return EINVAL in the case when it currently returns another error code. Many thanks to Hugh who nacked the buggy v1. Signed-off-by: Oleg Nesterov Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 14f6bb4830f7..6cff7ba24a34 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1302,6 +1302,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, if (!file->f_op || !file->f_op->mmap) return -ENODEV; + if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) + return -EINVAL; break; default: @@ -1310,6 +1312,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, } else { switch (flags & MAP_TYPE) { case MAP_SHARED: + if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) + return -EINVAL; /* * Ignore pgoff. */ @@ -1544,11 +1548,7 @@ munmap_back: vma->vm_pgoff = pgoff; INIT_LIST_HEAD(&vma->anon_vma_chain); - error = -EINVAL; /* when rejecting VM_GROWSDOWN|VM_GROWSUP */ - if (file) { - if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) - goto free_vma; if (vm_flags & VM_DENYWRITE) { error = deny_write_access(file); if (error) @@ -1573,8 +1573,6 @@ munmap_back: pgoff = vma->vm_pgoff; vm_flags = vma->vm_flags; } else if (vm_flags & VM_SHARED) { - if (unlikely(vm_flags & (VM_GROWSDOWN|VM_GROWSUP))) - goto free_vma; error = shmem_zero_setup(vma); if (error) goto free_vma; From 077bf22b5cf233863826afbfa4af9b18650a832d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:20:19 -0700 Subject: [PATCH 038/303] mm: do_mmap_pgoff: cleanup the usage of file_inode() Simple cleanup. Move "struct inode *inode" variable into "if (file)" block to simplify the code and avoid the unnecessary check. 
Signed-off-by: Oleg Nesterov Cc: Hugh Dickins Cc: Al Viro Cc: Colin Cross Cc: David Rientjes Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 6cff7ba24a34..1e7a3ea23f1a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1202,7 +1202,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long *populate) { struct mm_struct * mm = current->mm; - struct inode *inode; vm_flags_t vm_flags; *populate = 0; @@ -1265,9 +1264,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, return -EAGAIN; } - inode = file ? file_inode(file) : NULL; - if (file) { + struct inode *inode = file_inode(file); + switch (flags & MAP_TYPE) { case MAP_SHARED: if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE)) From e86867720e617774b560dfbc169b7f3d0d490950 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:20:20 -0700 Subject: [PATCH 039/303] mm: mmap_region: kill correct_wcount/inode, use allow_write_access() correct_wcount and inode in mmap_region() just complicate the code. This boolean was needed previously, when deny_write_access() was called before vma_merge(); now we can simply check VM_DENYWRITE and call allow_write_access() if it is set. allow_write_access() checks file != NULL, so this is safe even if it was possible to use VM_DENYWRITE && !file. We just need to ensure we use the same file that was deny_write_access()'ed, so the patch also moves "file = vma->vm_file" down after allow_write_access(). Signed-off-by: Oleg Nesterov Cc: Hugh Dickins Cc: Al Viro Cc: Colin Cross Cc: David Rientjes Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 1e7a3ea23f1a..13926a5a6901 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1479,11 +1479,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr, { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; - int correct_wcount = 0; int error; struct rb_node **rb_link, *rb_parent; unsigned long charged = 0; - struct inode *inode = file ? file_inode(file) : NULL; /* Check against address space limit. */ if (!may_expand_vm(mm, len >> PAGE_SHIFT)) { @@ -1552,7 +1550,6 @@ munmap_back: error = deny_write_access(file); if (error) goto free_vma; - correct_wcount = 1; } vma->vm_file = get_file(file); error = file->f_op->mmap(file, vma); @@ -1593,11 +1590,10 @@ munmap_back: } vma_link(mm, vma, prev, rb_link, rb_parent); - file = vma->vm_file; - /* Once vma denies write, undo our temporary denial count */ - if (correct_wcount) - atomic_inc(&inode->i_writecount); + if (vm_flags & VM_DENYWRITE) + allow_write_access(file); + file = vma->vm_file; out: perf_event_mmap(vma); @@ -1616,8 +1612,8 @@ out: return addr; unmap_and_free_vma: - if (correct_wcount) - atomic_inc(&inode->i_writecount); + if (vm_flags & VM_DENYWRITE) + allow_write_access(file); vma->vm_file = NULL; fput(file); From 822518dc56810a0de44cff0f85a227268818749c Mon Sep 17 00:00:00 2001 From: Sunghan Suh Date: Wed, 11 Sep 2013 14:20:22 -0700 Subject: [PATCH 040/303] mm/zswap.c: get swapper address_space by using macro There is a proper macro to get the corresponding swapper address space from a swap entry. Instead of directly accessing the "swapper_spaces" array, use the "swap_address_space" macro.
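A toy sketch of that accessor pattern; the entry layout and shift below are invented for illustration, and only the idea — a single macro owning the swapper_spaces[] subscript — mirrors the kernel:

#include <stdio.h>

#define MAX_SWAPFILES 32

struct address_space { int id; };
static struct address_space swapper_spaces[MAX_SWAPFILES];

typedef struct { unsigned long val; } swp_entry_t;

/* Illustrative only: pretend the swap type lives in bits 5 and up. */
#define swp_type(e)           ((e).val >> 5)
#define swap_address_space(e) (&swapper_spaces[swp_type(e)])

int main(void)
{
	swp_entry_t entry = { .val = 3UL << 5 };

	printf("entry maps to swapper space %ld\n",
	       (long)(swap_address_space(entry) - swapper_spaces));
	return 0;
}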
Signed-off-by: Sunghan Suh Reviewed-by: Bob Liu Reviewed-by: Wanpeng Li Acked-by: Seth Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zswap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/zswap.c b/mm/zswap.c index deda2b671e12..efed4c8b7f5b 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -409,7 +409,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry, struct page **retpage) { struct page *found_page, *new_page = NULL; - struct address_space *swapper_space = &swapper_spaces[swp_type(entry)]; + struct address_space *swapper_space = swap_address_space(entry); int err; *retpage = NULL; From 9824cf9753ecbe8f5b47aa9b2f218207defea211 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 11 Sep 2013 14:20:23 -0700 Subject: [PATCH 041/303] mm: vmstats: tlb flush counters I was investigating some TLB flush scaling issues and realized that we do not have any good methods for figuring out how many TLB flushes we are doing. It would be nice to be able to do these in generic code, but the arch-independent calls don't explicitly specify whether we actually need to do remote flushes or not. In the end, we really need to know if we actually _did_ global vs. local invalidations, so that leaves us with few options other than to muck with the counters from arch-specific code. Signed-off-by: Dave Hansen Cc: Peter Zijlstra Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/tlb.c | 18 ++++++++++++++---- include/linux/vm_event_item.h | 5 +++++ mm/vmstat.c | 5 +++++ 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 282375f13c7e..f030cbe669a5 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -103,6 +103,7 @@ static void flush_tlb_func(void *info) if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) return; + count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED); if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { if (f->flush_end == TLB_FLUSH_ALL) local_flush_tlb(); @@ -130,6 +131,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask, info.flush_start = start; info.flush_end = end; + count_vm_event(NR_TLB_REMOTE_FLUSH); if (is_uv_system()) { unsigned int cpu; @@ -149,6 +151,7 @@ void flush_tlb_current_task(void) preempt_disable(); + count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); @@ -211,16 +214,19 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, act_entries = mm->total_vm > tlb_entries ? 
tlb_entries : mm->total_vm; /* tlb_flushall_shift is on balance point, details in commit log */ - if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) + if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) { + count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); - else { + } else { if (has_large_page(mm, start, end)) { local_flush_tlb(); goto flush_all; } /* flush range by one by one 'invlpg' */ - for (addr = start; addr < end; addr += PAGE_SIZE) + for (addr = start; addr < end; addr += PAGE_SIZE) { + count_vm_event(NR_TLB_LOCAL_FLUSH_ONE); __flush_tlb_single(addr); + } if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) @@ -256,6 +262,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) static void do_flush_tlb_all(void *info) { + count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED); __flush_tlb_all(); if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) leave_mm(smp_processor_id()); @@ -263,6 +270,7 @@ static void do_flush_tlb_all(void *info) void flush_tlb_all(void) { + count_vm_event(NR_TLB_REMOTE_FLUSH); on_each_cpu(do_flush_tlb_all, NULL, 1); } @@ -272,8 +280,10 @@ static void do_kernel_range_flush(void *info) unsigned long addr; /* flush range by one by one 'invlpg' */ - for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE) + for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE) { + count_vm_event(NR_TLB_LOCAL_FLUSH_ONE_KERNEL); __flush_tlb_single(addr); + } } void flush_tlb_kernel_range(unsigned long start, unsigned long end) diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index bd6cf61142be..dc2cdf07ac14 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -70,6 +70,11 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_ZERO_PAGE_ALLOC, THP_ZERO_PAGE_ALLOC_FAILED, #endif + NR_TLB_REMOTE_FLUSH, /* cpu tried to flush others' tlbs */ + NR_TLB_REMOTE_FLUSH_RECEIVED,/* cpu received ipi for flush */ + NR_TLB_LOCAL_FLUSH_ALL, + NR_TLB_LOCAL_FLUSH_ONE, + NR_TLB_LOCAL_FLUSH_ONE_KERNEL, NR_VM_EVENT_ITEMS }; diff --git a/mm/vmstat.c b/mm/vmstat.c index 20c2ef4458fa..00382c53f582 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -817,6 +817,11 @@ const char * const vmstat_text[] = { "thp_zero_page_alloc", "thp_zero_page_alloc_failed", #endif + "nr_tlb_remote_flush", + "nr_tlb_remote_flush_received", + "nr_tlb_local_flush_all", + "nr_tlb_local_flush_one", + "nr_tlb_local_flush_one_kernel", #endif /* CONFIG_VM_EVENTS_COUNTERS */ }; From 6df46865ff8715932e7d42e52cac17e8461758cb Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 11 Sep 2013 14:20:24 -0700 Subject: [PATCH 042/303] mm: vmstats: track TLB flush stats on UP too The previous patch doing vmstats for TLB flushes ("mm: vmstats: tlb flush counters") effectively missed UP since arch/x86/mm/tlb.c is only compiled for SMP. UP systems do not do remote TLB flushes, so compile those counters out on UP. arch/x86/kernel/cpu/mtrr/generic.c calls __flush_tlb() directly. This is probably an optimization since both the mtrr code and __flush_tlb() write cr4. It would probably be safe to make that a flush_tlb_all() (and then get these statistics), but the mtrr code is ancient and I'm hesitant to touch it other than to just stick in the counters. [akpm@linux-foundation.org: tweak comments] Signed-off-by: Dave Hansen Cc: Peter Zijlstra Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/tlbflush.h | 37 +++++++++++++++++++++++++----- arch/x86/kernel/cpu/mtrr/generic.c | 2 ++ arch/x86/mm/tlb.c | 4 +--- include/linux/vm_event_item.h | 3 ++- mm/vmstat.c | 3 ++- 5 files changed, 38 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index cf512003e663..e6d90babc245 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -62,6 +62,7 @@ static inline void __flush_tlb_all(void) static inline void __flush_tlb_one(unsigned long addr) { + count_vm_event(NR_TLB_LOCAL_FLUSH_ONE); __flush_tlb_single(addr); } @@ -84,14 +85,38 @@ static inline void __flush_tlb_one(unsigned long addr) #ifndef CONFIG_SMP -#define flush_tlb() __flush_tlb() -#define flush_tlb_all() __flush_tlb_all() -#define local_flush_tlb() __flush_tlb() +/* "_up" is for UniProcessor. + * + * This is a helper for other header functions. *Not* intended to be called + * directly. All global TLB flushes need to either call this, or to bump the + * vm statistics themselves. + */ +static inline void __flush_tlb_up(void) +{ + count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + __flush_tlb(); +} + +static inline void flush_tlb_all(void) +{ + count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + __flush_tlb_all(); +} + +static inline void flush_tlb(void) +{ + __flush_tlb_up(); +} + +static inline void local_flush_tlb(void) +{ + __flush_tlb_up(); +} static inline void flush_tlb_mm(struct mm_struct *mm) { if (mm == current->active_mm) - __flush_tlb(); + __flush_tlb_up(); } static inline void flush_tlb_page(struct vm_area_struct *vma, @@ -105,14 +130,14 @@ static inline void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { if (vma->vm_mm == current->active_mm) - __flush_tlb(); + __flush_tlb_up(); } static inline void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long vmflag) { if (mm == current->active_mm) - __flush_tlb(); + __flush_tlb_up(); } static inline void native_flush_tlb_others(const struct cpumask *cpumask, diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index d4cdfa67509e..ce2d0a2c3e4f 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -683,6 +683,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) } /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ + count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); __flush_tlb(); /* Save MTRR state */ @@ -696,6 +697,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) static void post_set(void) __releases(set_atomicity_lock) { /* Flush TLBs (no need to flush caches - they are disabled) */ + count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); __flush_tlb(); /* Intel (P6) standard MTRRs */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index f030cbe669a5..ae699b3bbac8 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -280,10 +280,8 @@ static void do_kernel_range_flush(void *info) unsigned long addr; /* flush range by one by one 'invlpg' */ - for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE) { - count_vm_event(NR_TLB_LOCAL_FLUSH_ONE_KERNEL); + for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE) __flush_tlb_single(addr); - } } void flush_tlb_kernel_range(unsigned long start, unsigned long end) diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 
dc2cdf07ac14..1855f0a22add 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -70,11 +70,12 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_ZERO_PAGE_ALLOC, THP_ZERO_PAGE_ALLOC_FAILED, #endif +#ifdef CONFIG_SMP NR_TLB_REMOTE_FLUSH, /* cpu tried to flush others' tlbs */ NR_TLB_REMOTE_FLUSH_RECEIVED,/* cpu received ipi for flush */ +#endif NR_TLB_LOCAL_FLUSH_ALL, NR_TLB_LOCAL_FLUSH_ONE, - NR_TLB_LOCAL_FLUSH_ONE_KERNEL, NR_VM_EVENT_ITEMS }; diff --git a/mm/vmstat.c b/mm/vmstat.c index 00382c53f582..ca06e9653827 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -817,11 +817,12 @@ const char * const vmstat_text[] = { "thp_zero_page_alloc", "thp_zero_page_alloc_failed", #endif +#ifdef CONFIG_SMP "nr_tlb_remote_flush", "nr_tlb_remote_flush_received", +#endif "nr_tlb_local_flush_all", "nr_tlb_local_flush_one", - "nr_tlb_local_flush_one_kernel", #endif /* CONFIG_VM_EVENTS_COUNTERS */ }; From 3dbb95f7895e378514ffefa93cc887fb1bc9df94 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Wed, 11 Sep 2013 14:20:25 -0700 Subject: [PATCH 043/303] mm: replace strict_strtoul() with kstrtoul() The use of strict_strtoul() is not preferred, because strict_strtoul() is obsolete. Thus, kstrtoul() should be used. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 8 ++++---- mm/hugetlb.c | 4 ++-- mm/kmemleak.c | 2 +- mm/ksm.c | 6 +++--- mm/slub.c | 8 ++++---- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a92012a71702..8b7fc2025e04 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -417,7 +417,7 @@ static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, unsigned long msecs; int err; - err = strict_strtoul(buf, 10, &msecs); + err = kstrtoul(buf, 10, &msecs); if (err || msecs > UINT_MAX) return -EINVAL; @@ -444,7 +444,7 @@ static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, unsigned long msecs; int err; - err = strict_strtoul(buf, 10, &msecs); + err = kstrtoul(buf, 10, &msecs); if (err || msecs > UINT_MAX) return -EINVAL; @@ -470,7 +470,7 @@ static ssize_t pages_to_scan_store(struct kobject *kobj, int err; unsigned long pages; - err = strict_strtoul(buf, 10, &pages); + err = kstrtoul(buf, 10, &pages); if (err || !pages || pages > UINT_MAX) return -EINVAL; @@ -538,7 +538,7 @@ static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, int err; unsigned long max_ptes_none; - err = strict_strtoul(buf, 10, &max_ptes_none); + err = kstrtoul(buf, 10, &max_ptes_none); if (err || max_ptes_none > HPAGE_PMD_NR-1) return -EINVAL; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b60f33080a28..6e514831bda5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1526,7 +1526,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, struct hstate *h; NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); - err = strict_strtoul(buf, 10, &count); + err = kstrtoul(buf, 10, &count); if (err) goto out; @@ -1617,7 +1617,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, if (h->order >= MAX_ORDER) return -EINVAL; - err = strict_strtoul(buf, 10, &input); + err = kstrtoul(buf, 10, &input); if (err) return err; diff --git a/mm/kmemleak.c b/mm/kmemleak.c index c8d7f3110fd0..e126b0ef9ad2 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1639,7 +1639,7 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, else if (strncmp(buf, "scan=", 5) == 0) { unsigned long secs; - ret = strict_strtoul(buf + 5, 
0, &secs); + ret = kstrtoul(buf + 5, 0, &secs); if (ret < 0) goto out; stop_scan_thread(); diff --git a/mm/ksm.c b/mm/ksm.c index b6afe0c440d8..0bea2b262a47 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2194,7 +2194,7 @@ static ssize_t sleep_millisecs_store(struct kobject *kobj, unsigned long msecs; int err; - err = strict_strtoul(buf, 10, &msecs); + err = kstrtoul(buf, 10, &msecs); if (err || msecs > UINT_MAX) return -EINVAL; @@ -2217,7 +2217,7 @@ static ssize_t pages_to_scan_store(struct kobject *kobj, int err; unsigned long nr_pages; - err = strict_strtoul(buf, 10, &nr_pages); + err = kstrtoul(buf, 10, &nr_pages); if (err || nr_pages > UINT_MAX) return -EINVAL; @@ -2239,7 +2239,7 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, int err; unsigned long flags; - err = strict_strtoul(buf, 10, &flags); + err = kstrtoul(buf, 10, &flags); if (err || flags > UINT_MAX) return -EINVAL; if (flags > KSM_RUN_UNMERGE) diff --git a/mm/slub.c b/mm/slub.c index e3ba1f2cf60c..51df8272cfaf 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4420,7 +4420,7 @@ static ssize_t order_store(struct kmem_cache *s, unsigned long order; int err; - err = strict_strtoul(buf, 10, &order); + err = kstrtoul(buf, 10, &order); if (err) return err; @@ -4448,7 +4448,7 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, unsigned long min; int err; - err = strict_strtoul(buf, 10, &min); + err = kstrtoul(buf, 10, &min); if (err) return err; @@ -4468,7 +4468,7 @@ static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, unsigned long objects; int err; - err = strict_strtoul(buf, 10, &objects); + err = kstrtoul(buf, 10, &objects); if (err) return err; if (objects && !kmem_cache_has_cpu_partial(s)) @@ -4784,7 +4784,7 @@ static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s, unsigned long ratio; int err; - err = strict_strtoul(buf, 10, &ratio); + err = kstrtoul(buf, 10, &ratio); if (err) return err; From a7e833182a926ae5bc03204dbd00b0bb5539088b Mon Sep 17 00:00:00 2001 From: Jerry Zhou Date: Wed, 11 Sep 2013 14:20:26 -0700 Subject: [PATCH 044/303] mm: fix negative left shift count when PAGE_SHIFT > 20 When PAGE_SHIFT > 20, the result of "20 - PAGE_SHIFT" is negative. The previous calculating here will generate an unexpected result. In addition, if PAGE_SIZE >= 1MB, The memory size of "numentries" was already integral multiple of 1MB. Signed-off-by: Jerry Zhou Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c2b59dbda196..116bab1c2cf5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5745,9 +5745,10 @@ void *__init alloc_large_system_hash(const char *tablename, if (!numentries) { /* round applicable memory size up to nearest megabyte */ numentries = nr_kernel_pages; - numentries += (1UL << (20 - PAGE_SHIFT)) - 1; - numentries >>= 20 - PAGE_SHIFT; - numentries <<= 20 - PAGE_SHIFT; + + /* It isn't necessary when PAGE_SIZE >= 1MB */ + if (PAGE_SHIFT < 20) + numentries = round_up(numentries, (1<<20)/PAGE_SIZE); /* limit to 1 bucket per 2^scale bytes of low memory */ if (scale > PAGE_SHIFT) From 15ca220e1a63af06e000691e4ae1beaba5430c32 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Wed, 11 Sep 2013 14:20:27 -0700 Subject: [PATCH 045/303] mm/page_alloc.c: use '__paginginit' instead of '__init' set_pageblock_order() may be called when memory hotplug, so need use '__paginginit' instead of '__init'. 
The related warning: The function __meminit .free_area_init_node() references a function __init .set_pageblock_order(). If .set_pageblock_order is only used by .free_area_init_node then annotate .set_pageblock_order with a matching annotation. Signed-off-by: Chen Gang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 116bab1c2cf5..6cf157637df3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4586,7 +4586,7 @@ static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ -void __init set_pageblock_order(void) +void __paginginit set_pageblock_order(void) { unsigned int order; @@ -4614,7 +4614,7 @@ void __init set_pageblock_order(void) * include/linux/pageblock-flags.h for the values of pageblock_order based on * the kernel config */ -void __init set_pageblock_order(void) +void __paginginit set_pageblock_order(void) { } From 2a8f9449343260373398d59228a62a4332ea513a Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 11 Sep 2013 14:20:28 -0700 Subject: [PATCH 046/303] swap: change block allocation algorithm for SSD I'm using a fast SSD to do swap. scan_swap_map() sometimes uses up to 20~30% CPU time (when a cluster is hard to find, the CPU time can be up to 80%), which becomes a bottleneck. scan_swap_map() scans a byte array to search for a 256-page cluster, which is very slow. Here I introduce a simple algorithm to search for a cluster. Since we only care about 256-page clusters, we can just use a counter to track whether a cluster is free. Every 256 pages use one int to store the counter. If the counter of a cluster is 0, the cluster is free. All free clusters are added to a list, so searching for a cluster is very efficient. With this, the scan_swap_map() overhead disappears. This might help low-end SD card swap too, because if the cluster is aligned, SD firmware can do flash erase more efficiently. We only enable the algorithm for SSD. Hard disk swap isn't fast enough, and the algorithm has downsides that might introduce a regression there (see below). The patch slightly changes which cluster is chosen. It always adds a free cluster to the list tail, which can help wear leveling for low-end SSD too. And if no free cluster is found, scan_swap_map() will search from the end of the last free cluster, which is effectively random. For SSD, this isn't a problem at all. Another downside is that the cluster must be aligned to 256 pages, which reduces the chance of finding a cluster. I would expect this isn't a big problem for SSD because there is no seek penalty. (And this is the reason I only enable the algorithm for SSD.)
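To make the counter-plus-free-list idea concrete, here is a minimal userspace model. It is only a sketch of the bookkeeping (the kernel packs the count and the list link into a single swap_cluster_info word, and NR_CLUSTERS and the helper names below are invented for the example); cluster indices stand in for 256-page ranges.

#include <assert.h>
#include <stdio.h>

#define NR_CLUSTERS 8
#define CLUSTER_NULL (-1)

static int usage[NR_CLUSTERS];      /* page-usage counter per cluster */
static int next_free[NR_CLUSTERS];  /* singly linked free-cluster list */
static int free_head = CLUSTER_NULL, free_tail = CLUSTER_NULL;

/* A cluster whose counter reached zero goes to the list tail; always
 * appending at the tail spreads allocations for wear leveling. */
static void cluster_free(int idx)
{
	next_free[idx] = CLUSTER_NULL;
	if (free_head == CLUSTER_NULL)
		free_head = idx;
	else
		next_free[free_tail] = idx;
	free_tail = idx;
}

/* O(1): pop the list head instead of scanning a byte map for a run of
 * 256 free slots. */
static int cluster_alloc(void)
{
	int idx = free_head;

	if (idx == CLUSTER_NULL)
		return CLUSTER_NULL;
	free_head = next_free[idx];
	if (free_head == CLUSTER_NULL)
		free_tail = CLUSTER_NULL;
	return idx;
}

int main(void)
{
	int idx;

	for (idx = 0; idx < NR_CLUSTERS; idx++)
		cluster_free(idx);

	idx = cluster_alloc();
	assert(idx == 0);
	usage[idx]++;			/* one page allocated in cluster 0 */
	if (--usage[idx] == 0)		/* last page freed: recycle cluster */
		cluster_free(idx);
	assert(free_tail == 0);		/* recycled to the tail, not the head */
	printf("ok\n");
	return 0;
}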
Signed-off-by: Shaohua Li Cc: Rik van Riel Cc: Minchan Kim Cc: Kyungmin Park Cc: Hugh Dickins Cc: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 20 +++ mm/swapfile.c | 286 +++++++++++++++++++++++++++++++++++++------ 2 files changed, 269 insertions(+), 37 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index d95cde5e257d..cb5baebf31d6 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -181,6 +181,23 @@ enum { #define COUNT_CONTINUED 0x80 /* See swap_map continuation for full count */ #define SWAP_MAP_SHMEM 0xbf /* Owned by shmem/tmpfs, in first swap_map */ +/* + * We use this to track usage of a cluster. A cluster is a block of swap disk + * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All + * free clusters are organized into a list. We fetch an entry from the list to + * get a free cluster. + * + * The data field stores next cluster if the cluster is free or cluster usage + * counter otherwise. The flags field determines if a cluster is free. This is + * protected by swap_info_struct.lock. + */ +struct swap_cluster_info { + unsigned int data:24; + unsigned int flags:8; +}; +#define CLUSTER_FLAG_FREE 1 /* This cluster is free */ +#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ + /* * The in-memory structure used to track swap areas. */ @@ -191,6 +208,9 @@ struct swap_info_struct { signed char next; /* next type on the swap list */ unsigned int max; /* extent of the swap_map */ unsigned char *swap_map; /* vmalloc'ed array of usage counts */ + struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */ + struct swap_cluster_info free_cluster_head; /* free cluster list head */ + struct swap_cluster_info free_cluster_tail; /* free cluster list tail */ unsigned int lowest_bit; /* index of first free in swap_map */ unsigned int highest_bit; /* index of last free in swap_map */ unsigned int pages; /* total of usable pages of swap */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 6ef2d15c5fe3..d1fbeb486de5 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -184,6 +184,134 @@ static int wait_for_discard(void *word) #define SWAPFILE_CLUSTER 256 #define LATENCY_LIMIT 256 +static inline void cluster_set_flag(struct swap_cluster_info *info, + unsigned int flag) +{ + info->flags = flag; +} + +static inline unsigned int cluster_count(struct swap_cluster_info *info) +{ + return info->data; +} + +static inline void cluster_set_count(struct swap_cluster_info *info, + unsigned int c) +{ + info->data = c; +} + +static inline void cluster_set_count_flag(struct swap_cluster_info *info, + unsigned int c, unsigned int f) +{ + info->flags = f; + info->data = c; +} + +static inline unsigned int cluster_next(struct swap_cluster_info *info) +{ + return info->data; +} + +static inline void cluster_set_next(struct swap_cluster_info *info, + unsigned int n) +{ + info->data = n; +} + +static inline void cluster_set_next_flag(struct swap_cluster_info *info, + unsigned int n, unsigned int f) +{ + info->flags = f; + info->data = n; +} + +static inline bool cluster_is_free(struct swap_cluster_info *info) +{ + return info->flags & CLUSTER_FLAG_FREE; +} + +static inline bool cluster_is_null(struct swap_cluster_info *info) +{ + return info->flags & CLUSTER_FLAG_NEXT_NULL; +} + +static inline void cluster_set_null(struct swap_cluster_info *info) +{ + info->flags = CLUSTER_FLAG_NEXT_NULL; + info->data = 0; +} + +/* + * The cluster corresponding to page_nr will be used. 
The cluster will be + * removed from free cluster list and its usage counter will be increased. + */ +static void inc_cluster_info_page(struct swap_info_struct *p, + struct swap_cluster_info *cluster_info, unsigned long page_nr) +{ + unsigned long idx = page_nr / SWAPFILE_CLUSTER; + + if (!cluster_info) + return; + if (cluster_is_free(&cluster_info[idx])) { + VM_BUG_ON(cluster_next(&p->free_cluster_head) != idx); + cluster_set_next_flag(&p->free_cluster_head, + cluster_next(&cluster_info[idx]), 0); + if (cluster_next(&p->free_cluster_tail) == idx) { + cluster_set_null(&p->free_cluster_tail); + cluster_set_null(&p->free_cluster_head); + } + cluster_set_count_flag(&cluster_info[idx], 0, 0); + } + + VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER); + cluster_set_count(&cluster_info[idx], + cluster_count(&cluster_info[idx]) + 1); +} + +/* + * The cluster corresponding to page_nr decreases one usage. If the usage + * counter becomes 0, which means no page in the cluster is in using, we can + * optionally discard the cluster and add it to free cluster list. + */ +static void dec_cluster_info_page(struct swap_info_struct *p, + struct swap_cluster_info *cluster_info, unsigned long page_nr) +{ + unsigned long idx = page_nr / SWAPFILE_CLUSTER; + + if (!cluster_info) + return; + + VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0); + cluster_set_count(&cluster_info[idx], + cluster_count(&cluster_info[idx]) - 1); + + if (cluster_count(&cluster_info[idx]) == 0) { + cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); + if (cluster_is_null(&p->free_cluster_head)) { + cluster_set_next_flag(&p->free_cluster_head, idx, 0); + cluster_set_next_flag(&p->free_cluster_tail, idx, 0); + } else { + unsigned int tail = cluster_next(&p->free_cluster_tail); + cluster_set_next(&cluster_info[tail], idx); + cluster_set_next_flag(&p->free_cluster_tail, idx, 0); + } + } +} + +/* + * It's possible scan_swap_map() uses a free cluster in the middle of free + * cluster list. Avoiding such abuse to avoid list corruption. 
+ */ +static inline bool scan_swap_map_recheck_cluster(struct swap_info_struct *si, + unsigned long offset) +{ + offset /= SWAPFILE_CLUSTER; + return !cluster_is_null(&si->free_cluster_head) && + offset != cluster_next(&si->free_cluster_head) && + cluster_is_free(&si->cluster_info[offset]); +} + static unsigned long scan_swap_map(struct swap_info_struct *si, unsigned char usage) { @@ -225,6 +353,25 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, si->lowest_alloc = si->max; si->highest_alloc = 0; } +check_cluster: + if (!cluster_is_null(&si->free_cluster_head)) { + offset = cluster_next(&si->free_cluster_head) * + SWAPFILE_CLUSTER; + last_in_cluster = offset + SWAPFILE_CLUSTER - 1; + si->cluster_next = offset; + si->cluster_nr = SWAPFILE_CLUSTER - 1; + found_free_cluster = 1; + goto checks; + } else if (si->cluster_info) { + /* + * Checking free cluster is fast enough, we can do the + * check every time + */ + si->cluster_nr = 0; + si->lowest_alloc = 0; + goto checks; + } + spin_unlock(&si->lock); /* @@ -285,6 +432,8 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, } checks: + if (scan_swap_map_recheck_cluster(si, offset)) + goto check_cluster; if (!(si->flags & SWP_WRITEOK)) goto no_page; if (!si->highest_bit) @@ -317,6 +466,7 @@ checks: si->highest_bit = 0; } si->swap_map[offset] = usage; + inc_cluster_info_page(si, si->cluster_info, offset); si->cluster_next = offset + 1; si->flags -= SWP_SCANNING; @@ -600,6 +750,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, /* free if no reference */ if (!usage) { + dec_cluster_info_page(p, p->cluster_info, offset); if (offset < p->lowest_bit) p->lowest_bit = offset; if (offset > p->highest_bit) @@ -1524,7 +1675,8 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) } static void _enable_swap_info(struct swap_info_struct *p, int prio, - unsigned char *swap_map) + unsigned char *swap_map, + struct swap_cluster_info *cluster_info) { int i, prev; @@ -1533,6 +1685,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, else p->prio = --least_priority; p->swap_map = swap_map; + p->cluster_info = cluster_info; p->flags |= SWP_WRITEOK; atomic_long_add(p->pages, &nr_swap_pages); total_swap_pages += p->pages; @@ -1553,12 +1706,13 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, static void enable_swap_info(struct swap_info_struct *p, int prio, unsigned char *swap_map, + struct swap_cluster_info *cluster_info, unsigned long *frontswap_map) { frontswap_init(p->type, frontswap_map); spin_lock(&swap_lock); spin_lock(&p->lock); - _enable_swap_info(p, prio, swap_map); + _enable_swap_info(p, prio, swap_map, cluster_info); spin_unlock(&p->lock); spin_unlock(&swap_lock); } @@ -1567,7 +1721,7 @@ static void reinsert_swap_info(struct swap_info_struct *p) { spin_lock(&swap_lock); spin_lock(&p->lock); - _enable_swap_info(p, p->prio, p->swap_map); + _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info); spin_unlock(&p->lock); spin_unlock(&swap_lock); } @@ -1576,6 +1730,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct *p = NULL; unsigned char *swap_map; + struct swap_cluster_info *cluster_info; unsigned long *frontswap_map; struct file *swap_file, *victim; struct address_space *mapping; @@ -1675,6 +1830,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->max = 0; swap_map = p->swap_map; p->swap_map = NULL; + cluster_info = p->cluster_info; + p->cluster_info = NULL; p->flags = 0; frontswap_map 
= frontswap_map_get(p); frontswap_map_set(p, NULL); @@ -1683,6 +1840,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) frontswap_invalidate_area(type); mutex_unlock(&swapon_mutex); vfree(swap_map); + vfree(cluster_info); vfree(frontswap_map); /* Destroy swap account informatin */ swap_cgroup_swapoff(type); @@ -2000,15 +2158,21 @@ static unsigned long read_swap_header(struct swap_info_struct *p, static int setup_swap_map_and_extents(struct swap_info_struct *p, union swap_header *swap_header, unsigned char *swap_map, + struct swap_cluster_info *cluster_info, unsigned long maxpages, sector_t *span) { int i; unsigned int nr_good_pages; int nr_extents; + unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); + unsigned long idx = p->cluster_next / SWAPFILE_CLUSTER; nr_good_pages = maxpages - 1; /* omit header page */ + cluster_set_null(&p->free_cluster_head); + cluster_set_null(&p->free_cluster_tail); + for (i = 0; i < swap_header->info.nr_badpages; i++) { unsigned int page_nr = swap_header->info.badpages[i]; if (page_nr == 0 || page_nr > swap_header->info.last_page) @@ -2016,11 +2180,25 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, if (page_nr < maxpages) { swap_map[page_nr] = SWAP_MAP_BAD; nr_good_pages--; + /* + * Haven't marked the cluster free yet, no list + * operation involved + */ + inc_cluster_info_page(p, cluster_info, page_nr); } } + /* Haven't marked the cluster free yet, no list operation involved */ + for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) + inc_cluster_info_page(p, cluster_info, i); + if (nr_good_pages) { swap_map[0] = SWAP_MAP_BAD; + /* + * Not mark the cluster free yet, no list + * operation involved + */ + inc_cluster_info_page(p, cluster_info, 0); p->max = maxpages; p->pages = nr_good_pages; nr_extents = setup_swap_extents(p, span); @@ -2033,6 +2211,30 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, return -EINVAL; } + if (!cluster_info) + return nr_extents; + + for (i = 0; i < nr_clusters; i++) { + if (!cluster_count(&cluster_info[idx])) { + cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); + if (cluster_is_null(&p->free_cluster_head)) { + cluster_set_next_flag(&p->free_cluster_head, + idx, 0); + cluster_set_next_flag(&p->free_cluster_tail, + idx, 0); + } else { + unsigned int tail; + + tail = cluster_next(&p->free_cluster_tail); + cluster_set_next(&cluster_info[tail], idx); + cluster_set_next_flag(&p->free_cluster_tail, + idx, 0); + } + } + idx++; + if (idx == nr_clusters) + idx = 0; + } return nr_extents; } @@ -2064,6 +2266,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) sector_t span; unsigned long maxpages; unsigned char *swap_map = NULL; + struct swap_cluster_info *cluster_info = NULL; unsigned long *frontswap_map = NULL; struct page *page = NULL; struct inode *inode = NULL; @@ -2137,13 +2340,28 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = -ENOMEM; goto bad_swap; } + if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { + p->flags |= SWP_SOLIDSTATE; + /* + * select a random position to start with to help wear leveling + * SSD + */ + p->cluster_next = 1 + (prandom_u32() % p->highest_bit); + + cluster_info = vzalloc(DIV_ROUND_UP(maxpages, + SWAPFILE_CLUSTER) * sizeof(*cluster_info)); + if (!cluster_info) { + error = -ENOMEM; + goto bad_swap; + } + } error = swap_cgroup_swapon(p->type, maxpages); if (error) goto bad_swap; nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map, - 
maxpages, &span); + cluster_info, maxpages, &span); if (unlikely(nr_extents < 0)) { error = nr_extents; goto bad_swap; @@ -2152,40 +2370,33 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (frontswap_enabled) frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long)); - if (p->bdev) { - if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { - p->flags |= SWP_SOLIDSTATE; - p->cluster_next = 1 + (prandom_u32() % p->highest_bit); - } + if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { + /* + * When discard is enabled for swap with no particular + * policy flagged, we set all swap discard flags here in + * order to sustain backward compatibility with older + * swapon(8) releases. + */ + p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | + SWP_PAGE_DISCARD); - if ((swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { - /* - * When discard is enabled for swap with no particular - * policy flagged, we set all swap discard flags here in - * order to sustain backward compatibility with older - * swapon(8) releases. - */ - p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | - SWP_PAGE_DISCARD); + /* + * By flagging sys_swapon, a sysadmin can tell us to + * either do single-time area discards only, or to just + * perform discards for released swap page-clusters. + * Now it's time to adjust the p->flags accordingly. + */ + if (swap_flags & SWAP_FLAG_DISCARD_ONCE) + p->flags &= ~SWP_PAGE_DISCARD; + else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) + p->flags &= ~SWP_AREA_DISCARD; - /* - * By flagging sys_swapon, a sysadmin can tell us to - * either do single-time area discards only, or to just - * perform discards for released swap page-clusters. - * Now it's time to adjust the p->flags accordingly. - */ - if (swap_flags & SWAP_FLAG_DISCARD_ONCE) - p->flags &= ~SWP_PAGE_DISCARD; - else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) - p->flags &= ~SWP_AREA_DISCARD; - - /* issue a swapon-time discard if it's still required */ - if (p->flags & SWP_AREA_DISCARD) { - int err = discard_swap(p); - if (unlikely(err)) - pr_err("swapon: discard_swap(%p): %d\n", - p, err); - } + /* issue a swapon-time discard if it's still required */ + if (p->flags & SWP_AREA_DISCARD) { + int err = discard_swap(p); + if (unlikely(err)) + pr_err("swapon: discard_swap(%p): %d\n", + p, err); } } @@ -2194,7 +2405,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (swap_flags & SWAP_FLAG_PREFER) prio = (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; - enable_swap_info(p, prio, swap_map, frontswap_map); + enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map); pr_info("Adding %uk swap on %s. " "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", @@ -2226,6 +2437,7 @@ bad_swap: p->flags = 0; spin_unlock(&swap_lock); vfree(swap_map); + vfree(cluster_info); if (swap_file) { if (inode && S_ISREG(inode->i_mode)) { mutex_unlock(&inode->i_mutex); From 815c2c543d3aeb914a361f981440ece552778724 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 11 Sep 2013 14:20:30 -0700 Subject: [PATCH 047/303] swap: make swap discard async swap can do cluster discard for SSD, which is good, but there are some problems here: 1. swap do the discard just before page reclaim gets a swap entry and writes the disk sectors. This is useless for high end SSD, because an overwrite to a sector implies a discard to original sector too. A discard + overwrite == overwrite. 2. the purpose of doing discard is to improve SSD firmware garbage collection. 
Ideally we should send discard as early as possible, so firmware can do something smart. Sending discard just after a swap entry is freed is considered early compared to sending discard before write. Of course, if the workload is already bound to gc speed, sending discard earlier or later doesn't make much difference. 3. block discard is a sync API, which will delay scan_swap_map() significantly. 4. Write and discard commands can be executed in parallel on PCIe SSD. Making swap discard async makes execution more efficient. This patch makes swap discard async and moves discard to where the swap entry is freed. Discard and write have no dependency now, so the above issues can be avoided. Ideally we should do discard for any freed sectors, but some SSD discard is very slow. This patch still does discard for a whole cluster. My test does several rounds of 'mmap, write, unmap', which triggers a lot of swap discard. In a fusionio card, with this patch, the test runtime is reduced to 18% of the time without it, so around 5.5x faster. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Shaohua Li Cc: Rik van Riel Cc: Minchan Kim Cc: Kyungmin Park Cc: Hugh Dickins Cc: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 20 +++-- mm/swapfile.c | 192 +++++++++++++++++++++++++------------ 2 files changed, 125 insertions(+), 87 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index cb5baebf31d6..8a3c4a1caa14 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -217,8 +217,6 @@ struct swap_info_struct { unsigned int inuse_pages; /* number of those currently in use */ unsigned int cluster_next; /* likely index for next allocation */ unsigned int cluster_nr; /* countdown to next cluster search */ - unsigned int lowest_alloc; /* while preparing discard cluster */ - unsigned int highest_alloc; /* while preparing discard cluster */ struct swap_extent *curr_swap_extent; struct swap_extent first_swap_extent; struct block_device *bdev; /* swap device or bdev of swap file */ @@ -232,14 +230,18 @@ struct swap_info_struct { * protect map scan related fields like * swap_map, lowest_bit, highest_bit, * inuse_pages, cluster_next, - * cluster_nr, lowest_alloc and - * highest_alloc. other fields are only - * changed at swapon/swapoff, so are - * protected by swap_lock. changing - * flags need hold this lock and - * swap_lock. If both locks need hold, - * hold swap_lock first. + * cluster_nr, lowest_alloc, + * highest_alloc, free/discard cluster + * list. other fields are only changed + * at swapon/swapoff, so are protected + * by swap_lock. changing flags need + * hold this lock and swap_lock. If + * both locks need hold, hold swap_lock + * first.
*/ + struct work_struct discard_work; /* discard worker */ + struct swap_cluster_info discard_cluster_head; /* list head of discard clusters */ + struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */ }; struct swap_list_t { diff --git a/mm/swapfile.c b/mm/swapfile.c index d1fbeb486de5..dac47c66055c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -175,12 +175,6 @@ static void discard_swap_cluster(struct swap_info_struct *si, } } -static int wait_for_discard(void *word) -{ - schedule(); - return 0; -} - #define SWAPFILE_CLUSTER 256 #define LATENCY_LIMIT 256 @@ -242,6 +236,90 @@ static inline void cluster_set_null(struct swap_cluster_info *info) info->data = 0; } +/* Add a cluster to discard list and schedule it to do discard */ +static void swap_cluster_schedule_discard(struct swap_info_struct *si, + unsigned int idx) +{ + /* + * If scan_swap_map() can't find a free cluster, it will check + * si->swap_map directly. To make sure the discarding cluster isn't + * taken by scan_swap_map(), mark the swap entries bad (occupied). It + * will be cleared after discard + */ + memset(si->swap_map + idx * SWAPFILE_CLUSTER, + SWAP_MAP_BAD, SWAPFILE_CLUSTER); + + if (cluster_is_null(&si->discard_cluster_head)) { + cluster_set_next_flag(&si->discard_cluster_head, + idx, 0); + cluster_set_next_flag(&si->discard_cluster_tail, + idx, 0); + } else { + unsigned int tail = cluster_next(&si->discard_cluster_tail); + cluster_set_next(&si->cluster_info[tail], idx); + cluster_set_next_flag(&si->discard_cluster_tail, + idx, 0); + } + + schedule_work(&si->discard_work); +} + +/* + * Doing discard actually. After a cluster discard is finished, the cluster + * will be added to free cluster list. caller should hold si->lock. +*/ +static void swap_do_scheduled_discard(struct swap_info_struct *si) +{ + struct swap_cluster_info *info; + unsigned int idx; + + info = si->cluster_info; + + while (!cluster_is_null(&si->discard_cluster_head)) { + idx = cluster_next(&si->discard_cluster_head); + + cluster_set_next_flag(&si->discard_cluster_head, + cluster_next(&info[idx]), 0); + if (cluster_next(&si->discard_cluster_tail) == idx) { + cluster_set_null(&si->discard_cluster_head); + cluster_set_null(&si->discard_cluster_tail); + } + spin_unlock(&si->lock); + + discard_swap_cluster(si, idx * SWAPFILE_CLUSTER, + SWAPFILE_CLUSTER); + + spin_lock(&si->lock); + cluster_set_flag(&info[idx], CLUSTER_FLAG_FREE); + if (cluster_is_null(&si->free_cluster_head)) { + cluster_set_next_flag(&si->free_cluster_head, + idx, 0); + cluster_set_next_flag(&si->free_cluster_tail, + idx, 0); + } else { + unsigned int tail; + + tail = cluster_next(&si->free_cluster_tail); + cluster_set_next(&info[tail], idx); + cluster_set_next_flag(&si->free_cluster_tail, + idx, 0); + } + memset(si->swap_map + idx * SWAPFILE_CLUSTER, + 0, SWAPFILE_CLUSTER); + } +} + +static void swap_discard_work(struct work_struct *work) +{ + struct swap_info_struct *si; + + si = container_of(work, struct swap_info_struct, discard_work); + + spin_lock(&si->lock); + swap_do_scheduled_discard(si); + spin_unlock(&si->lock); +} + /* * The cluster corresponding to page_nr will be used. The cluster will be * removed from free cluster list and its usage counter will be increased. @@ -287,6 +365,16 @@ static void dec_cluster_info_page(struct swap_info_struct *p, cluster_count(&cluster_info[idx]) - 1); if (cluster_count(&cluster_info[idx]) == 0) { + /* + * If the swap is discardable, prepare discard the cluster + * instead of free it immediately. 
The cluster will be freed + * after discard. + */ + if (p->flags & SWP_PAGE_DISCARD) { + swap_cluster_schedule_discard(p, idx); + return; + } + cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE); if (cluster_is_null(&p->free_cluster_head)) { cluster_set_next_flag(&p->free_cluster_head, idx, 0); @@ -319,7 +407,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, unsigned long scan_base; unsigned long last_in_cluster = 0; int latency_ration = LATENCY_LIMIT; - int found_free_cluster = 0; /* * We try to cluster swap pages by allocating them sequentially @@ -340,19 +427,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, si->cluster_nr = SWAPFILE_CLUSTER - 1; goto checks; } - if (si->flags & SWP_PAGE_DISCARD) { - /* - * Start range check on racing allocations, in case - * they overlap the cluster we eventually decide on - * (we scan without swap_lock to allow preemption). - * It's hardly conceivable that cluster_nr could be - * wrapped during our scan, but don't depend on it. - */ - if (si->lowest_alloc) - goto checks; - si->lowest_alloc = si->max; - si->highest_alloc = 0; - } check_cluster: if (!cluster_is_null(&si->free_cluster_head)) { offset = cluster_next(&si->free_cluster_head) * @@ -360,15 +434,27 @@ check_cluster: last_in_cluster = offset + SWAPFILE_CLUSTER - 1; si->cluster_next = offset; si->cluster_nr = SWAPFILE_CLUSTER - 1; - found_free_cluster = 1; goto checks; } else if (si->cluster_info) { + /* + * we don't have free cluster but have some clusters in + * discarding, do discard now and reclaim them + */ + if (!cluster_is_null(&si->discard_cluster_head)) { + si->cluster_nr = 0; + swap_do_scheduled_discard(si); + scan_base = offset = si->cluster_next; + if (!si->cluster_nr) + goto check_cluster; + si->cluster_nr--; + goto checks; + } + /* * Checking free cluster is fast enough, we can do the * check every time */ si->cluster_nr = 0; - si->lowest_alloc = 0; goto checks; } @@ -395,7 +481,6 @@ check_cluster: offset -= SWAPFILE_CLUSTER - 1; si->cluster_next = offset; si->cluster_nr = SWAPFILE_CLUSTER - 1; - found_free_cluster = 1; goto checks; } if (unlikely(--latency_ration < 0)) { @@ -416,7 +501,6 @@ check_cluster: offset -= SWAPFILE_CLUSTER - 1; si->cluster_next = offset; si->cluster_nr = SWAPFILE_CLUSTER - 1; - found_free_cluster = 1; goto checks; } if (unlikely(--latency_ration < 0)) { @@ -428,7 +512,6 @@ check_cluster: offset = scan_base; spin_lock(&si->lock); si->cluster_nr = SWAPFILE_CLUSTER - 1; - si->lowest_alloc = 0; } checks: @@ -470,59 +553,6 @@ checks: si->cluster_next = offset + 1; si->flags -= SWP_SCANNING; - if (si->lowest_alloc) { - /* - * Only set when SWP_PAGE_DISCARD, and there's a scan - * for a free cluster in progress or just completed. - */ - if (found_free_cluster) { - /* - * To optimize wear-levelling, discard the - * old data of the cluster, taking care not to - * discard any of its pages that have already - * been allocated by racing tasks (offset has - * already stepped over any at the beginning). 
- */ - if (offset < si->highest_alloc && - si->lowest_alloc <= last_in_cluster) - last_in_cluster = si->lowest_alloc - 1; - si->flags |= SWP_DISCARDING; - spin_unlock(&si->lock); - - if (offset < last_in_cluster) - discard_swap_cluster(si, offset, - last_in_cluster - offset + 1); - - spin_lock(&si->lock); - si->lowest_alloc = 0; - si->flags &= ~SWP_DISCARDING; - - smp_mb(); /* wake_up_bit advises this */ - wake_up_bit(&si->flags, ilog2(SWP_DISCARDING)); - - } else if (si->flags & SWP_DISCARDING) { - /* - * Delay using pages allocated by racing tasks - * until the whole discard has been issued. We - * could defer that delay until swap_writepage, - * but it's easier to keep this self-contained. - */ - spin_unlock(&si->lock); - wait_on_bit(&si->flags, ilog2(SWP_DISCARDING), - wait_for_discard, TASK_UNINTERRUPTIBLE); - spin_lock(&si->lock); - } else { - /* - * Note pages allocated by racing tasks while - * scan for a free cluster is in progress, so - * that its final discard can exclude them. - */ - if (offset < si->lowest_alloc) - si->lowest_alloc = offset; - if (offset > si->highest_alloc) - si->highest_alloc = offset; - } - } return offset; scan: @@ -1806,6 +1836,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) goto out_dput; } + flush_work(&p->discard_work); + destroy_swap_extents(p); if (p->flags & SWP_CONTINUED) free_swap_count_continuations(p); @@ -2172,6 +2204,8 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, cluster_set_null(&p->free_cluster_head); cluster_set_null(&p->free_cluster_tail); + cluster_set_null(&p->discard_cluster_head); + cluster_set_null(&p->discard_cluster_tail); for (i = 0; i < swap_header->info.nr_badpages; i++) { unsigned int page_nr = swap_header->info.badpages[i]; @@ -2281,6 +2315,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (IS_ERR(p)) return PTR_ERR(p); + INIT_WORK(&p->discard_work, swap_discard_work); + name = getname(specialfile); if (IS_ERR(name)) { error = PTR_ERR(name); From edfe23dac3e2981277087b05bec7fec7790d1835 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 11 Sep 2013 14:20:31 -0700 Subject: [PATCH 048/303] swap: fix races exposed by swap discard The previous patch can expose races, according to Hugh: swapoff was sometimes failing with "Cannot allocate memory", coming from try_to_unuse()'s -ENOMEM: it needs to allow for swap_duplicate() failing on a free entry temporarily SWAP_MAP_BAD while being discarded. We should use ACCESS_ONCE() there, and whenever accessing swap_map locklessly; but rather than peppering it throughout try_to_unuse(), just declare *swap_map with volatile. try_to_unuse() is accustomed to *swap_map going down racily, but not necessarily to it jumping up from 0 to SWAP_MAP_BAD: we'll be safer to prevent that transition once SWP_WRITEOK is switched off, when it's a waste of time to issue discards anyway (swapon can do a whole discard). Another issue is: In swapin_readahead(), read_swap_cache_async() can read a bad swap entry, because we don't check if readahead swap entry is bad. This doesn't break anything but such swapin page is wasteful and can only be freed at page reclaim. We should avoid read such swap entry. And in discard, we mark swap entry SWAP_MAP_BAD and then switch it to normal when discard is finished. If readahead reads such swap entry, we have the same issue, so we much check if swap entry is bad too. Thanks Hugh to inspire swapin_readahead could use bad swap entry. 
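The single-snapshot rule behind the ACCESS_ONCE()/volatile change can be sketched in portable C11. This is a model only: the kernel code here uses a volatile pointer rather than <stdatomic.h>, and should_skip() is an invented helper. The idea is to read the shared count exactly once and then test that one snapshot for both "free" and "temporarily SWAP_MAP_BAD"; without the forced single load, the compiler may legally re-read swap_map[i] between the two tests and observe two different states.

#include <stdatomic.h>
#include <stdio.h>

#define SWAP_MAP_BAD 0x3f	/* kernel's marker for an unusable slot */

/* One usage count per swap slot, updated concurrently by other threads
 * (e.g. the discard worker flipping a cluster to/from SWAP_MAP_BAD). */
static _Atomic unsigned char swap_map[4096];

/* Returns 1 if the unuse loop should simply skip this slot for now. */
static int should_skip(unsigned long offset)
{
	/* Single snapshot of the shared byte. */
	unsigned char count = atomic_load_explicit(&swap_map[offset],
						   memory_order_relaxed);

	/* Free, or marked bad while its cluster is being discarded:
	 * either way there is nothing to unuse right now. */
	return count == 0 || count == SWAP_MAP_BAD;
}

int main(void)
{
	atomic_store(&swap_map[1], SWAP_MAP_BAD);
	printf("%d %d\n", should_skip(0), should_skip(1));	/* 1 1 */
	return 0;
}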
[include Hugh's patch 'swap: fix swapoff ENOMEMs from discard'] Signed-off-by: Shaohua Li Signed-off-by: Hugh Dickins Cc: Rik van Riel Cc: Minchan Kim Cc: Kyungmin Park Cc: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index dac47c66055c..98e52e373bd8 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -370,7 +370,8 @@ static void dec_cluster_info_page(struct swap_info_struct *p, * instead of free it immediately. The cluster will be freed * after discard. */ - if (p->flags & SWP_PAGE_DISCARD) { + if ((p->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) == + (SWP_WRITEOK | SWP_PAGE_DISCARD)) { swap_cluster_schedule_discard(p, idx); return; } @@ -1288,7 +1289,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, else continue; } - count = si->swap_map[i]; + count = ACCESS_ONCE(si->swap_map[i]); if (count && swap_count(count) != SWAP_MAP_BAD) break; } @@ -1308,7 +1309,11 @@ int try_to_unuse(unsigned int type, bool frontswap, { struct swap_info_struct *si = swap_info[type]; struct mm_struct *start_mm; - unsigned char *swap_map; + volatile unsigned char *swap_map; /* swap_map is accessed without + * locking. Mark it as volatile + * to prevent compiler doing + * something odd. + */ unsigned char swcount; struct page *page; swp_entry_t entry; @@ -1359,7 +1364,15 @@ int try_to_unuse(unsigned int type, bool frontswap, * reused since sys_swapoff() already disabled * allocation from here, or alloc_page() failed. */ - if (!*swap_map) + swcount = *swap_map; + /* + * We don't hold lock here, so the swap entry could be + * SWAP_MAP_BAD (when the cluster is discarding). + * Instead of fail out, We can just skip the swap + * entry because swapoff will wait for discarding + * finish anyway. + */ + if (!swcount || swcount == SWAP_MAP_BAD) continue; retval = -ENOMEM; break; @@ -2543,6 +2556,16 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage) goto unlock_out; count = p->swap_map[offset]; + + /* + * swapin_readahead() doesn't check if a swap entry is valid, so the + * swap entry could be SWAP_MAP_BAD. Check here with lock held. + */ + if (unlikely(swap_count(count) == SWAP_MAP_BAD)) { + err = -ENOENT; + goto unlock_out; + } + has_cache = count & SWAP_HAS_CACHE; count &= ~SWAP_HAS_CACHE; err = 0; From ebc2a1a69111eadfeda8487e577f1a5d42ef0dae Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 11 Sep 2013 14:20:32 -0700 Subject: [PATCH 049/303] swap: make cluster allocation per-cpu swap cluster allocation is to get better request merge to improve performance. But the cluster is shared globally, if multiple tasks are doing swap, this will cause interleave disk access. While multiple tasks swap is quite common, for example, each numa node has a kswapd thread doing swap and multiple threads/processes doing direct page reclaim. ioscheduler can't help too much here, because tasks don't send swapout IO down to block layer in the meantime. Block layer does merge some IOs, but a lot not, depending on how many tasks are doing swapout concurrently. In practice, I've seen a lot of small size IO in swapout workloads. We makes the cluster allocation per-cpu here. The interleave disk access issue goes away. All tasks swapout to their own cluster, so swapout will become sequential, which can be easily merged to big size IO. 
If one CPU can't get its per-cpu cluster (for example, there is no free cluster anymore in the swap), it will fall back to scanning swap_map. The CPU can still continue to swap. We don't need to recycle free swap entries of other CPUs. In my test (swap to a 2-disk raid0 partition), this improves swapout throughput by around 10%, and request size is increased significantly. How this impacts swap readahead is uncertain though. On one side, page reclaim always isolates and swaps several adjacent pages, and this will make page reclaim write the pages sequentially and benefit readahead. On the other side, several CPUs writing pages in an interleaved way means the pages don't live _sequentially_ but relatively _near_. In the per-cpu allocation case, if adjacent pages are written by different CPUs, they will live relatively _far_ apart. So how this impacts swap readahead depends on how many pages page reclaim isolates and swaps at one time. If the number is big, this patch will benefit swap readahead. Of course, this is about the sequential access pattern. The patch has no impact for a random access pattern, because the new cluster allocation algorithm is just for SSD. An alternative solution is organizing the swap layout to be per-mm instead of this per-cpu approach. In the per-mm layout, we allocate a disk range for each mm, so pages of one mm live adjacently on the swap disk. The per-mm layout has potential lock contention issues if multiple reclaimers are swapping pages from one mm. For a sequential workload, the per-mm layout is better for implementing swap readahead, because pages from the mm are adjacent on disk. But the per-cpu layout isn't very bad in this workload, as page reclaim always isolates and swaps several pages at one time; such pages will still live on disk sequentially and readahead can utilize this. For a random workload, the per-mm layout isn't beneficial for request merging, because it's quite possible pages from different mms are swapped out at the same time and IO can't be merged in the per-mm layout, while with the per-cpu layout we can merge requests from any mm. Considering that the random workload is more common in workloads with swap (and the per-cpu approach isn't too bad for the sequential workload either), I'm choosing the per-cpu layout. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Shaohua Li Cc: Rik van Riel Cc: Minchan Kim Cc: Kyungmin Park Cc: Hugh Dickins Cc: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 11 ++++ mm/swapfile.c | 125 +++++++++++++++++++++++++++++++------------ 2 files changed, 102 insertions(+), 34 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 8a3c4a1caa14..24db9142e93b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -198,6 +198,16 @@ struct swap_cluster_info { #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ #define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ +/* + * We assign a cluster to each CPU, so each CPU can allocate swap entry from + * its own cluster and swapout sequentially. The purpose is to optimize swapout + * throughput. + */ +struct percpu_cluster { + struct swap_cluster_info index; /* Current cluster index */ + unsigned int next; /* Likely next allocation offset */ +}; + /* + * The in-memory structure used to track swap areas.
*/ @@ -217,6 +227,7 @@ struct swap_info_struct { unsigned int inuse_pages; /* number of those currently in use */ unsigned int cluster_next; /* likely index for next allocation */ unsigned int cluster_nr; /* countdown to next cluster search */ + struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */ struct swap_extent *curr_swap_extent; struct swap_extent first_swap_extent; struct block_device *bdev; /* swap device or bdev of swap file */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 98e52e373bd8..3963fc24fcc1 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -392,13 +392,78 @@ static void dec_cluster_info_page(struct swap_info_struct *p, * It's possible scan_swap_map() uses a free cluster in the middle of free * cluster list. Avoiding such abuse to avoid list corruption. */ -static inline bool scan_swap_map_recheck_cluster(struct swap_info_struct *si, +static bool +scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, unsigned long offset) { + struct percpu_cluster *percpu_cluster; + bool conflict; + offset /= SWAPFILE_CLUSTER; - return !cluster_is_null(&si->free_cluster_head) && + conflict = !cluster_is_null(&si->free_cluster_head) && offset != cluster_next(&si->free_cluster_head) && cluster_is_free(&si->cluster_info[offset]); + + if (!conflict) + return false; + + percpu_cluster = this_cpu_ptr(si->percpu_cluster); + cluster_set_null(&percpu_cluster->index); + return true; +} + +/* + * Try to get a swap entry from current cpu's swap entry pool (a cluster). This + * might involve allocating a new cluster for current CPU too. + */ +static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, + unsigned long *offset, unsigned long *scan_base) +{ + struct percpu_cluster *cluster; + bool found_free; + unsigned long tmp; + +new_cluster: + cluster = this_cpu_ptr(si->percpu_cluster); + if (cluster_is_null(&cluster->index)) { + if (!cluster_is_null(&si->free_cluster_head)) { + cluster->index = si->free_cluster_head; + cluster->next = cluster_next(&cluster->index) * + SWAPFILE_CLUSTER; + } else if (!cluster_is_null(&si->discard_cluster_head)) { + /* + * we don't have free cluster but have some clusters in + * discarding, do discard now and reclaim them + */ + swap_do_scheduled_discard(si); + *scan_base = *offset = si->cluster_next; + goto new_cluster; + } else + return; + } + + found_free = false; + + /* + * Other CPUs can use our cluster if they can't find a free cluster, + * check if there is still free entry in the cluster + */ + tmp = cluster->next; + while (tmp < si->max && tmp < (cluster_next(&cluster->index) + 1) * + SWAPFILE_CLUSTER) { + if (!si->swap_map[tmp]) { + found_free = true; + break; + } + tmp++; + } + if (!found_free) { + cluster_set_null(&cluster->index); + goto new_cluster; + } + cluster->next = tmp + 1; + *offset = tmp; + *scan_base = tmp; } static unsigned long scan_swap_map(struct swap_info_struct *si, @@ -423,41 +488,17 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, si->flags += SWP_SCANNING; scan_base = offset = si->cluster_next; + /* SSD algorithm */ + if (si->cluster_info) { + scan_swap_map_try_ssd_cluster(si, &offset, &scan_base); + goto checks; + } + if (unlikely(!si->cluster_nr--)) { if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { si->cluster_nr = SWAPFILE_CLUSTER - 1; goto checks; } -check_cluster: - if (!cluster_is_null(&si->free_cluster_head)) { - offset = cluster_next(&si->free_cluster_head) * - SWAPFILE_CLUSTER; - last_in_cluster = offset + SWAPFILE_CLUSTER - 1; - si->cluster_next = offset; - 
si->cluster_nr = SWAPFILE_CLUSTER - 1; - goto checks; - } else if (si->cluster_info) { - /* - * we don't have free cluster but have some clusters in - * discarding, do discard now and reclaim them - */ - if (!cluster_is_null(&si->discard_cluster_head)) { - si->cluster_nr = 0; - swap_do_scheduled_discard(si); - scan_base = offset = si->cluster_next; - if (!si->cluster_nr) - goto check_cluster; - si->cluster_nr--; - goto checks; - } - - /* - * Checking free cluster is fast enough, we can do the - * check every time - */ - si->cluster_nr = 0; - goto checks; - } spin_unlock(&si->lock); @@ -516,8 +557,10 @@ check_cluster: } checks: - if (scan_swap_map_recheck_cluster(si, offset)) - goto check_cluster; + if (si->cluster_info) { + while (scan_swap_map_ssd_cluster_conflict(si, offset)) + scan_swap_map_try_ssd_cluster(si, &offset, &scan_base); + } if (!(si->flags & SWP_WRITEOK)) goto no_page; if (!si->highest_bit) @@ -1884,6 +1927,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); frontswap_invalidate_area(type); mutex_unlock(&swapon_mutex); + free_percpu(p->percpu_cluster); + p->percpu_cluster = NULL; vfree(swap_map); vfree(cluster_info); vfree(frontswap_map); @@ -2403,6 +2448,16 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = -ENOMEM; goto bad_swap; } + p->percpu_cluster = alloc_percpu(struct percpu_cluster); + if (!p->percpu_cluster) { + error = -ENOMEM; + goto bad_swap; + } + for_each_possible_cpu(i) { + struct percpu_cluster *cluster; + cluster = per_cpu_ptr(p->percpu_cluster, i); + cluster_set_null(&cluster->index); + } } error = swap_cgroup_swapon(p->type, maxpages); @@ -2475,6 +2530,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = 0; goto out; bad_swap: + free_percpu(p->percpu_cluster); + p->percpu_cluster = NULL; if (inode && S_ISBLK(inode->i_mode) && p->bdev) { set_blocksize(p->bdev, p->old_block_size); blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); From b8af29418a0269d2c12f563add54a95cc19471fb Mon Sep 17 00:00:00 2001 From: Pintu Kumar Date: Wed, 11 Sep 2013 14:20:34 -0700 Subject: [PATCH 050/303] mm/page_alloc.c: fix coding style and spelling Fix all errors reported by checkpatch and some small spelling mistakes. Signed-off-by: Pintu Kumar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6cf157637df3..2ca3e9bd739c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -721,7 +721,8 @@ static bool free_pages_prepare(struct page *page, unsigned int order) return false; if (!PageHighMem(page)) { - debug_check_no_locks_freed(page_address(page),PAGE_SIZE<wait_table) return -ENOMEM; - for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) + for (i = 0; i < zone->wait_table_hash_nr_entries; ++i) init_waitqueue_head(zone->wait_table + i); return 0; @@ -4930,7 +4931,7 @@ static unsigned long __init early_calculate_totalpages(void) if (pages) node_set_state(nid, N_MEMORY); } - return totalpages; + return totalpages; } /* @@ -5047,7 +5048,7 @@ restart: /* * Some kernelcore has been met, update counts and * break if the kernelcore for this node has been - * satisified + * satisfied */ required_kernelcore -= min(required_kernelcore, size_pages); @@ -5061,7 +5062,7 @@ restart: * If there is still required_kernelcore, we do another pass with one * less node in the count. 
This will push zone_movable_pfn[nid] further * along on the nodes that still have memory until kernelcore is - * satisified + * satisfied */ usable_nodes--; if (usable_nodes && required_kernelcore > usable_nodes) @@ -5286,8 +5287,10 @@ void __init mem_init_print_info(const char *str) * 3) .rodata.* may be embedded into .text or .data sections. */ #define adj_init_size(start, end, size, pos, adj) \ - if (start <= pos && pos < end && size > adj) \ - size -= adj; + do { \ + if (start <= pos && pos < end && size > adj) \ + size -= adj; \ + } while (0) adj_init_size(__init_begin, __init_end, init_data_size, _sinittext, init_code_size); @@ -5570,7 +5573,7 @@ static void __meminit setup_per_zone_inactive_ratio(void) * we want it large (64MB max). But it is not linear, because network * bandwidth does not increase linearly with machine size. We use * - * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: + * min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy: * min_free_kbytes = sqrt(lowmem_kbytes * 16) * * which yields @@ -5614,11 +5617,11 @@ int __meminit init_per_zone_wmark_min(void) module_init(init_per_zone_wmark_min) /* - * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so + * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so * that we can call two helper functions whenever min_free_kbytes * changes. */ -int min_free_kbytes_sysctl_handler(ctl_table *table, int write, +int min_free_kbytes_sysctl_handler(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec(table, write, buffer, length, ppos); @@ -5682,8 +5685,8 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, /* * percpu_pagelist_fraction - changes the pcp->high for each zone on each - * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist - * can have before it gets flushed back to buddy allocator. + * cpu. It is the fraction of total pages in each zone that a hot per cpu + * pagelist can have before it gets flushed back to buddy allocator. */ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) @@ -5901,7 +5904,7 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, * This function checks whether pageblock includes unmovable pages or not. * If @count is not zero, it is okay to include less @count unmovable pages * - * PageLRU check wihtout isolation or lru_lock could race so that + * PageLRU check without isolation or lru_lock could race so that * MIGRATE_MOVABLE block might include unmovable pages. It means you can't * expect this function should be exact. */ From fef903efcf0cb9721f3f2da719daec9bbc26f12b Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Wed, 11 Sep 2013 14:20:35 -0700 Subject: [PATCH 051/303] mm/page_alloc.c: restructure free-page stealing code and fix a bug The free-page stealing code in __rmqueue_fallback() is somewhat hard to follow, and has an incredible amount of subtlety hidden inside! First off, there is a minor bug in the reporting of change-of-ownership of pageblocks. Under some conditions, we try to move up to 'pageblock_nr_pages' pages to the preferred allocation list. But we change the ownership of that pageblock to the preferred type only if we manage to successfully move at least half of that pageblock (or if page_group_by_mobility_disabled is set).
However, the current code ignores the latter part and sets the 'migratetype' variable to the preferred type, irrespective of whether we actually changed the pageblock migratetype of that block or not. So, the page_alloc_extfrag tracepoint can end up printing incorrect info (i.e., 'change_ownership' might be shown as 1 when it must have been 0). So fixing this involves moving the update of the 'migratetype' variable to the right place. But looking closer, we observe that the 'migratetype' variable is used subsequently for checks such as "is_migrate_cma()". Obviously the intent there is to check if the *fallback* type is MIGRATE_CMA, but since we already set the 'migratetype' variable to start_migratetype, we end up checking if the *preferred* type is MIGRATE_CMA!! To make things more interesting, this actually doesn't cause a bug in practice, because we never change *anything* if the fallback type is CMA. So, restructure the code in such a way that it is trivial to understand what is going on, and also fix the above mentioned bug. And while at it, also add a comment explaining the subtlety behind the migratetype used in the call to expand(). [akpm@linux-foundation.org: remove unneeded `inline', small coding-style fix] Signed-off-by: Srivatsa S. Bhat Cc: Mel Gorman Cc: Minchan Kim Cc: Cody P Schafer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 95 ++++++++++++++++++++++++++++++------------------- 1 file changed, 59 insertions(+), 36 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2ca3e9bd739c..b09ce5fe0cd2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1008,6 +1008,52 @@ static void change_pageblock_range(struct page *pageblock_page, } } +/* + * If breaking a large block of pages, move all free pages to the preferred + * allocation list. If falling back for a reclaimable kernel allocation, be + * more aggressive about taking ownership of free pages. + * + * On the other hand, never change migration type of MIGRATE_CMA pageblocks + * nor move CMA pages to different free lists. We don't want unmovable pages + * to be allocated from MIGRATE_CMA areas. + * + * Returns the new migratetype of the pageblock (or the same old migratetype + * if it was unchanged). 
+ */ +static int try_to_steal_freepages(struct zone *zone, struct page *page, + int start_type, int fallback_type) +{ + int current_order = page_order(page); + + if (is_migrate_cma(fallback_type)) + return fallback_type; + + /* Take ownership for orders >= pageblock_order */ + if (current_order >= pageblock_order) { + change_pageblock_range(page, current_order, start_type); + return start_type; + } + + if (current_order >= pageblock_order / 2 || + start_type == MIGRATE_RECLAIMABLE || + page_group_by_mobility_disabled) { + int pages; + + pages = move_freepages_block(zone, page, start_type); + + /* Claim the whole block if over half of it is free */ + if (pages >= (1 << (pageblock_order-1)) || + page_group_by_mobility_disabled) { + + set_pageblock_migratetype(page, start_type); + return start_type; + } + + } + + return fallback_type; +} + /* Remove an element from the buddy allocator from the fallback list */ static inline struct page * __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) @@ -1015,7 +1061,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) struct free_area *area; int current_order; struct page *page; - int migratetype, i; + int migratetype, new_type, i; /* Find the largest possible block of pages in the other list */ for (current_order = MAX_ORDER-1; current_order >= order; @@ -1035,51 +1081,28 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) struct page, lru); area->nr_free--; - /* - * If breaking a large block of pages, move all free - * pages to the preferred allocation list. If falling - * back for a reclaimable kernel allocation, be more - * aggressive about taking ownership of free pages - * - * On the other hand, never change migration - * type of MIGRATE_CMA pageblocks nor move CMA - * pages on different free lists. We don't - * want unmovable pages to be allocated from - * MIGRATE_CMA areas. - */ - if (!is_migrate_cma(migratetype) && - (current_order >= pageblock_order / 2 || - start_migratetype == MIGRATE_RECLAIMABLE || - page_group_by_mobility_disabled)) { - int pages; - pages = move_freepages_block(zone, page, - start_migratetype); - - /* Claim the whole block if over half of it is free */ - if (pages >= (1 << (pageblock_order-1)) || - page_group_by_mobility_disabled) - set_pageblock_migratetype(page, - start_migratetype); - - migratetype = start_migratetype; - } + new_type = try_to_steal_freepages(zone, page, + start_migratetype, + migratetype); /* Remove the page from the freelists */ list_del(&page->lru); rmv_page_order(page); - /* Take ownership for orders >= pageblock_order */ - if (current_order >= pageblock_order && - !is_migrate_cma(migratetype)) - change_pageblock_range(page, current_order, - start_migratetype); - + /* + * Borrow the excess buddy pages as well, irrespective + * of whether we stole freepages, or took ownership of + * the pageblock or not. + * + * Exception: When borrowing from MIGRATE_CMA, release + * the excess buddy pages to CMA itself. + */ expand(zone, page, order, current_order, area, is_migrate_cma(migratetype) ? migratetype : start_migratetype); trace_mm_page_alloc_extfrag(page, order, current_order, - start_migratetype, migratetype); + start_migratetype, new_type); return page; } From f92310c1877fc73470bdcd9228758fa3713c191b Mon Sep 17 00:00:00 2001 From: "Srivatsa S. 
Bhat" Date: Wed, 11 Sep 2013 14:20:36 -0700 Subject: [PATCH 052/303] mm/page_alloc.c: fix the value of fallback_migratetype in alloc_extfrag tracepoint() In the current code, the value of fallback_migratetype that is printed using the mm_page_alloc_extfrag tracepoint, is the value of the migratetype *after* it has been set to the preferred migratetype (if the ownership was changed). Obviously that wouldn't have been the original intent. (We already have a separate 'change_ownership' field to tell whether the ownership of the pageblock was changed from the fallback_migratetype to the preferred type.) The intent of the fallback_migratetype field is to show the migratetype from which we borrowed pages in order to satisfy the allocation request. So fix the code to print that value correctly. Signed-off-by: Srivatsa S. Bhat Cc: Mel Gorman Cc: Minchan Kim Cc: Cody P Schafer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/kmem.h | 10 +++++++--- mm/page_alloc.c | 5 +++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 6bc943ecb841..d0c613476620 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -268,11 +268,13 @@ TRACE_EVENT(mm_page_alloc_extfrag, TP_PROTO(struct page *page, int alloc_order, int fallback_order, - int alloc_migratetype, int fallback_migratetype), + int alloc_migratetype, int fallback_migratetype, + int change_ownership), TP_ARGS(page, alloc_order, fallback_order, - alloc_migratetype, fallback_migratetype), + alloc_migratetype, fallback_migratetype, + change_ownership), TP_STRUCT__entry( __field( struct page *, page ) @@ -280,6 +282,7 @@ TRACE_EVENT(mm_page_alloc_extfrag, __field( int, fallback_order ) __field( int, alloc_migratetype ) __field( int, fallback_migratetype ) + __field( int, change_ownership ) ), TP_fast_assign( @@ -288,6 +291,7 @@ TRACE_EVENT(mm_page_alloc_extfrag, __entry->fallback_order = fallback_order; __entry->alloc_migratetype = alloc_migratetype; __entry->fallback_migratetype = fallback_migratetype; + __entry->change_ownership = change_ownership; ), TP_printk("page=%p pfn=%lu alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d", @@ -299,7 +303,7 @@ TRACE_EVENT(mm_page_alloc_extfrag, __entry->alloc_migratetype, __entry->fallback_migratetype, __entry->fallback_order < pageblock_order, - __entry->alloc_migratetype == __entry->fallback_migratetype) + __entry->change_ownership) ); #endif /* _TRACE_KMEM_H */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b09ce5fe0cd2..2748fc6a9003 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1101,8 +1101,9 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) is_migrate_cma(migratetype) ? migratetype : start_migratetype); - trace_mm_page_alloc_extfrag(page, order, current_order, - start_migratetype, new_type); + trace_mm_page_alloc_extfrag(page, order, + current_order, start_migratetype, migratetype, + new_type == start_migratetype); return page; } From e2d0bd2b924d74d5e0d4f395f8f4730d125e198c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 11 Sep 2013 14:20:37 -0700 Subject: [PATCH 053/303] mm: kill one if loop in __free_pages_bootmem() We should not check loop+1 with loop end in loop body. Just duplicate two lines code to avoid it. That will help a bit when we have huge amount of pages on system with 16TiB memory. 
Signed-off-by: Yinghai Lu Cc: Mel Gorman Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2748fc6a9003..8c68ef13cefa 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -751,19 +751,19 @@ static void __free_pages_ok(struct page *page, unsigned int order) void __init __free_pages_bootmem(struct page *page, unsigned int order) { unsigned int nr_pages = 1 << order; + struct page *p = page; unsigned int loop; - prefetchw(page); - for (loop = 0; loop < nr_pages; loop++) { - struct page *p = &page[loop]; - - if (loop + 1 < nr_pages) - prefetchw(p + 1); + prefetchw(p); + for (loop = 0; loop < (nr_pages - 1); loop++, p++) { + prefetchw(p + 1); __ClearPageReserved(p); set_page_count(p, 0); } + __ClearPageReserved(p); + set_page_count(p, 0); - page_zone(page)->managed_pages += 1 << order; + page_zone(page)->managed_pages += nr_pages; set_page_refcounted(page); __free_pages(page, order); } From a8f531ebc33052642b4bd7b812eedf397108ce64 Mon Sep 17 00:00:00 2001 From: Libin Date: Wed, 11 Sep 2013 14:20:38 -0700 Subject: [PATCH 054/303] mm/huge_memory.c: fix potential NULL pointer dereference In collapse_huge_page() there is a race window between releasing the mmap_sem read lock and taking the mmap_sem write lock, so find_vma() may return NULL. So check the return value to avoid NULL pointer dereference. collapse_huge_page khugepaged_alloc_page up_read(&mm->mmap_sem) down_write(&mm->mmap_sem) vma = find_vma(mm, address) Signed-off-by: Libin Acked-by: Kirill A. Shutemov Reviewed-by: Wanpeng Li Reviewed-by: Michal Hocko Cc: # v3.0+ Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 8b7fc2025e04..963e14c0486f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2296,6 +2296,8 @@ static void collapse_huge_page(struct mm_struct *mm, goto out; vma = find_vma(mm, address); + if (!vma) + goto out; hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (address < hstart || address + HPAGE_PMD_SIZE > hend) From 892f795df1eb119b560a3ee5a1ca3f385a852e84 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 11 Sep 2013 14:20:39 -0700 Subject: [PATCH 055/303] mm: vmscan: fix numa reclaim balance problem in kswapd The way the page allocator interacts with kswapd creates aging imbalances, where the amount of time a userspace page gets in memory under reclaim pressure is dependent on which zone, which node the allocator took the page frame from. #1 fixes missed kswapd wakeups on NUMA systems, which lead to some nodes falling behind for a full reclaim cycle relative to the other nodes in the system #3 fixes an interaction where kswapd and a continuous stream of page allocations keep the preferred zone of a task between the high and low watermark (allocations succeed + kswapd does not go to sleep) indefinitely, completely underutilizing the lower zones and thrashing on the preferred zone These patches are the aging fairness part of the thrash-detection based file LRU balancing. Andrea recommended to submit them separately as they are bugfixes in their own right. 
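Before the numbers, a minimal model of the wakeup-condition change made by the first patch (stand-in struct and values, not kernel code; the real zone_balanced() additionally accounts for compaction's order-0 needs):

    #include <stdio.h>

    struct zone_model {
            long free_pages;
            long low_wmark;
            long high_wmark;
    };

    /* old condition: skip the wakeup once free pages creep above low */
    static int skip_wakeup_old(const struct zone_model *z)
    {
            return z->free_pages >= z->low_wmark;
    }

    /* new condition: keep waking kswapd until high is restored */
    static int skip_wakeup_new(const struct zone_model *z)
    {
            return z->free_pages >= z->high_wmark;
    }

    int main(void)
    {
            /* a zone hovering between the two watermarks */
            struct zone_model z = { 110, 100, 150 };

            printf("old skips wakeup: %d, new skips wakeup: %d\n",
                   skip_wakeup_old(&z), skip_wakeup_new(&z)); /* 1, 0 */
            return 0;
    }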
The following test ran a foreground workload (memcachetest) with background IO of various sizes on a 4 node 8G system (similar results were observed with single-node 4G systems): parallelio BAS FAIRALLO BASE FAIRALLOC Ops memcachetest-0M 5170.00 ( 0.00%) 5283.00 ( 2.19%) Ops memcachetest-791M 4740.00 ( 0.00%) 5293.00 ( 11.67%) Ops memcachetest-2639M 2551.00 ( 0.00%) 4950.00 ( 94.04%) Ops memcachetest-4487M 2606.00 ( 0.00%) 3922.00 ( 50.50%) Ops io-duration-0M 0.00 ( 0.00%) 0.00 ( 0.00%) Ops io-duration-791M 55.00 ( 0.00%) 18.00 ( 67.27%) Ops io-duration-2639M 235.00 ( 0.00%) 103.00 ( 56.17%) Ops io-duration-4487M 278.00 ( 0.00%) 173.00 ( 37.77%) Ops swaptotal-0M 0.00 ( 0.00%) 0.00 ( 0.00%) Ops swaptotal-791M 245184.00 ( 0.00%) 0.00 ( 0.00%) Ops swaptotal-2639M 468069.00 ( 0.00%) 108778.00 ( 76.76%) Ops swaptotal-4487M 452529.00 ( 0.00%) 76623.00 ( 83.07%) Ops swapin-0M 0.00 ( 0.00%) 0.00 ( 0.00%) Ops swapin-791M 108297.00 ( 0.00%) 0.00 ( 0.00%) Ops swapin-2639M 169537.00 ( 0.00%) 50031.00 ( 70.49%) Ops swapin-4487M 167435.00 ( 0.00%) 34178.00 ( 79.59%) Ops minorfaults-0M 1518666.00 ( 0.00%) 1503993.00 ( 0.97%) Ops minorfaults-791M 1676963.00 ( 0.00%) 1520115.00 ( 9.35%) Ops minorfaults-2639M 1606035.00 ( 0.00%) 1799717.00 (-12.06%) Ops minorfaults-4487M 1612118.00 ( 0.00%) 1583825.00 ( 1.76%) Ops majorfaults-0M 6.00 ( 0.00%) 0.00 ( 0.00%) Ops majorfaults-791M 13836.00 ( 0.00%) 10.00 ( 99.93%) Ops majorfaults-2639M 22307.00 ( 0.00%) 6490.00 ( 70.91%) Ops majorfaults-4487M 21631.00 ( 0.00%) 4380.00 ( 79.75%) BAS FAIRALLO BASE FAIRALLOC User 287.78 460.97 System 2151.67 3142.51 Elapsed 9737.00 8879.34 BAS FAIRALLO BASE FAIRALLOC Minor Faults 53721925 57188551 Major Faults 392195 15157 Swap Ins 2994854 112770 Swap Outs 4907092 134982 Direct pages scanned 0 41824 Kswapd pages scanned 32975063 8128269 Kswapd pages reclaimed 6323069 7093495 Direct pages reclaimed 0 41824 Kswapd efficiency 19% 87% Kswapd velocity 3386.573 915.414 Direct efficiency 100% 100% Direct velocity 0.000 4.710 Percentage direct scans 0% 0% Zone normal velocity 2011.338 550.661 Zone dma32 velocity 1365.623 369.221 Zone dma velocity 9.612 0.242 Page writes by reclaim 18732404.000 614807.000 Page writes file 13825312 479825 Page writes anon 4907092 134982 Page reclaim immediate 85490 5647 Sector Reads 12080532 483244 Sector Writes 88740508 65438876 Page rescued immediate 0 0 Slabs scanned 82560 12160 Direct inode steals 0 0 Kswapd inode steals 24401 40013 Kswapd skipped wait 0 0 THP fault alloc 6 8 THP collapse alloc 5481 5812 THP splits 75 22 THP fault fallback 0 0 THP collapse fail 0 0 Compaction stalls 0 54 Compaction success 0 45 Compaction failures 0 9 Page migrate success 881492 82278 Page migrate failure 0 0 Compaction pages isolated 0 60334 Compaction migrate scanned 0 53505 Compaction free scanned 0 1537605 Compaction cost 914 86 NUMA PTE updates 46738231 41988419 NUMA hint faults 31175564 24213387 NUMA hint local faults 10427393 6411593 NUMA pages migrated 881492 55344 AutoNUMA cost 156221 121361 The overall runtime was reduced, throughput for both the foreground workload as well as the background IO improved, major faults, swapping and reclaim activity shrunk significantly, reclaim efficiency more than quadrupled. This patch: When the page allocator fails to get a page from all zones in its given zonelist, it wakes up the per-node kswapds for all zones that are at their low watermark. 
However, with a system under load the free pages in a zone can fluctuate enough that the allocation fails but the kswapd wakeup is also skipped while the zone is still really close to the low watermark. When one node misses a wakeup like this, it won't be aged before all the other node's zones are down to their low watermarks again. And skipping a full aging cycle is an obvious fairness problem. Kswapd runs until the high watermarks are restored, so it should also be woken when the high watermarks are not met. This ages nodes more equally and creates a safety margin for the page counter fluctuation. By using zone_balanced(), it will now check, in addition to the watermark, if compaction requires more order-0 pages to create a higher order page. Signed-off-by: Johannes Weiner Cc: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Paul Bolle Tested-by: Zlatko Calusic Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 2cff0d491c6d..758540d3ca83 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3237,7 +3237,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) } if (!waitqueue_active(&pgdat->kswapd_wait)) return; - if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) + if (zone_balanced(zone, order, 0, 0)) return; trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); From e085dbc52fad8d79fa2245339c84bf3ef0b3a802 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 11 Sep 2013 14:20:46 -0700 Subject: [PATCH 056/303] mm: page_alloc: rearrange watermark checking in get_page_from_freelist Allocations that do not have to respect the watermarks are rare high-priority events. Reorder the code such that per-zone dirty limits and future checks important only to regular page allocations are ignored in these extraordinary situations. 
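A compressed sketch of the resulting check order (stand-in flag values and returns, not the kernel function): the rare no-watermark case branches out before the dirty-limit and watermark logic is even reached:

    #include <stdio.h>

    #define ALLOC_WMARK_LOW         0x01
    #define ALLOC_NO_WATERMARKS     0x04

    static int zone_usable(int alloc_flags, long free, long mark, int dirty_ok)
    {
            /* rare high-priority allocations skip every regular check */
            if (alloc_flags & ALLOC_NO_WATERMARKS)
                    return 1;               /* "goto try_this_zone" */

            if (!dirty_ok)
                    return 0;               /* "goto this_zone_full" */

            return free >= mark;            /* regular watermark test */
    }

    int main(void)
    {
            printf("%d\n", zone_usable(ALLOC_NO_WATERMARKS, 0, 100, 0)); /* 1 */
            printf("%d\n", zone_usable(ALLOC_WMARK_LOW, 50, 100, 1));    /* 0 */
            return 0;
    }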
Signed-off-by: Johannes Weiner Cc: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Paul Bolle Tested-by: Zlatko Calusic Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8c68ef13cefa..9884aa0f233a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1889,12 +1889,17 @@ zonelist_scan: */ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) { + unsigned long mark; + if (IS_ENABLED(CONFIG_NUMA) && zlc_active && !zlc_zone_worth_trying(zonelist, z, allowednodes)) continue; if ((alloc_flags & ALLOC_CPUSET) && !cpuset_zone_allowed_softwall(zone, gfp_mask)) continue; + BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); + if (alloc_flags & ALLOC_NO_WATERMARKS) + goto try_this_zone; /* * When allocating a page cache page for writing, we * want to get it from a zone that is within its dirty @@ -1925,16 +1930,11 @@ zonelist_scan: (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) goto this_zone_full; - BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); - if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { - unsigned long mark; + mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; + if (!zone_watermark_ok(zone, order, mark, + classzone_idx, alloc_flags)) { int ret; - mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; - if (zone_watermark_ok(zone, order, mark, - classzone_idx, alloc_flags)) - goto try_this_zone; - if (IS_ENABLED(CONFIG_NUMA) && !did_zlc_setup && nr_online_nodes > 1) { /* From 81c0a2bb515fd4daae8cab64352877480792b515 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 11 Sep 2013 14:20:47 -0700 Subject: [PATCH 057/303] mm: page_alloc: fair zone allocator policy Each zone that holds userspace pages of one workload must be aged at a speed proportional to the zone size. Otherwise, the time an individual page gets to stay in memory depends on the zone it happened to be allocated in. Asymmetry in the zone aging creates rather unpredictable aging behavior and results in the wrong pages being reclaimed, activated etc. But exactly this happens right now because of the way the page allocator and kswapd interact. The page allocator uses per-node lists of all zones in the system, ordered by preference, when allocating a new page. When the first iteration does not yield any results, kswapd is woken up and the allocator retries. Due to the way kswapd reclaims zones below the high watermark while a zone can be allocated from when it is above the low watermark, the allocator may keep kswapd running while kswapd reclaim ensures that the page allocator can keep allocating from the first zone in the zonelist for extended periods of time. Meanwhile the other zones rarely see new allocations and thus get aged much slower in comparison. The result is that the occasional page placed in lower zones gets relatively more time in memory, even gets promoted to the active list after its peers have long been evicted. Meanwhile, the bulk of the working set may be thrashing on the preferred zone even though there may be significant amounts of memory available in the lower zones. Even the most basic test -- repeatedly reading a file slightly bigger than memory -- shows how broken the zone aging is. 
In this scenario, no single page should be able stay in memory long enough to get referenced twice and activated, but activation happens in spades: $ grep active_file /proc/zoneinfo nr_inactive_file 0 nr_active_file 0 nr_inactive_file 0 nr_active_file 8 nr_inactive_file 1582 nr_active_file 11994 $ cat data data data data >/dev/null $ grep active_file /proc/zoneinfo nr_inactive_file 0 nr_active_file 70 nr_inactive_file 258753 nr_active_file 443214 nr_inactive_file 149793 nr_active_file 12021 Fix this with a very simple round robin allocator. Each zone is allowed a batch of allocations that is proportional to the zone's size, after which it is treated as full. The batch counters are reset when all zones have been tried and the allocator enters the slowpath and kicks off kswapd reclaim. Allocation and reclaim is now fairly spread out to all available/allowable zones: $ grep active_file /proc/zoneinfo nr_inactive_file 0 nr_active_file 0 nr_inactive_file 174 nr_active_file 4865 nr_inactive_file 53 nr_active_file 860 $ cat data data data data >/dev/null $ grep active_file /proc/zoneinfo nr_inactive_file 0 nr_active_file 0 nr_inactive_file 666622 nr_active_file 4988 nr_inactive_file 190969 nr_active_file 937 When zone_reclaim_mode is enabled, allocations will now spread out to all zones on the local node, not just the first preferred zone (which on a 4G node might be a tiny Normal zone). Signed-off-by: Johannes Weiner Acked-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Paul Bolle Cc: Zlatko Calusic Tested-by: Kevin Hilman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 1 + mm/page_alloc.c | 72 ++++++++++++++++++++++++++++++++++++------ mm/vmstat.c | 1 + 3 files changed, 64 insertions(+), 10 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index af4a3b77a8de..ac1ea796ec0f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -105,6 +105,7 @@ struct zone_padding { enum zone_stat_item { /* First 128 byte cacheline (assuming 64 bit words) */ NR_FREE_PAGES, + NR_ALLOC_BATCH, NR_LRU_BASE, NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */ NR_ACTIVE_ANON, /* " " " " " */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9884aa0f233a..544d19d681a2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1551,6 +1551,7 @@ again: get_pageblock_migratetype(page)); } + __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(preferred_zone, zone, gfp_flags); local_irq_restore(flags); @@ -1817,6 +1818,11 @@ static void zlc_clear_zones_full(struct zonelist *zonelist) bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); } +static bool zone_local(struct zone *local_zone, struct zone *zone) +{ + return node_distance(local_zone->node, zone->node) == LOCAL_DISTANCE; +} + static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); @@ -1854,6 +1860,11 @@ static void zlc_clear_zones_full(struct zonelist *zonelist) { } +static bool zone_local(struct zone *local_zone, struct zone *zone) +{ + return true; +} + static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { return true; @@ -1900,6 +1911,26 @@ zonelist_scan: BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); if (alloc_flags & ALLOC_NO_WATERMARKS) goto try_this_zone; + /* + * Distribute pages in proportion to the individual + * zone size to ensure fair page aging. 
The zone a + * page was allocated in should have no effect on the + * time the page has in memory before being reclaimed. + * + * When zone_reclaim_mode is enabled, try to stay in + * local zones in the fastpath. If that fails, the + * slowpath is entered, which will do another pass + * starting with the local zones, but ultimately fall + * back to remote zones that do not partake in the + * fairness round-robin cycle of this zonelist. + */ + if (alloc_flags & ALLOC_WMARK_LOW) { + if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) + continue; + if (zone_reclaim_mode && + !zone_local(preferred_zone, zone)) + continue; + } /* * When allocating a page cache page for writing, we * want to get it from a zone that is within its dirty @@ -2346,16 +2377,30 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, return page; } -static inline -void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, - enum zone_type high_zoneidx, - enum zone_type classzone_idx) +static void prepare_slowpath(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, + enum zone_type high_zoneidx, + struct zone *preferred_zone) { struct zoneref *z; struct zone *zone; - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) - wakeup_kswapd(zone, order, classzone_idx); + for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + if (!(gfp_mask & __GFP_NO_KSWAPD)) + wakeup_kswapd(zone, order, zone_idx(preferred_zone)); + /* + * Only reset the batches of zones that were actually + * considered in the fast path, we don't want to + * thrash fairness information for zones that are not + * actually part of this zonelist's round-robin cycle. + */ + if (zone_reclaim_mode && !zone_local(preferred_zone, zone)) + continue; + mod_zone_page_state(zone, NR_ALLOC_BATCH, + high_wmark_pages(zone) - + low_wmark_pages(zone) - + zone_page_state(zone, NR_ALLOC_BATCH)); + } } static inline int @@ -2451,9 +2496,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; restart: - if (!(gfp_mask & __GFP_NO_KSWAPD)) - wake_all_kswapd(order, zonelist, high_zoneidx, - zone_idx(preferred_zone)); + prepare_slowpath(gfp_mask, order, zonelist, + high_zoneidx, preferred_zone); /* * OK, we're below the kswapd watermark and have kicked background @@ -4753,8 +4797,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, spin_lock_init(&zone->lru_lock); zone_seqlock_init(zone); zone->zone_pgdat = pgdat; - zone_pcp_init(zone); + + /* For bootup, initialized properly in watermark setup */ + mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages); + lruvec_init(&zone->lruvec); if (!size) continue; @@ -5525,6 +5572,11 @@ static void __setup_per_zone_wmarks(void) zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); + __mod_zone_page_state(zone, NR_ALLOC_BATCH, + high_wmark_pages(zone) - + low_wmark_pages(zone) - + zone_page_state(zone, NR_ALLOC_BATCH)); + setup_zone_migrate_reserve(zone); spin_unlock_irqrestore(&zone->lock, flags); } diff --git a/mm/vmstat.c b/mm/vmstat.c index ca06e9653827..8a8da1f9b044 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -703,6 +703,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, const char * const vmstat_text[] = { /* Zoned VM counters */ "nr_free_pages", + "nr_alloc_batch", "nr_inactive_anon", "nr_active_anon", "nr_inactive_file", From 72457c0a05ed06f978d3a8a7c9d5ad527db88b4c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 11 Sep 2013 14:20:49 
-0700 Subject: [PATCH 058/303] mm: revert "page-writeback.c: subtract min_free_kbytes from dirtyable memory" This reverts commit 75f7ad8e043d. It was the result of a problem observed with a 3.2 kernel and merged in 3.9, while the issue had been resolved upstream in 3.3 (commit ab8fabd46f81: "mm: exclude reserved pages from dirtyable memory"). The "reserved pages" are a superset of min_free_kbytes, thus this change is redundant and confusing. Revert it. Signed-off-by: Johannes Weiner Cc: Paul Szabo Cc: Rik van Riel Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3f0c895c71fe..d374b29296dd 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -241,9 +241,6 @@ static unsigned long global_dirtyable_memory(void) if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); - /* Subtract min_free_kbytes */ - x -= min_t(unsigned long, x, min_free_kbytes >> (PAGE_SHIFT - 10)); - return x + 1; /* Ensure that we never return 0 */ } From 9966c4bbb110003ee218c5c4df583041b57027c4 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:20:50 -0700 Subject: [PATCH 059/303] mm, hugetlb: move up the code which checks availability of free huge pages At this point we are holding the hugetlb_lock, so the hstate values can't change. If we don't have any usable free huge page at this point, we don't need to proceed with the rest of the processing. So move this check up. Signed-off-by: Joonsoo Kim Acked-by: Michal Hocko Reviewed-by: Wanpeng Li Reviewed-by: Aneesh Kumar K.V Cc: Aneesh Kumar K.V Acked-by: Hillf Danton Cc: Naoya Horiguchi Cc: Rik van Riel Cc: Mel Gorman Cc: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Davidlohr Bueso Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6e514831bda5..a87903578810 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -539,10 +539,6 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, struct zoneref *z; unsigned int cpuset_mems_cookie; -retry_cpuset: - cpuset_mems_cookie = get_mems_allowed(); - zonelist = huge_zonelist(vma, address, - htlb_alloc_mask, &mpol, &nodemask); /* * A child process with MAP_PRIVATE mappings created by their parent * have no page reserves. This check ensures that reservations are @@ -556,6 +552,11 @@ retry_cpuset: if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) goto err; +retry_cpuset: + cpuset_mems_cookie = get_mems_allowed(); + zonelist = huge_zonelist(vma, address, + htlb_alloc_mask, &mpol, &nodemask); + for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) { @@ -574,7 +575,6 @@ retry_cpuset: return page; err: - mpol_cond_put(mpol); return NULL; } From c748c26294600d7e4f0d1f2f61449d3a740f102f Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:20:57 -0700 Subject: [PATCH 060/303] mm, hugetlb: trivial commenting fix The name of the mutex written in the comment is wrong. Fix it.
Signed-off-by: Joonsoo Kim Acked-by: Michal Hocko Acked-by: Hillf Danton Reviewed-by: Aneesh Kumar K.V Cc: Aneesh Kumar K.V Cc: Naoya Horiguchi Cc: Wanpeng Li Cc: Rik van Riel Cc: Mel Gorman Cc: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Davidlohr Bueso Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a87903578810..bfca1b00b09b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -135,9 +135,9 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) * across the pages in a mapping. * * The region data structures are protected by a combination of the mmap_sem - * and the hugetlb_instantion_mutex. To access or modify a region the caller + * and the hugetlb_instantiation_mutex. To access or modify a region the caller * must either hold the mmap_sem for write, or the mmap_sem for read and - * the hugetlb_instantiation mutex: + * the hugetlb_instantiation_mutex: * * down_write(&mm->mmap_sem); * or From 81a6fcae3ff3f6af1c9d7e31499e68fda2b3f58d Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:20:58 -0700 Subject: [PATCH 061/303] mm, hugetlb: clean-up alloc_huge_page() Unify successful allocation paths to make the code more readable. There are no functional changes. Signed-off-by: Joonsoo Kim Acked-by: Michal Hocko Reviewed-by: Wanpeng Li Reviewed-by: Aneesh Kumar K.V Cc: Hillf Danton Cc: Naoya Horiguchi Cc: Wanpeng Li Cc: Rik van Riel Cc: Mel Gorman Cc: "Aneesh Kumar K.V" Cc: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Davidlohr Bueso Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bfca1b00b09b..a698d40d1c3e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1166,12 +1166,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, } spin_lock(&hugetlb_lock); page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); - if (page) { - /* update page cgroup details */ - hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), - h_cg, page); - spin_unlock(&hugetlb_lock); - } else { + if (!page) { spin_unlock(&hugetlb_lock); page = alloc_buddy_huge_page(h, NUMA_NO_NODE); if (!page) { @@ -1182,11 +1177,11 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, return ERR_PTR(-ENOSPC); } spin_lock(&hugetlb_lock); - hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), - h_cg, page); list_move(&page->lru, &h->hugepage_activelist); - spin_unlock(&hugetlb_lock); + /* Fall through */ } + hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); + spin_unlock(&hugetlb_lock); set_page_private(page, (unsigned long)spool); From b2261026825ed34066b24069359d118098bb1876 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:21:00 -0700 Subject: [PATCH 062/303] mm, hugetlb: fix and clean-up node iteration code to alloc or free The current node iteration code has a minor problem: it does one more node rotation if the allocation fails. For example, if we start allocating at node 0, we stop iterating at node 0, but we then start at node 1 for the next allocation. I introduce new macros "for_each_node_mask_to_[alloc|free]" and fix and clean up the node iteration code for alloc and free. This makes the code more understandable.
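The iteration idiom these macros rely on can be shown in a self-contained form (a simplified round-robin over a fixed node count, with hypothetical helper names). Note how `|| 1` keeps the for-condition true even when the returned node is 0:

    #include <stdio.h>

    #define NR_NODES 4

    /* advance the saved position and return the node to try next */
    static int next_node_rr(int *start)
    {
            int node = *start;

            *start = (*start + 1) % NR_NODES;
            return node;
    }

    /* visit exactly NR_NODES nodes; "|| 1" tolerates node == 0 */
    #define for_each_node_rr(nr, node, start)                       \
            for (nr = NR_NODES;                                     \
                 nr > 0 && ((node = next_node_rr(start)) || 1);     \
                 nr--)

    int main(void)
    {
            int start = 2, nr, node;

            for_each_node_rr(nr, node, &start)
                    printf("try node %d\n", node);  /* 2 3 0 1 */
            return 0;
    }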
Signed-off-by: Joonsoo Kim Reviewed-by: Aneesh Kumar K.V Acked-by: Hillf Danton Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Wanpeng Li Cc: Rik van Riel Cc: Mel Gorman Cc: "Aneesh Kumar K.V" Cc: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Davidlohr Bueso Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 143 ++++++++++++++++++++++----------------------------- 1 file changed, 61 insertions(+), 82 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a698d40d1c3e..08b7595fe3c1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -772,33 +772,6 @@ static int hstate_next_node_to_alloc(struct hstate *h, return nid; } -static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) -{ - struct page *page; - int start_nid; - int next_nid; - int ret = 0; - - start_nid = hstate_next_node_to_alloc(h, nodes_allowed); - next_nid = start_nid; - - do { - page = alloc_fresh_huge_page_node(h, next_nid); - if (page) { - ret = 1; - break; - } - next_nid = hstate_next_node_to_alloc(h, nodes_allowed); - } while (next_nid != start_nid); - - if (ret) - count_vm_event(HTLB_BUDDY_PGALLOC); - else - count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); - - return ret; -} - /* * helper for free_pool_huge_page() - return the previously saved * node ["this node"] from which to free a huge page. Advance the @@ -817,6 +790,40 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) return nid; } +#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \ + for (nr_nodes = nodes_weight(*mask); \ + nr_nodes > 0 && \ + ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \ + nr_nodes--) + +#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ + for (nr_nodes = nodes_weight(*mask); \ + nr_nodes > 0 && \ + ((node = hstate_next_node_to_free(hs, mask)) || 1); \ + nr_nodes--) + +static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) +{ + struct page *page; + int nr_nodes, node; + int ret = 0; + + for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { + page = alloc_fresh_huge_page_node(h, node); + if (page) { + ret = 1; + break; + } + } + + if (ret) + count_vm_event(HTLB_BUDDY_PGALLOC); + else + count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); + + return ret; +} + /* * Free huge page from pool from next node to free. * Attempt to keep persistent huge pages more or less @@ -826,36 +833,31 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, bool acct_surplus) { - int start_nid; - int next_nid; + int nr_nodes, node; int ret = 0; - start_nid = hstate_next_node_to_free(h, nodes_allowed); - next_nid = start_nid; - - do { + for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { /* * If we're returning unused surplus pages, only examine * nodes with surplus pages. 
*/ - if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) && - !list_empty(&h->hugepage_freelists[next_nid])) { + if ((!acct_surplus || h->surplus_huge_pages_node[node]) && + !list_empty(&h->hugepage_freelists[node])) { struct page *page = - list_entry(h->hugepage_freelists[next_nid].next, + list_entry(h->hugepage_freelists[node].next, struct page, lru); list_del(&page->lru); h->free_huge_pages--; - h->free_huge_pages_node[next_nid]--; + h->free_huge_pages_node[node]--; if (acct_surplus) { h->surplus_huge_pages--; - h->surplus_huge_pages_node[next_nid]--; + h->surplus_huge_pages_node[node]--; } update_and_free_page(h, page); ret = 1; break; } - next_nid = hstate_next_node_to_free(h, nodes_allowed); - } while (next_nid != start_nid); + } return ret; } @@ -1192,14 +1194,12 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, int __weak alloc_bootmem_huge_page(struct hstate *h) { struct huge_bootmem_page *m; - int nr_nodes = nodes_weight(node_states[N_MEMORY]); + int nr_nodes, node; - while (nr_nodes) { + for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { void *addr; - addr = __alloc_bootmem_node_nopanic( - NODE_DATA(hstate_next_node_to_alloc(h, - &node_states[N_MEMORY])), + addr = __alloc_bootmem_node_nopanic(NODE_DATA(node), huge_page_size(h), huge_page_size(h), 0); if (addr) { @@ -1211,7 +1211,6 @@ int __weak alloc_bootmem_huge_page(struct hstate *h) m = addr; goto found; } - nr_nodes--; } return 0; @@ -1350,48 +1349,28 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count, static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, int delta) { - int start_nid, next_nid; - int ret = 0; + int nr_nodes, node; VM_BUG_ON(delta != -1 && delta != 1); - if (delta < 0) - start_nid = hstate_next_node_to_alloc(h, nodes_allowed); - else - start_nid = hstate_next_node_to_free(h, nodes_allowed); - next_nid = start_nid; - - do { - int nid = next_nid; - if (delta < 0) { - /* - * To shrink on this node, there must be a surplus page - */ - if (!h->surplus_huge_pages_node[nid]) { - next_nid = hstate_next_node_to_alloc(h, - nodes_allowed); - continue; - } + if (delta < 0) { + for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { + if (h->surplus_huge_pages_node[node]) + goto found; } - if (delta > 0) { - /* - * Surplus cannot exceed the total number of pages - */ - if (h->surplus_huge_pages_node[nid] >= - h->nr_huge_pages_node[nid]) { - next_nid = hstate_next_node_to_free(h, - nodes_allowed); - continue; - } + } else { + for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) { + if (h->surplus_huge_pages_node[node] < + h->nr_huge_pages_node[node]) + goto found; } + } + return 0; - h->surplus_huge_pages += delta; - h->surplus_huge_pages_node[nid] += delta; - ret = 1; - break; - } while (next_nid != start_nid); - - return ret; +found: + h->surplus_huge_pages += delta; + h->surplus_huge_pages_node[node] += delta; + return 1; } #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) From c0d934ba278935fa751057091fe4a7c02d814f68 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:21:02 -0700 Subject: [PATCH 063/303] mm, hugetlb: remove redundant list_empty check in gather_surplus_pages() If list is empty, list_for_each_entry_safe() doesn't do anything. So, this check is redundant. Remove it. 
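The property relied on here is easy to confirm in userspace with a minimal open-coded version of the safe-iteration loop (modeled on include/linux/list.h, heavily simplified):

    #include <stdio.h>

    struct list_head { struct list_head *next, *prev; };

    #define LIST_HEAD_INIT(name) { &(name), &(name) }

    int main(void)
    {
            struct list_head empty = LIST_HEAD_INIT(empty);
            struct list_head *pos, *n;
            int iterations = 0;

            /* list_for_each_safe on an empty list: the body never runs */
            for (pos = empty.next, n = pos->next; pos != &empty;
                 pos = n, n = pos->next)
                    iterations++;

            printf("iterations on empty list: %d\n", iterations); /* 0 */
            return 0;
    }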
Signed-off-by: Joonsoo Kim Acked-by: Michal Hocko Reviewed-by: Wanpeng Li Reviewed-by: Aneesh Kumar K.V Acked-by: Hillf Danton Cc: Naoya Horiguchi Cc: Wanpeng Li Cc: Rik van Riel Cc: Mel Gorman Cc: "Aneesh Kumar K.V" Cc: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Davidlohr Bueso Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 08b7595fe3c1..a13be48b818b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1037,11 +1037,8 @@ free: spin_unlock(&hugetlb_lock); /* Free unnecessary surplus pages to the buddy allocator */ - if (!list_empty(&surplus_list)) { - list_for_each_entry_safe(page, tmp, &surplus_list, lru) { - put_page(page); - } - } + list_for_each_entry_safe(page, tmp, &surplus_list, lru) + put_page(page); spin_lock(&hugetlb_lock); return ret; From 37a2140dc2145a6f154172286944a1861e978dfd Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:21:04 -0700 Subject: [PATCH 064/303] mm, hugetlb: do not use a page in page cache for cow optimization Currently, we use a page in the page cache with a map count of 1 for the COW optimization. If we find this condition, we don't allocate a new page and copy the contents; instead, we map this page directly. This may introduce a problem where writing to a private mapping overwrites the hugetlb file directly. You can reproduce this situation with the following code (a self-contained version appears below). size = 20 * MB; flag = MAP_SHARED; p = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, fd, 0); if (p == MAP_FAILED) { fprintf(stderr, "mmap() failed: %s\n", strerror(errno)); return -1; } p[0] = 's'; fprintf(stdout, "BEFORE STEAL PRIVATE WRITE: %c\n", p[0]); munmap(p, size); flag = MAP_PRIVATE; p = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, fd, 0); if (p == MAP_FAILED) { fprintf(stderr, "mmap() failed: %s\n", strerror(errno)); } p[0] = 'c'; munmap(p, size); flag = MAP_SHARED; p = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, fd, 0); if (p == MAP_FAILED) { fprintf(stderr, "mmap() failed: %s\n", strerror(errno)); return -1; } fprintf(stdout, "AFTER STEAL PRIVATE WRITE: %c\n", p[0]); munmap(p, size); We can see "AFTER STEAL PRIVATE WRITE: c", not "AFTER STEAL PRIVATE WRITE: s". If we turn this optimization off for pages in the page cache, the problem disappears. So, I change the trigger condition of the optimization: if the page is not an anonymous page (PageAnon), we skip the optimization. This turns the optimization off for page cache pages.
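For convenience, here is a fleshed-out, self-contained version of the reproducer above (a sketch: the includes, main(), error handling, and the hugetlbfs file path are assumptions; it needs a hugetlbfs mount with enough free 2MB huge pages):

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/mman.h>

    #define MB (1024UL * 1024UL)

    int main(void)
    {
            /* assumed hugetlbfs mount point; adjust to the local setup */
            int fd = open("/mnt/huge/cow-test", O_CREAT | O_RDWR, 0600);
            size_t size = 20 * MB;
            char *p;

            if (fd < 0) { perror("open"); return 1; }

            p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
            if (p == MAP_FAILED) { perror("mmap shared"); return 1; }
            p[0] = 's';
            printf("BEFORE STEAL PRIVATE WRITE: %c\n", p[0]);
            munmap(p, size);

            p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
            if (p == MAP_FAILED) { perror("mmap private"); return 1; }
            p[0] = 'c';     /* a private write must not reach the file */
            munmap(p, size);

            p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
            if (p == MAP_FAILED) { perror("mmap shared"); return 1; }
            /* prints 's' on a fixed kernel, 'c' on a broken one */
            printf("AFTER STEAL PRIVATE WRITE: %c\n", p[0]);
            munmap(p, size);
            close(fd);
            return 0;
    }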
Signed-off-by: Joonsoo Kim Acked-by: Michal Hocko Reviewed-by: Wanpeng Li Reviewed-by: Naoya Horiguchi Cc: Aneesh Kumar K.V Acked-by: Hillf Danton Cc: Rik van Riel Cc: Mel Gorman Cc: "Aneesh Kumar K.V" Cc: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Davidlohr Bueso Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a13be48b818b..da027a3307af 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2528,7 +2528,6 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, { struct hstate *h = hstate_vma(vma); struct page *old_page, *new_page; - int avoidcopy; int outside_reserve = 0; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ @@ -2538,10 +2537,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, retry_avoidcopy: /* If no-one else is actually using this page, avoid the copy * and just make the page writable */ - avoidcopy = (page_mapcount(old_page) == 1); - if (avoidcopy) { - if (PageAnon(old_page)) - page_move_anon_rmap(old_page, vma, address); + if (page_mapcount(old_page) == 1 && PageAnon(old_page)) { + page_move_anon_rmap(old_page, vma, address); set_huge_ptep_writable(vma, address, ptep); return 0; } From 72231b03ccf126ca04fba8359998ef7dfd195577 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:21:06 -0700 Subject: [PATCH 065/303] mm, hugetlb: add VM_NORESERVE check in vma_has_reserves() If we map the region with MAP_NORESERVE and MAP_SHARED, we skip the reserve accounting, and eventually we are not guaranteed to get a huge page at fault time. With the following example code, you can easily reproduce this situation (a self-contained version appears below). Assume 2MB, nr_hugepages = 100 fd = hugetlbfs_unlinked_fd(); if (fd < 0) return 1; size = 200 * MB; flag = MAP_SHARED; p = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, fd, 0); if (p == MAP_FAILED) { fprintf(stderr, "mmap() failed: %s\n", strerror(errno)); return -1; } size = 2 * MB; flag = MAP_ANONYMOUS | MAP_SHARED | MAP_HUGETLB | MAP_NORESERVE; p = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, -1, 0); if (p == MAP_FAILED) { fprintf(stderr, "mmap() failed: %s\n", strerror(errno)); } p[0] = '0'; sleep(10); While sleep(10) is executing, run 'cat /proc/meminfo' from another process. HugePages_Free: 99 HugePages_Rsvd: 100 The number of free pages should be greater than or equal to the number of reserved pages, but it isn't. This shows that a non-reserved shared mapping steals a reserved page. Non-reserved shared mappings should not eat into reserve space. If we consider VM_NORESERVE in vma_has_reserves() and return 0, meaning we don't have reserved pages, then we check that we have enough free pages in dequeue_huge_page_vma(). This prevents stealing a reserved page. With this change, the above test generates a SIGBUS, which is correct, because all free pages are reserved and a non-reserved shared mapping can't get a free page.
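A self-contained version of this reproducer, for reference (a sketch: hugetlbfs_unlinked_fd() comes from the libhugetlbfs test harness, so a plain open() on an assumed hugetlbfs mount stands in here; MAP_HUGETLB and MAP_NORESERVE need Linux headers):

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/mman.h>

    #define MB (1024UL * 1024UL)

    int main(void)
    {
            /* stand-in for hugetlbfs_unlinked_fd(): assumed mount point */
            int fd = open("/mnt/huge/resv-test", O_CREAT | O_RDWR, 0600);
            size_t size;
            char *p;

            if (fd < 0) { perror("open"); return 1; }

            size = 200 * MB;        /* reserves all 100 x 2MB huge pages */
            p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
            if (p == MAP_FAILED) { perror("mmap file"); return 1; }

            size = 2 * MB;
            p = mmap(NULL, size, PROT_READ | PROT_WRITE,
                     MAP_ANONYMOUS | MAP_SHARED | MAP_HUGETLB | MAP_NORESERVE,
                     -1, 0);
            if (p == MAP_FAILED) { perror("mmap noreserve"); return 1; }

            /* on a fixed kernel this write raises SIGBUS; on a broken
             * one it silently steals a reserved huge page */
            p[0] = '0';
            sleep(10);      /* inspect /proc/meminfo from another shell */
            return 0;
    }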
Signed-off-by: Joonsoo Kim Reviewed-by: Wanpeng Li Reviewed-by: Aneesh Kumar K.V Acked-by: Hillf Danton Acked-by: Michal Hocko Cc: Naoya Horiguchi Cc: Rik van Riel Cc: Mel Gorman Cc: "Aneesh Kumar K.V" Cc: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Davidlohr Bueso Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index da027a3307af..cb134e6a9fee 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -464,6 +464,8 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma) /* Returns true if the VMA has associated reserve pages */ static int vma_has_reserves(struct vm_area_struct *vma) { + if (vma->vm_flags & VM_NORESERVE) + return 0; if (vma->vm_flags & VM_MAYSHARE) return 1; if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) From a63884e921cb33a6beb260fa88bcbf1712d98a9a Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:21:07 -0700 Subject: [PATCH 066/303] mm, hugetlb: remove decrement_hugepage_resv_vma() Now the checking condition of decrement_hugepage_resv_vma() and vma_has_reserves() is the same, so we can clean up this function using vma_has_reserves(). Additionally, decrement_hugepage_resv_vma() has only one call site, so we can remove the function and embed it into dequeue_huge_page_vma() directly. This patch implements that. Signed-off-by: Joonsoo Kim Reviewed-by: Wanpeng Li Reviewed-by: Aneesh Kumar K.V Acked-by: Hillf Danton Acked-by: Michal Hocko Cc: Naoya Horiguchi Cc: Rik van Riel Cc: Mel Gorman Cc: "Aneesh Kumar K.V" Cc: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Davidlohr Bueso Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cb134e6a9fee..dacf0d2256d9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -434,25 +434,6 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) return (get_vma_private_data(vma) & flag) != 0; } -/* Decrement the reserved pages in the hugepage pool by one */ -static void decrement_hugepage_resv_vma(struct hstate *h, - struct vm_area_struct *vma) -{ - if (vma->vm_flags & VM_NORESERVE) - return; - - if (vma->vm_flags & VM_MAYSHARE) { - /* Shared mappings always use reserves */ - h->resv_huge_pages--; - } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { - /* - * Only the process that called mmap() has reserves for - * private mappings. - */ - h->resv_huge_pages--; - } -} - /* Reset counters to 0 and clear all HPAGE_RESV_* flags */ void reset_vma_resv_huge_pages(struct vm_area_struct *vma) { @@ -466,10 +447,18 @@ static int vma_has_reserves(struct vm_area_struct *vma) { if (vma->vm_flags & VM_NORESERVE) return 0; + + /* Shared mappings always use reserves */ if (vma->vm_flags & VM_MAYSHARE) return 1; + + /* + * Only the process that called mmap() has reserves for + * private mappings.
+ */ if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) return 1; + return 0; } @@ -564,8 +553,8 @@ retry_cpuset: if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) { page = dequeue_huge_page_node(h, zone_to_nid(zone)); if (page) { - if (!avoid_reserve) - decrement_hugepage_resv_vma(h, vma); + if (!avoid_reserve && vma_has_reserves(vma)) + h->resv_huge_pages--; break; } } From af0ed73e699bb0453603b1d1a4727377641b2096 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:21:18 -0700 Subject: [PATCH 067/303] mm, hugetlb: decrement reserve count if VM_NORESERVE alloc page cache If a vma with VM_NORESERVE allocates a new page for the page cache, we should check whether this area is reserved or not. If this address is already reserved by another process (the chg == 0 case), we should decrement the reserve count, because this allocated page will go into the page cache and currently there is no way to know, when releasing the inode, whether the page came from the reserved pool or not. This may introduce an over-counting problem in the reserve count. With the following example code, you can easily reproduce this situation. Assume 2MB, nr_hugepages = 100 size = 20 * MB; flag = MAP_SHARED; p = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, fd, 0); if (p == MAP_FAILED) { fprintf(stderr, "mmap() failed: %s\n", strerror(errno)); return -1; } flag = MAP_SHARED | MAP_NORESERVE; q = mmap(NULL, size, PROT_READ|PROT_WRITE, flag, fd, 0); if (q == MAP_FAILED) { fprintf(stderr, "mmap() failed: %s\n", strerror(errno)); } q[0] = 'c'; After the program finishes, run 'cat /proc/meminfo'. You will see the result below. HugePages_Free: 100 HugePages_Rsvd: 1 To fix this, we should check our mapping type and the tracked region. If our mapping is VM_NORESERVE and VM_MAYSHARE and chg is 0, this implies that the currently allocated page will go into a page cache region that was already reserved when the mapping was created. In this case, we should decrease the reserve count. By implementing the above, this patch solves the problem. [akpm@linux-foundation.org: fix spelling in comment] Signed-off-by: Joonsoo Kim Reviewed-by: Wanpeng Li Reviewed-by: Aneesh Kumar K.V Acked-by: Hillf Danton Acked-by: Michal Hocko Cc: Naoya Horiguchi Cc: Rik van Riel Cc: Mel Gorman Cc: "Aneesh Kumar K.V" Cc: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Davidlohr Bueso Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dacf0d2256d9..5b084c7b34c6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -443,10 +443,23 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma) } /* Returns true if the VMA has associated reserve pages */ -static int vma_has_reserves(struct vm_area_struct *vma) +static int vma_has_reserves(struct vm_area_struct *vma, long chg) { - if (vma->vm_flags & VM_NORESERVE) - return 0; + if (vma->vm_flags & VM_NORESERVE) { + /* + * This address is already reserved by other process(chg == 0), + * so, we should decrement reserved count. Without decrementing, + * reserve count remains after releasing inode, because this + * allocated page will go into page cache and is regarded as + * coming from reserved pool in releasing step. Currently, we + * don't have any other solution to deal with this situation + * properly, so add work-around here.
+ */ if (vma->vm_flags & VM_MAYSHARE && chg == 0) + return 1; + else + return 0; + } /* Shared mappings always use reserves */ if (vma->vm_flags & VM_MAYSHARE) @@ -520,7 +533,8 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid) static struct page *dequeue_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, - unsigned long address, int avoid_reserve) + unsigned long address, int avoid_reserve, + long chg) { struct page *page = NULL; struct mempolicy *mpol; @@ -535,7 +549,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, * have no page reserves. This check ensures that reservations are * not "stolen". The child may still get SIGKILLed */ - if (!vma_has_reserves(vma) && + if (!vma_has_reserves(vma, chg) && h->free_huge_pages - h->resv_huge_pages == 0) goto err; @@ -553,8 +567,12 @@ retry_cpuset: if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) { page = dequeue_huge_page_node(h, zone_to_nid(zone)); if (page) { - if (!avoid_reserve && vma_has_reserves(vma)) - h->resv_huge_pages--; + if (avoid_reserve) + break; + if (!vma_has_reserves(vma, chg)) + break; + + h->resv_huge_pages--; break; } } @@ -1155,7 +1173,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, return ERR_PTR(-ENOSPC); } spin_lock(&hugetlb_lock); - page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); + page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg); if (!page) { spin_unlock(&hugetlb_lock); page = alloc_buddy_huge_page(h, NUMA_NO_NODE); From 1da6f0e1b316d0215989fe4d7c657edead1fdea7 Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Wed, 11 Sep 2013 14:21:25 -0700 Subject: [PATCH 068/303] mm/mempolicy: return NULL if node is NUMA_NO_NODE in get_task_policy If node == NUMA_NO_NODE, pol is NULL, so we should return NULL instead of doing the "if (!pol->mode)" check. [akpm@linux-foundation.org: reorganise code] Signed-off-by: Jianguo Wu Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Hugh Dickins Cc: Hanjun Guo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 6b1d426731ae..27022ca890f8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -123,16 +123,19 @@ static struct mempolicy preferred_node_policy[MAX_NUMNODES]; static struct mempolicy *get_task_policy(struct task_struct *p) { struct mempolicy *pol = p->mempolicy; - int node; if (!pol) { - node = numa_node_id(); - if (node != NUMA_NO_NODE) - pol = &preferred_node_policy[node]; + int node = numa_node_id(); - /* preferred_node_policy is not initialised early in boot */ - if (!pol->mode) - pol = NULL; + if (node != NUMA_NO_NODE) { + pol = &preferred_node_policy[node]; + /* + * preferred_node_policy is not initialised early in + * boot + */ + if (!pol->mode) + pol = NULL; + } } return pol; From e66f09725771ac5b1b868d6b19eba84e30ffad88 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:21:26 -0700 Subject: [PATCH 069/303] mm, page_alloc: add unlikely macro to help compiler optimization We rarely allocate a page with ALLOC_NO_WATERMARKS, and it is used in the slow path. To help compiler optimization, add the unlikely macro to the ALLOC_NO_WATERMARKS check. This patch doesn't have any effect now, because gcc already optimizes this properly. But we cannot assume that gcc always does the right thing, and nobody re-evaluates whether gcc still does the proper optimization after their changes; for example, it is not optimized properly on v3.10.
So adding a compiler hint here is reasonable. Signed-off-by: Joonsoo Kim Acked-by: Johannes Weiner Cc: Minchan Kim Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 544d19d681a2..42c59300bacd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1909,7 +1909,7 @@ zonelist_scan: !cpuset_zone_allowed_softwall(zone, gfp_mask)) continue; BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); - if (alloc_flags & ALLOC_NO_WATERMARKS) + if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS)) goto try_this_zone; /* * Distribute pages in proportion to the individual
From bc4b4448dba660afc8df3790564320302d9709a1 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:21:28 -0700 Subject: [PATCH 070/303] mm: move pgtable related functions to right place Page table related functions mostly live in pgtable-generic.c, so move the remaining ones from memory.c to pgtable-generic.c. Signed-off-by: Joonsoo Kim Cc: Johannes Weiner Cc: Minchan Kim Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 24 ------------------------ mm/pgtable-generic.c | 24 ++++++++++++++++++++++++ 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index b3c6bf9a398e..c1c6d59b2b03 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -372,30 +372,6 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table) #endif /* CONFIG_HAVE_RCU_TABLE_FREE */ -/* - * If a p?d_bad entry is found while walking page tables, report - * the error, before resetting entry to p?d_none. Usually (but - * very seldom) called out from the p?d_none_or_clear_bad macros. - */ - -void pgd_clear_bad(pgd_t *pgd) -{ - pgd_ERROR(*pgd); - pgd_clear(pgd); -} - -void pud_clear_bad(pud_t *pud) -{ - pud_ERROR(*pud); - pud_clear(pud); -} - -void pmd_clear_bad(pmd_t *pmd) -{ - pmd_ERROR(*pmd); - pmd_clear(pmd); -} - /* * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index e1a6e4fab016..3929a40bd6c0 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -10,6 +10,30 @@ #include #include +/* + * If a p?d_bad entry is found while walking page tables, report + * the error, before resetting entry to p?d_none. Usually (but + * very seldom) called out from the p?d_none_or_clear_bad macros. + */ + +void pgd_clear_bad(pgd_t *pgd) +{ + pgd_ERROR(*pgd); + pgd_clear(pgd); +} + +void pud_clear_bad(pud_t *pud) +{ + pud_ERROR(*pud); + pud_clear(pud); +} + +void pmd_clear_bad(pmd_t *pmd) +{ + pmd_ERROR(*pmd); + pmd_clear(pmd); +} + #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS /* * Only sets the access flags (dirty, accessed), as well as write
From d2cf5ad6312ca9913464fac40fb47ba47ad945c4 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:21:29 -0700 Subject: [PATCH 071/303] swap: clean-up #ifdef in page_mapping() PageSwapCache() is always false when !CONFIG_SWAP, so the compiler properly discards the related code. Therefore, we don't need the explicit #ifdef.
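The pattern generalizes: when a predicate compiles to a constant zero, the guarded branch is provably dead and the compiler removes it, so no #ifdef is needed at the call site. A minimal userspace sketch of the idea (stand-in names, not the kernel's definitions):

	#include <stdio.h>

	/* Stand-in for PageSwapCache(): a constant 0 when the feature is
	 * compiled out, just like the !CONFIG_SWAP stub in the kernel. */
	#ifdef FEATURE_SWAP
	static int page_is_swapcache(unsigned long page) { return page & 1; }
	#else
	static int page_is_swapcache(unsigned long page) { (void)page; return 0; }
	#endif

	int main(void)
	{
		unsigned long page = 2;

		/* Without FEATURE_SWAP this branch is dead code and is
		 * discarded at compile time; no #ifdef around the caller. */
		if (page_is_swapcache(page))
			printf("swap cache path\n");
		else
			printf("regular mapping path\n");
		return 0;
	}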
Signed-off-by: Joonsoo Kim Acked-by: Johannes Weiner Cc: Minchan Kim Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 + mm/util.c | 5 +---- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 24db9142e93b..c03c139219c9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -447,6 +447,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) #else /* CONFIG_SWAP */ +#define swap_address_space(entry) (NULL) #define get_nr_swap_pages() 0L #define total_swap_pages 0L #define total_swapcache_pages() 0UL diff --git a/mm/util.c b/mm/util.c index 7441c41d00f6..eaf63fc2c92f 100644 --- a/mm/util.c +++ b/mm/util.c @@ -388,15 +388,12 @@ struct address_space *page_mapping(struct page *page) struct address_space *mapping = page->mapping; VM_BUG_ON(PageSlab(page)); -#ifdef CONFIG_SWAP if (unlikely(PageSwapCache(page))) { swp_entry_t entry; entry.val = page_private(page); mapping = swap_address_space(entry); - } else -#endif - if ((unsigned long)mapping & PAGE_MAPPING_ANON) + } else if ((unsigned long)mapping & PAGE_MAPPING_ANON) mapping = NULL; return mapping; } From 2bb921e526656556e68f99f5f15a4a1bf2691844 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 11 Sep 2013 14:21:30 -0700 Subject: [PATCH 072/303] vmstat: create separate function to fold per cpu diffs into local counters The main idea behind this patchset is to reduce the vmstat update overhead by avoiding interrupt enable/disable and the use of per cpu atomics. This patch (of 3): It is better to have a separate folding function because refresh_cpu_vm_stats() also does other things like expire pages in the page allocator caches. If we have a separate function then refresh_cpu_vm_stats() is only called from the local cpu which allows additional optimizations. The folding function is only called when a cpu is being downed and therefore no other processor will be accessing the counters. Also simplifies synchronization. 
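The split can be pictured with a minimal sketch, using plain arrays to stand in for the per-cpu counters (all names here are illustrative, not the kernel's):

	#define NR_ITEMS 4
	#define NR_CPUS  2

	static long vm_stat[NR_ITEMS];                /* global counters */
	static int  vm_stat_diff[NR_CPUS][NR_ITEMS];  /* per-cpu deltas  */

	/* Fold a downed cpu's deltas into the global counters. The cpu is
	 * offline, so nothing can race with us: no atomics, no irq games. */
	static void cpu_vm_stats_fold(int cpu)
	{
		for (int i = 0; i < NR_ITEMS; i++) {
			vm_stat[i] += vm_stat_diff[cpu][i];
			vm_stat_diff[cpu][i] = 0;
		}
	}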
[akpm@linux-foundation.org: fix UP build] Signed-off-by: Christoph Lameter Cc: KOSAKI Motohiro CC: Tejun Heo Cc: Joonsoo Kim Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 3 ++- mm/page_alloc.c | 2 +- mm/vmstat.c | 40 ++++++++++++++++++++++++++++++++++------ 3 files changed, 37 insertions(+), 8 deletions(-) diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index c586679b6fef..502767f4e4d4 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -198,7 +198,7 @@ extern void __inc_zone_state(struct zone *, enum zone_stat_item); extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); -void refresh_cpu_vm_stats(int); +void cpu_vm_stats_fold(int cpu); void refresh_zone_stat_thresholds(void); void drain_zonestat(struct zone *zone, struct per_cpu_pageset *); @@ -255,6 +255,7 @@ static inline void __dec_zone_page_state(struct page *page, static inline void refresh_cpu_vm_stats(int cpu) { } static inline void refresh_zone_stat_thresholds(void) { } +static inline void cpu_vm_stats_fold(int cpu) { } static inline void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) { } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 42c59300bacd..f885eb827159 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5435,7 +5435,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self, * This is only okay since the processor is dead and cannot * race with what we are doing. */ - refresh_cpu_vm_stats(cpu); + cpu_vm_stats_fold(cpu); } return NOTIFY_OK; } diff --git a/mm/vmstat.c b/mm/vmstat.c index 8a8da1f9b044..aaee66330e01 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -415,11 +415,7 @@ EXPORT_SYMBOL(dec_zone_page_state); #endif /* - * Update the zone counters for one cpu. - * - * The cpu specified must be either the current cpu or a processor that - * is not online. If it is the current cpu then the execution thread must - * be pinned to the current cpu. + * Update the zone counters for the current cpu. * * Note that refresh_cpu_vm_stats strives to only access * node local memory. The per cpu pagesets on remote zones are placed @@ -432,7 +428,7 @@ EXPORT_SYMBOL(dec_zone_page_state); * with the global counters. These could cause remote node cache line * bouncing and will have to be only done when necessary. */ -void refresh_cpu_vm_stats(int cpu) +static void refresh_cpu_vm_stats(int cpu) { struct zone *zone; int i; @@ -493,6 +489,38 @@ void refresh_cpu_vm_stats(int cpu) atomic_long_add(global_diff[i], &vm_stat[i]); } +/* + * Fold the data for an offline cpu into the global array. + * There cannot be any access by the offline cpu and therefore + * synchronization is simplified. + */ +void cpu_vm_stats_fold(int cpu) +{ + struct zone *zone; + int i; + int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; + + for_each_populated_zone(zone) { + struct per_cpu_pageset *p; + + p = per_cpu_ptr(zone->pageset, cpu); + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + if (p->vm_stat_diff[i]) { + int v; + + v = p->vm_stat_diff[i]; + p->vm_stat_diff[i] = 0; + atomic_long_add(v, &zone->vm_stat[i]); + global_diff[i] += v; + } + } + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + if (global_diff[i]) + atomic_long_add(global_diff[i], &vm_stat[i]); +} + /* * this is only called if !populated_zone(zone), which implies no other users of * pset->vm_stat_diff[] exsist. 
From 4edb0748b23887140578d68f5f4e6e2de337a481 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 11 Sep 2013 14:21:31 -0700 Subject: [PATCH 073/303] vmstat: create fold_diff Both functions that update global counters use the same mechanism. Create a function that contains the common code. Signed-off-by: Christoph Lameter Cc: KOSAKI Motohiro CC: Tejun Heo Cc: Joonsoo Kim Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index aaee66330e01..158ca6494bc6 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -414,6 +414,15 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item) EXPORT_SYMBOL(dec_zone_page_state); #endif +static inline void fold_diff(int *diff) +{ + int i; + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + if (diff[i]) + atomic_long_add(diff[i], &vm_stat[i]); +} + /* * Update the zone counters for the current cpu. * @@ -483,10 +492,7 @@ static void refresh_cpu_vm_stats(int cpu) drain_zone_pages(zone, &p->pcp); #endif } - - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - if (global_diff[i]) - atomic_long_add(global_diff[i], &vm_stat[i]); + fold_diff(global_diff); } /* @@ -516,9 +522,7 @@ void cpu_vm_stats_fold(int cpu) } } - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - if (global_diff[i]) - atomic_long_add(global_diff[i], &vm_stat[i]); + fold_diff(global_diff); } /* From fbc2edb05354480a88aa39db8a6acb5782fa1a1b Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 11 Sep 2013 14:21:32 -0700 Subject: [PATCH 074/303] vmstat: use this_cpu() to avoid irqon/off sequence in refresh_cpu_vm_stats Disabling interrupts repeatedly can be avoided in the inner loop if we use a this_cpu operation. Signed-off-by: Christoph Lameter Cc: KOSAKI Motohiro CC: Tejun Heo Cc: Joonsoo Kim Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 158ca6494bc6..d57a09143bf9 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -437,33 +437,29 @@ static inline void fold_diff(int *diff) * with the global counters. These could cause remote node cache line * bouncing and will have to be only done when necessary. */ -static void refresh_cpu_vm_stats(int cpu) +static void refresh_cpu_vm_stats(void) { struct zone *zone; int i; int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; for_each_populated_zone(zone) { - struct per_cpu_pageset *p; + struct per_cpu_pageset __percpu *p = zone->pageset; - p = per_cpu_ptr(zone->pageset, cpu); + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { + int v; - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - if (p->vm_stat_diff[i]) { - unsigned long flags; - int v; + v = this_cpu_xchg(p->vm_stat_diff[i], 0); + if (v) { - local_irq_save(flags); - v = p->vm_stat_diff[i]; - p->vm_stat_diff[i] = 0; - local_irq_restore(flags); atomic_long_add(v, &zone->vm_stat[i]); global_diff[i] += v; #ifdef CONFIG_NUMA /* 3 seconds idle till flush */ - p->expire = 3; + __this_cpu_write(p->expire, 3); #endif } + } cond_resched(); #ifdef CONFIG_NUMA /* @@ -473,23 +469,24 @@ static void refresh_cpu_vm_stats(int cpu) * Check if there are pages remaining in this pageset * if not then there is nothing to expire. */ - if (!p->expire || !p->pcp.count) + if (!__this_cpu_read(p->expire) || + !__this_cpu_read(p->pcp.count)) continue; /* * We never drain zones local to this processor. 
*/ if (zone_to_nid(zone) == numa_node_id()) { - p->expire = 0; + __this_cpu_write(p->expire, 0); continue; } - p->expire--; - if (p->expire) + + if (__this_cpu_dec_return(p->expire)) continue; - if (p->pcp.count) - drain_zone_pages(zone, &p->pcp); + if (__this_cpu_read(p->pcp.count)) + drain_zone_pages(zone, __this_cpu_ptr(&p->pcp)); #endif } fold_diff(global_diff); @@ -1216,7 +1213,7 @@ int sysctl_stat_interval __read_mostly = HZ; static void vmstat_update(struct work_struct *w) { - refresh_cpu_vm_stats(smp_processor_id()); + refresh_cpu_vm_stats(); schedule_delayed_work(&__get_cpu_var(vmstat_work), round_jiffies_relative(sysctl_stat_interval)); }
From 6b70f7dff8f7ce2f4692afc7d4ef9f73f8c82434 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:21:39 -0700 Subject: [PATCH 075/303] mm, vmalloc: remove useless variable in vmap_block The vbq field in vmap_block isn't used, so remove it. Signed-off-by: Joonsoo Kim Reviewed-by: Wanpeng Li Acked-by: Johannes Weiner Acked-by: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 13a54953a273..d23c43258727 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -752,7 +752,6 @@ struct vmap_block_queue { struct vmap_block { spinlock_t lock; struct vmap_area *va; - struct vmap_block_queue *vbq; unsigned long free, dirty; DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); struct list_head free_list; @@ -830,7 +829,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) radix_tree_preload_end(); vbq = &get_cpu_var(vmap_block_queue); - vb->vbq = vbq; spin_lock(&vbq->lock); list_add_rcu(&vb->free_list, &vbq->free); spin_unlock(&vbq->lock);
From b136be5e0b6e8e3e4dcb6722b51bb35199b06810 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:21:40 -0700 Subject: [PATCH 076/303] mm, vmalloc: use well-defined find_last_bit() func Our intention here is to find the last set bit within the region to flush. There is a well-defined function, find_last_bit(), for exactly this purpose, and its performance may be slightly better than the current implementation. So switch to it.
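The shape of the new computation can be verified in isolation. A userspace sketch with toy single-word bitmap helpers (the real code uses the kernel's find_first_bit()/find_last_bit() over VMAP_BBMAP_BITS):

	#include <stdio.h>

	/* Toy stand-ins for the kernel bitmap helpers, single word only. */
	static int first_bit(unsigned long map, int nbits)
	{
		for (int i = 0; i < nbits; i++)
			if (map & (1UL << i))
				return i;
		return nbits;
	}

	static int last_bit(unsigned long map, int nbits)
	{
		for (int i = nbits - 1; i >= 0; i--)
			if (map & (1UL << i))
				return i;
		return nbits;
	}

	int main(void)
	{
		unsigned long dirty = 0x0f30;	/* bits 4-5 and 8-11 set */
		int nbits = 16;

		int i = first_bit(dirty, nbits);
		if (i < nbits) {
			int j = last_bit(dirty, nbits) + 1; /* exclusive end */
			/* one [start, end) flush range, no scan loop */
			printf("flush range [%d, %d)\n", i, j); /* [4, 12) */
		}
		return 0;
	}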
Signed-off-by: Joonsoo Kim Reviewed-by: Wanpeng Li Acked-by: Johannes Weiner Acked-by: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d23c43258727..93d3182c3300 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1016,15 +1016,16 @@ void vm_unmap_aliases(void) rcu_read_lock(); list_for_each_entry_rcu(vb, &vbq->free, free_list) { - int i; + int i, j; spin_lock(&vb->lock); i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS); - while (i < VMAP_BBMAP_BITS) { + if (i < VMAP_BBMAP_BITS) { unsigned long s, e; - int j; - j = find_next_zero_bit(vb->dirty_map, - VMAP_BBMAP_BITS, i); + + j = find_last_bit(vb->dirty_map, + VMAP_BBMAP_BITS); + j = j + 1; /* need exclusive index */ s = vb->va->va_start + (i << PAGE_SHIFT); e = vb->va->va_start + (j << PAGE_SHIFT); @@ -1034,10 +1035,6 @@ void vm_unmap_aliases(void) start = s; if (e > end) end = e; - - i = j; - i = find_next_bit(vb->dirty_map, - VMAP_BBMAP_BITS, i); } spin_unlock(&vb->lock); } From 37b000b640741132eddaa9fbeca1f988139ad7e2 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Wed, 11 Sep 2013 14:21:41 -0700 Subject: [PATCH 077/303] mm/hotplug: remove unnecessary BUG_ON in __offline_pages() I think we can remove "BUG_ON(start_pfn >= end_pfn)" in __offline_pages(), because in memory_block_action() "nr_pages = PAGES_PER_SECTION * sections_per_block" is always greater than 0. memory_block_action() offline_pages() __offline_pages() BUG_ON(start_pfn >= end_pfn) In v2.6.32, If info->length==0, this way may hit this BUG_ON(). acpi_memory_disable_device() remove_memory(info->start_addr, info->length) offline_pages() A later Fujitsu patch renamed this function and the BUG_ON() is unnecessary. Signed-off-by: Xishi Qiu Reviewed-by: Dave Hansen Cc: Toshi Kani Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ca1dd3aa5eee..8e333f953f08 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1472,7 +1472,6 @@ static int __ref __offline_pages(unsigned long start_pfn, struct zone *zone; struct memory_notify arg; - BUG_ON(start_pfn >= end_pfn); /* at least, alignment against pageblock is necessary */ if (!IS_ALIGNED(start_pfn, pageblock_nr_pages)) return -EINVAL; From eee87e1726af8c746f0e15ae6c57a97675f5e960 Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Wed, 11 Sep 2013 14:21:42 -0700 Subject: [PATCH 078/303] mm/zbud: fix some trivial typos in comments Signed-off-by: Jianguo Wu Cc: Seth Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zbud.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/zbud.c b/mm/zbud.c index ad1e781284fd..9451361e6aa7 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -16,7 +16,7 @@ * * zbud works by storing compressed pages, or "zpages", together in pairs in a * single memory page called a "zbud page". The first buddy is "left - * justifed" at the beginning of the zbud page, and the last buddy is "right + * justified" at the beginning of the zbud page, and the last buddy is "right * justified" at the end of the zbud page. 
The benefit is that if either * buddy is freed, the freed buddy space, coalesced with whatever slack space * that existed between the buddies, results in the largest possible free region @@ -243,7 +243,7 @@ void zbud_destroy_pool(struct zbud_pool *pool) * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used * as zbud pool pages. * - * Return: 0 if success and handle is set, otherwise -EINVAL is the size or + * Return: 0 if success and handle is set, otherwise -EINVAL if the size or * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate * a new page. */ From 674470d97958a0ec72f72caf7f6451da40159cc7 Mon Sep 17 00:00:00 2001 From: Joonyoung Shim Date: Wed, 11 Sep 2013 14:21:43 -0700 Subject: [PATCH 079/303] lib/genalloc.c: fix overflow of ending address of memory chunk In struct gen_pool_chunk, end_addr means the end address of memory chunk (inclusive), but in the implementation it is treated as address + size of memory chunk (exclusive), so it points to the address plus one instead of correct ending address. The ending address of memory chunk plus one will cause overflow on the memory chunk including the last address of memory map, e.g. when starting address is 0xFFF00000 and size is 0x100000 on 32bit machine, ending address will be 0x100000000. Use correct ending address like starting address + size - 1. [akpm@linux-foundation.org: add comment to struct gen_pool_chunk:end_addr] Signed-off-by: Joonyoung Shim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/genalloc.h | 4 ++-- lib/genalloc.c | 19 ++++++++++++------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index 661d374aeb2d..f8d41cb1cbe0 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -66,8 +66,8 @@ struct gen_pool_chunk { struct list_head next_chunk; /* next chunk in pool */ atomic_t avail; phys_addr_t phys_addr; /* physical starting address of memory chunk */ - unsigned long start_addr; /* starting address of memory chunk */ - unsigned long end_addr; /* ending address of memory chunk */ + unsigned long start_addr; /* start address of memory chunk */ + unsigned long end_addr; /* end address of memory chunk (inclusive) */ unsigned long bits[0]; /* bitmap for allocating memory chunk */ }; diff --git a/lib/genalloc.c b/lib/genalloc.c index b35cfa9bc3d4..2a39bf62d8c1 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -37,6 +37,11 @@ #include #include +static inline size_t chunk_size(const struct gen_pool_chunk *chunk) +{ + return chunk->end_addr - chunk->start_addr + 1; +} + static int set_bits_ll(unsigned long *addr, unsigned long mask_to_set) { unsigned long val, nval; @@ -188,7 +193,7 @@ int gen_pool_add_virt(struct gen_pool *pool, unsigned long virt, phys_addr_t phy chunk->phys_addr = phys; chunk->start_addr = virt; - chunk->end_addr = virt + size; + chunk->end_addr = virt + size - 1; atomic_set(&chunk->avail, size); spin_lock(&pool->lock); @@ -213,7 +218,7 @@ phys_addr_t gen_pool_virt_to_phys(struct gen_pool *pool, unsigned long addr) rcu_read_lock(); list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) { - if (addr >= chunk->start_addr && addr < chunk->end_addr) { + if (addr >= chunk->start_addr && addr <= chunk->end_addr) { paddr = chunk->phys_addr + (addr - chunk->start_addr); break; } @@ -242,7 +247,7 @@ void gen_pool_destroy(struct gen_pool *pool) chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk); list_del(&chunk->next_chunk); - end_bit = (chunk->end_addr - 
chunk->start_addr) >> order; + end_bit = chunk_size(chunk) >> order; bit = find_next_bit(chunk->bits, end_bit, 0); BUG_ON(bit < end_bit); @@ -283,7 +288,7 @@ unsigned long gen_pool_alloc(struct gen_pool *pool, size_t size) if (size > atomic_read(&chunk->avail)) continue; - end_bit = (chunk->end_addr - chunk->start_addr) >> order; + end_bit = chunk_size(chunk) >> order; retry: start_bit = pool->algo(chunk->bits, end_bit, start_bit, nbits, pool->data); @@ -330,8 +335,8 @@ void gen_pool_free(struct gen_pool *pool, unsigned long addr, size_t size) nbits = (size + (1UL << order) - 1) >> order; rcu_read_lock(); list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) { - if (addr >= chunk->start_addr && addr < chunk->end_addr) { - BUG_ON(addr + size > chunk->end_addr); + if (addr >= chunk->start_addr && addr <= chunk->end_addr) { + BUG_ON(addr + size - 1 > chunk->end_addr); start_bit = (addr - chunk->start_addr) >> order; remain = bitmap_clear_ll(chunk->bits, start_bit, nbits); BUG_ON(remain); @@ -400,7 +405,7 @@ size_t gen_pool_size(struct gen_pool *pool) rcu_read_lock(); list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) - size += chunk->end_addr - chunk->start_addr; + size += chunk_size(chunk); rcu_read_unlock(); return size; } From c33bc315fd921b1179a1d3df5756e0da6fb73944 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Wed, 11 Sep 2013 14:21:44 -0700 Subject: [PATCH 080/303] mm: use zone_end_pfn() instead of zone_start_pfn+spanned_pages Use "zone_end_pfn()" instead of "zone->zone_start_pfn + zone->spanned_pages". Simplify the code, no functional change. [akpm@linux-foundation.org: fix build] Signed-off-by: Xishi Qiu Cc: Cody P Schafer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 12 ++++++------ mm/memory_hotplug.c | 7 ++++--- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 349587bb03e1..358a146fd4da 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -352,7 +352,7 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) struct mem_extent *ext, *cur, *aux; zone_start = zone->zone_start_pfn; - zone_end = zone->zone_start_pfn + zone->spanned_pages; + zone_end = zone_end_pfn(zone); list_for_each_entry(ext, list, hook) if (zone_start <= ext->end) @@ -884,7 +884,7 @@ static unsigned int count_highmem_pages(void) continue; mark_free_pages(zone); - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (saveable_highmem_page(zone, pfn)) n++; @@ -948,7 +948,7 @@ static unsigned int count_data_pages(void) continue; mark_free_pages(zone); - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (saveable_page(zone, pfn)) n++; @@ -1041,7 +1041,7 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) unsigned long max_zone_pfn; mark_free_pages(zone); - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (page_is_saveable(zone, pfn)) memory_bm_set_bit(orig_bm, pfn); @@ -1093,7 +1093,7 @@ void swsusp_free(void) unsigned long pfn, max_zone_pfn; for_each_populated_zone(zone) { - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < 
max_zone_pfn; pfn++) if (pfn_valid(pfn)) { struct page *page = pfn_to_page(pfn); @@ -1755,7 +1755,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm) /* Clear page flags */ for_each_populated_zone(zone) { - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (pfn_valid(pfn)) swsusp_unset_page_free(pfn_to_page(pfn)); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 8e333f953f08..9eadad626d64 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -229,7 +229,7 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn, zone_span_writelock(zone); - old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; + old_zone_end_pfn = zone_end_pfn(zone); if (!zone->spanned_pages || start_pfn < zone->zone_start_pfn) zone->zone_start_pfn = start_pfn; @@ -514,8 +514,9 @@ static int find_biggest_section_pfn(int nid, struct zone *zone, static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, unsigned long end_pfn) { - unsigned long zone_start_pfn = zone->zone_start_pfn; - unsigned long zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; + unsigned long zone_start_pfn = zone->zone_start_pfn; + unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */ + unsigned long zone_end_pfn = z; unsigned long pfn; struct mem_section *ms; int nid = zone_to_nid(zone); From 8080fc038e91265e1002df7cae805fc17bb772fc Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Wed, 11 Sep 2013 14:21:45 -0700 Subject: [PATCH 081/303] mm: use zone_is_empty() instead of if(zone->spanned_pages) Use "zone_is_empty()" instead of "if (zone->spanned_pages)". Simplify the code, no functional change. Signed-off-by: Xishi Qiu Cc: Cody P Schafer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 6 +++--- mm/page_alloc.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9eadad626d64..4f5df61d6016 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -230,7 +230,7 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn, zone_span_writelock(zone); old_zone_end_pfn = zone_end_pfn(zone); - if (!zone->spanned_pages || start_pfn < zone->zone_start_pfn) + if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn) zone->zone_start_pfn = start_pfn; zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - @@ -305,7 +305,7 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, goto out_fail; /* use start_pfn for z1's start_pfn if z1 is empty */ - if (z1->spanned_pages) + if (!zone_is_empty(z1)) z1_start_pfn = z1->zone_start_pfn; else z1_start_pfn = start_pfn; @@ -347,7 +347,7 @@ static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, goto out_fail; /* use end_pfn for z2's end_pfn if z2 is empty */ - if (z2->spanned_pages) + if (!zone_is_empty(z2)) z2_end_pfn = zone_end_pfn(z2); else z2_end_pfn = end_pfn; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f885eb827159..7c3f8d7e2d8e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1306,7 +1306,7 @@ void mark_free_pages(struct zone *zone) int order, t; struct list_head *curr; - if (!zone->spanned_pages) + if (zone_is_empty(zone)) return; spin_lock_irqsave(&zone->lock, flags); From 139c2d75b4e81d449d97f1f8188b84529eb56708 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Wed, 11 Sep 2013 14:21:46 -0700 Subject: [PATCH 082/303] mm: use zone_is_initialized() 
instead of if(zone->wait_table) Use "zone_is_initialized()" instead of "if (zone->wait_table)". Simplify the code, no functional change. Signed-off-by: Xishi Qiu Cc: Cody P Schafer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 4f5df61d6016..46b489cacdd8 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -194,7 +194,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat) zone = &pgdat->node_zones[0]; for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) { - if (zone->wait_table) { + if (zone_is_initialized(zone)) { nr_pages = zone->wait_table_hash_nr_entries * sizeof(wait_queue_head_t); nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT;
From 2cad401801978b16ac6e43f10b8d60039670fcbc Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Wed, 11 Sep 2013 14:21:47 -0700 Subject: [PATCH 083/303] readahead: make context readahead more conservative This helps performance on moderately dense random reads on SSD. Transactions-per-second numbers provided by Taobao:

QPS	case
-------------------------------------------------------
7536	disable context readahead totally
w/ patch: 7129	slower size rampup and start RA on the 3rd read
6717	slower size rampup
w/o patch: 5581	unmodified context readahead

Before, readahead would be started whenever page N+1 was read and page N happened to have been read recently. After the patch, we only start readahead when *three* random reads happen to access pages N, N+1, N+2. The probability of this happening is extremely low for pure random reads, unless they are very dense, in which case some readahead is actually deserved. Also start with a smaller readahead window. The impact on interleaved sequential reads should be small, because for a long-running stream the small readahead window ramp-up phase is negligible. The context readahead mainly benefits clustered random reads on HDD, whose seek cost is pretty high. However, as SSDs are increasingly used for random read workloads, it's better for the context readahead to concentrate on interleaved sequential reads. Another SSD random read test from Miao:

# file size: 2GB
# read IO amount: 625MB
sysbench --test=fileio \
	--max-requests=10000 \
	--num-threads=1 \
	--file-num=1 \
	--file-block-size=64K \
	--file-test-mode=rndrd \
	--file-fsync-freq=0 \
	--file-fsync-end=off run

shows the performance of btrfs growing from 69MB/s to 121MB/s, and ext4 from 104MB/s to 121MB/s.
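A compact sketch of the tightened trigger; it mirrors the patched try_context_readahead() but simplifies away count_history_pages() and the offset handling, so treat the details as illustrative:

	/* history:  pages found in cache immediately before the current one
	 * req_size: size of the current read request, in pages */
	static int context_readahead_starts(int history, int req_size,
					    int max, int *ra_size, int *ra_async)
	{
		/* not enough history: still looks like pure random reads */
		if (history <= req_size)
			return 0;

		int size = history * 2;			/* modest ramp-up */
		*ra_size  = size + req_size < max ? size + req_size : max;
		*ra_async = 1;				/* small async window */
		return 1;
	}

For 1-page reads (req_size == 1): reading N+1 after N gives history 1, so no readahead; the third consecutive page N+2 gives history 2, and readahead finally starts.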
Signed-off-by: Wu Fengguang Tested-by: Tao Ma Tested-by: Miao Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/readahead.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index 829a77c62834..e4ed04149785 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -371,10 +371,10 @@ static int try_context_readahead(struct address_space *mapping, size = count_history_pages(mapping, ra, offset, max); /* - * no history pages: + * not enough history pages: * it could be a random read */ - if (!size) + if (size <= req_size) return 0; /* @@ -385,8 +385,8 @@ static int try_context_readahead(struct address_space *mapping, size *= 2; ra->start = offset; - ra->size = get_init_ra_size(size + req_size, max); - ra->async_size = ra->size; + ra->size = min(size + req_size, max); + ra->async_size = 1; return 1; } From 15610c86fa83ff778eb80d3cfaa71d6acceb628a Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:21:48 -0700 Subject: [PATCH 084/303] hugepage: mention libhugetlbfs in doc Explicitly mention/recommend using the libhugetlbfs test cases when changing related kernel code. Developers that are unaware of the project can easily miss this and introduce potential regressions that may or may not be caught by community review. Also do some cleanups that make the document visually easier to view at a first glance. Signed-off-by: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/hugetlbpage.txt | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt index 4ac359b7aa17..bdd4bb97fff7 100644 --- a/Documentation/vm/hugetlbpage.txt +++ b/Documentation/vm/hugetlbpage.txt @@ -165,6 +165,7 @@ which function as described above for the default huge page-sized case. Interaction of Task Memory Policy with Huge Page Allocation/Freeing +=================================================================== Whether huge pages are allocated and freed via the /proc interface or the /sysfs interface using the nr_hugepages_mempolicy attribute, the NUMA @@ -229,6 +230,7 @@ resulting effect on persistent huge page allocation is as follows: of huge pages over all on-lines nodes with memory. Per Node Hugepages Attributes +============================= A subset of the contents of the root huge page control directory in sysfs, described above, will be replicated under each the system device of each @@ -258,6 +260,7 @@ applied, from which node the huge page allocation will be attempted. Using Huge Pages +================ If the user applications are going to request huge pages using mmap system call, then it is required that system administrator mount a file system of @@ -296,20 +299,16 @@ calls, though the mount of filesystem will be required for using mmap calls without MAP_HUGETLB. For an example of how to use mmap with MAP_HUGETLB see map_hugetlb.c. 
-******************************************************************* +Examples +======== -/* - * map_hugetlb: see tools/testing/selftests/vm/map_hugetlb.c - */ +1) map_hugetlb: see tools/testing/selftests/vm/map_hugetlb.c -******************************************************************* +2) hugepage-shm: see tools/testing/selftests/vm/hugepage-shm.c -/* - * hugepage-shm: see tools/testing/selftests/vm/hugepage-shm.c - */ +3) hugepage-mmap: see tools/testing/selftests/vm/hugepage-mmap.c -******************************************************************* - -/* - * hugepage-mmap: see tools/testing/selftests/vm/hugepage-mmap.c - */ +4) The libhugetlbfs (http://libhugetlbfs.sourceforge.net) library provides a + wide range of userspace tools to help with huge page usability, environment + setup, and control. Furthermore it provides useful test cases that should be + used when modifying code to ensure no regressions are introduced. From 27356f54c8c32609ff45b4ed333bb64fb2eef374 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Wed, 11 Sep 2013 14:21:49 -0700 Subject: [PATCH 085/303] mm/hotplug: verify hotplug memory range add_memory() and remove_memory() can only handle a memory range aligned with section. There are problems when an unaligned range is added and then deleted as follows: - add_memory() with an unaligned range succeeds, but __add_pages() called from add_memory() adds a whole section of pages even though a given memory range is less than the section size. - remove_memory() to the added unaligned range hits BUG_ON() in __remove_pages(). This patch changes add_memory() and remove_memory() to check if a given memory range is aligned with section at the beginning. As the result, add_memory() fails with -EINVAL when a given range is unaligned, and does not add such memory range. This prevents remove_memory() to be called with an unaligned range as well. Note that remove_memory() has to use BUG_ON() since this function cannot fail. 
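The alignment test itself is simple arithmetic and can be sanity-checked in userspace. A sketch assuming x86_64-style constants (4KB pages, 128MB sections); the kernel expresses the same test with PAGE_SECTION_MASK and PAGES_PER_SECTION:

	#include <stdio.h>
	#include <stdint.h>

	#define PAGE_SHIFT        12
	#define SECTION_SIZE_BITS 27	/* 128MB sections, as on x86_64 */
	#define PAGES_PER_SECTION (1UL << (SECTION_SIZE_BITS - PAGE_SHIFT))

	static int range_is_section_aligned(uint64_t start, uint64_t size)
	{
		uint64_t start_pfn = start >> PAGE_SHIFT;
		uint64_t nr_pages  = size  >> PAGE_SHIFT;

		return nr_pages && !(start_pfn % PAGES_PER_SECTION) &&
		       !(nr_pages % PAGES_PER_SECTION);
	}

	int main(void)
	{
		/* 128MB at a 128MB boundary passes; a 64MB range is rejected */
		printf("%d\n", range_is_section_aligned(0x100000000ULL, 128ULL << 20));
		printf("%d\n", range_is_section_aligned(0x100000000ULL, 64ULL << 20));
		return 0;
	}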
[akpm@linux-foundation.org: avoid printk warnings] Signed-off-by: Toshi Kani Acked-by: KOSAKI Motohiro Reviewed-by: Tang Chen Reviewed-by: Wanpeng Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 46b489cacdd8..247d66675a91 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1070,6 +1070,23 @@ out: return ret; } +static int check_hotplug_memory_range(u64 start, u64 size) +{ + u64 start_pfn = start >> PAGE_SHIFT; + u64 nr_pages = size >> PAGE_SHIFT; + + /* Memory range must be aligned with section */ + if ((start_pfn & ~PAGE_SECTION_MASK) || + (nr_pages % PAGES_PER_SECTION) || (!nr_pages)) { + pr_err("Section-unaligned hotplug range: start 0x%llx, size 0x%llx\n", + (unsigned long long)start, + (unsigned long long)size); + return -EINVAL; + } + + return 0; +} + /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ int __ref add_memory(int nid, u64 start, u64 size) { @@ -1079,6 +1096,10 @@ int __ref add_memory(int nid, u64 start, u64 size) struct resource *res; int ret; + ret = check_hotplug_memory_range(start, size); + if (ret) + return ret; + lock_memory_hotplug(); res = register_memory_resource(start, size); @@ -1786,6 +1807,8 @@ void __ref remove_memory(int nid, u64 start, u64 size) { int ret; + BUG_ON(check_hotplug_memory_range(start, size)); + lock_memory_hotplug(); /* From 0f1cfe9d0d06fe44c2b310401d2db101968e8c58 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Wed, 11 Sep 2013 14:21:50 -0700 Subject: [PATCH 086/303] mm/hotplug: remove stop_machine() from try_offline_node() lock_device_hotplug() serializes hotplug & online/offline operations. The lock is held in common sysfs online/offline interfaces and ACPI hotplug code paths. And here are the code paths: - CPU & Mem online/offline via sysfs online store_online()->lock_device_hotplug() - Mem online via sysfs state: store_mem_state()->lock_device_hotplug() - ACPI CPU & Mem hot-add: acpi_scan_bus_device_check()->lock_device_hotplug() - ACPI CPU & Mem hot-delete: acpi_scan_hot_remove()->lock_device_hotplug() try_offline_node() off-lines a node if all memory sections and cpus are removed on the node. It is called from acpi_processor_remove() and acpi_memory_remove_memory()->remove_memory() paths, both of which are in the ACPI hotplug code. try_offline_node() calls stop_machine() to stop all cpus while checking all cpu status with the assumption that the caller is not protected from CPU hotplug or CPU online/offline operations. However, the caller is always serialized with lock_device_hotplug(). Also, the code needs to be properly serialized with a lock, not by stopping all cpus at a random place with stop_machine(). This patch removes the use of stop_machine() in try_offline_node() and adds comments to try_offline_node() and remove_memory() that lock_device_hotplug() is required. Signed-off-by: Toshi Kani Acked-by: Rafael J. 
Wysocki Cc: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Cc: Tang Chen Cc: Yasuaki Ishimatsu Cc: Wanpeng Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 247d66675a91..d595606728f9 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1695,9 +1695,8 @@ static int is_memblock_offlined_cb(struct memory_block *mem, void *arg) return ret; } -static int check_cpu_on_node(void *data) +static int check_cpu_on_node(pg_data_t *pgdat) { - struct pglist_data *pgdat = data; int cpu; for_each_present_cpu(cpu) { @@ -1712,10 +1711,9 @@ static int check_cpu_on_node(void *data) return 0; } -static void unmap_cpu_on_node(void *data) +static void unmap_cpu_on_node(pg_data_t *pgdat) { #ifdef CONFIG_ACPI_NUMA - struct pglist_data *pgdat = data; int cpu; for_each_possible_cpu(cpu) @@ -1724,10 +1722,11 @@ static void unmap_cpu_on_node(void *data) #endif } -static int check_and_unmap_cpu_on_node(void *data) +static int check_and_unmap_cpu_on_node(pg_data_t *pgdat) { - int ret = check_cpu_on_node(data); + int ret; + ret = check_cpu_on_node(pgdat); if (ret) return ret; @@ -1736,11 +1735,18 @@ static int check_and_unmap_cpu_on_node(void *data) * the cpu_to_node() now. */ - unmap_cpu_on_node(data); + unmap_cpu_on_node(pgdat); return 0; } -/* offline the node if all memory sections of this node are removed */ +/** + * try_offline_node + * + * Offline a node if all memory sections and cpus of the node are removed. + * + * NOTE: The caller must call lock_device_hotplug() to serialize hotplug + * and online/offline operations before this call. + */ void try_offline_node(int nid) { pg_data_t *pgdat = NODE_DATA(nid); @@ -1766,7 +1772,7 @@ void try_offline_node(int nid) return; } - if (stop_machine(check_and_unmap_cpu_on_node, pgdat, NULL)) + if (check_and_unmap_cpu_on_node(pgdat)) return; /* @@ -1803,6 +1809,13 @@ void try_offline_node(int nid) } EXPORT_SYMBOL(try_offline_node); +/** + * remove_memory + * + * NOTE: The caller must call lock_device_hotplug() to serialize hotplug + * and online/offline operations before this call, as required by + * try_offline_node(). + */ void __ref remove_memory(int nid, u64 start, u64 size) { int ret; From 4ef91848043679b272a1a5b8e2879acf696ba9e2 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:21:51 -0700 Subject: [PATCH 087/303] mm, hugetlb: protect reserved pages when soft offlining a hugepage Don't use the reserve pool when soft offlining a hugepage. Check we have free pages outside the reserve pool before we dequeue the huge page. Otherwise, we can steal other's reserve page. 
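As a worked example with illustrative numbers: if free_huge_pages is 2 and resv_huge_pages is 2, every free hugepage is already promised to an existing reservation, so the guard below evaluates false, the dequeue is skipped, and allocation falls back to alloc_buddy_huge_page(). The guard in isolation:

	/* Only dip into the free list when some free hugepages are not
	 * backing a reservation. */
	static int can_dequeue_unreserved(long free_huge_pages,
					  long resv_huge_pages)
	{
		return free_huge_pages - resv_huge_pages > 0;
	}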
Signed-off-by: Joonsoo Kim Reviewed-by: Aneesh Kumar Cc: Naoya Horiguchi Reviewed-by: Davidlohr Bueso Cc: David Gibson Cc: Wanpeng Li Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5b084c7b34c6..583db1948145 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -955,10 +955,11 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) */ struct page *alloc_huge_page_node(struct hstate *h, int nid) { - struct page *page; + struct page *page = NULL; spin_lock(&hugetlb_lock); - page = dequeue_huge_page_node(h, nid); + if (h->free_huge_pages - h->resv_huge_pages > 0) + page = dequeue_huge_page_node(h, nid); spin_unlock(&hugetlb_lock); if (!page)
From f522c3ac00a49128115f99a5fcb95a447601c1c3 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:21:53 -0700 Subject: [PATCH 088/303] mm, hugetlb: change variable name reservations to resv 'reservations' is a long name for a variable, and we already use 'resv_map' to refer to 'struct resv_map' elsewhere. To reduce confusion and improve readability, rename it. Signed-off-by: Joonsoo Kim Reviewed-by: Aneesh Kumar Cc: Naoya Horiguchi Reviewed-by: Davidlohr Bueso Cc: David Gibson Cc: Wanpeng Li Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 583db1948145..204550ae29c8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1115,9 +1115,9 @@ static long vma_needs_reservation(struct hstate *h, } else { long err; pgoff_t idx = vma_hugecache_offset(h, vma, addr); - struct resv_map *reservations = vma_resv_map(vma); + struct resv_map *resv = vma_resv_map(vma); - err = region_chg(&reservations->regions, idx, idx + 1); + err = region_chg(&resv->regions, idx, idx + 1); if (err < 0) return err; return 0; @@ -1135,10 +1135,10 @@ static void vma_commit_reservation(struct hstate *h, } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { pgoff_t idx = vma_hugecache_offset(h, vma, addr); - struct resv_map *reservations = vma_resv_map(vma); + struct resv_map *resv = vma_resv_map(vma); /* Mark this page used in the map. */ - region_add(&reservations->regions, idx, idx + 1); + region_add(&resv->regions, idx, idx + 1); } } @@ -2188,7 +2188,7 @@ out: static void hugetlb_vm_op_open(struct vm_area_struct *vma) { - struct resv_map *reservations = vma_resv_map(vma); + struct resv_map *resv = vma_resv_map(vma); /* * This new VMA should share its siblings reservation map if present. @@ -2198,34 +2198,34 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) * after this open call completes. It is therefore safe to take a * new reference here without additional locking.
*/ - if (reservations) - kref_get(&reservations->refs); + if (resv) + kref_get(&resv->refs); } static void resv_map_put(struct vm_area_struct *vma) { - struct resv_map *reservations = vma_resv_map(vma); + struct resv_map *resv = vma_resv_map(vma); - if (!reservations) + if (!resv) return; - kref_put(&reservations->refs, resv_map_release); + kref_put(&resv->refs, resv_map_release); } static void hugetlb_vm_op_close(struct vm_area_struct *vma) { struct hstate *h = hstate_vma(vma); - struct resv_map *reservations = vma_resv_map(vma); + struct resv_map *resv = vma_resv_map(vma); struct hugepage_subpool *spool = subpool_vma(vma); unsigned long reserve; unsigned long start; unsigned long end; - if (reservations) { + if (resv) { start = vma_hugecache_offset(h, vma, vma->vm_start); end = vma_hugecache_offset(h, vma, vma->vm_end); reserve = (end - start) - - region_count(&reservations->regions, start, end); + region_count(&resv->regions, start, end); resv_map_put(vma);
From 8bb3f12e7d4f7b043a7c5aa3831e72041e80dc4a Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:21:54 -0700 Subject: [PATCH 089/303] mm, hugetlb: fix subpool accounting handling If we allocate a hugepage with avoid_reserve, we don't dequeue a reserved one, so we should still charge the subpool counter in the avoid_reserve case. This patch implements that. Signed-off-by: Joonsoo Kim Cc: Aneesh Kumar Cc: Naoya Horiguchi Cc: Davidlohr Bueso Cc: David Gibson Cc: Wanpeng Li Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 204550ae29c8..dec5772c8c5c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1164,13 +1164,14 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, chg = vma_needs_reservation(h, vma, addr); if (chg < 0) return ERR_PTR(-ENOMEM); - if (chg) - if (hugepage_subpool_get_pages(spool, chg)) + if (chg || avoid_reserve) + if (hugepage_subpool_get_pages(spool, 1)) return ERR_PTR(-ENOSPC); ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); if (ret) { - hugepage_subpool_put_pages(spool, chg); + if (chg || avoid_reserve) + hugepage_subpool_put_pages(spool, 1); return ERR_PTR(-ENOSPC); } spin_lock(&hugetlb_lock); @@ -1182,7 +1183,8 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); - hugepage_subpool_put_pages(spool, chg); + if (chg || avoid_reserve) + hugepage_subpool_put_pages(spool, 1); return ERR_PTR(-ENOSPC); } spin_lock(&hugetlb_lock);
From 5944d0116c773319a48ea6812d1891aa6d0bbbbf Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:21:55 -0700 Subject: [PATCH 090/303] mm, hugetlb: remove useless check about mapping type is_vma_resv_set(vma, HPAGE_RESV_OWNER) implies that this mapping is private, so we don't need to check whether the mapping is shared or not. This patch is just a clean-up. Signed-off-by: Joonsoo Kim Cc: Aneesh Kumar Cc: Naoya Horiguchi Reviewed-by: Davidlohr Bueso Cc: David Gibson Cc: Wanpeng Li Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dec5772c8c5c..f6347ec4fd0a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2564,8 +2564,7 @@ retry_avoidcopy: * at the time of fork() could consume its reserves on COW instead * of the full address range.
*/ - if (!(vma->vm_flags & VM_MAYSHARE) && - is_vma_resv_set(vma, HPAGE_RESV_OWNER) && + if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && old_page != pagecache_page) outside_reserve = 1;
From 8312034f3604bc0339c40545c538116f4ddad152 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:21:57 -0700 Subject: [PATCH 091/303] mm, hugetlb: grab a page_table_lock after page_cache_release We don't need to hold the page_table_lock while we release a page, so defer re-taking the page_table_lock until after the pages are released. Signed-off-by: Joonsoo Kim Reviewed-by: Naoya Horiguchi Cc: Aneesh Kumar Reviewed-by: Davidlohr Bueso Cc: David Gibson Cc: Wanpeng Li Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f6347ec4fd0a..5bf6468a8862 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2647,10 +2647,11 @@ retry_avoidcopy: } spin_unlock(&mm->page_table_lock); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - /* Caller expects lock to be held */ - spin_lock(&mm->page_table_lock); page_cache_release(new_page); page_cache_release(old_page); + + /* Caller expects lock to be held */ + spin_lock(&mm->page_table_lock); return 0; }
From 07443a85ad90c7b62fbe11dcd3d6a1de1e10516f Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Wed, 11 Sep 2013 14:21:58 -0700 Subject: [PATCH 092/303] mm, hugetlb: return a reserved page to a reserved pool if failed If we fail with a reserved page, just calling put_page() is not sufficient, because put_page() invokes free_huge_page() as its last step, which doesn't know whether the page came from the reserved pool, so it does nothing with the reserve count. This leaves the reserve count lower than it should be, because the count was already decremented in dequeue_huge_page_vma(). This patch fixes the situation.
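The bookkeeping can be pictured with a minimal sketch; the names are stand-ins (in the patch the tag is the page's Private flag and the counter is h->resv_huge_pages):

	struct hpage {
		int from_reserve;	/* stands in for PagePrivate */
	};

	static long resv_huge_pages;

	/* Dequeue from the reserve: tag the page and drop the counter. */
	static void dequeue_reserved(struct hpage *page)
	{
		page->from_reserve = 1;
		resv_huge_pages--;
	}

	/* Final free: if the tag is still set, the fault never completed
	 * (mapping failed), so give the reservation back. */
	static void free_huge_page(struct hpage *page)
	{
		if (page->from_reserve) {
			page->from_reserve = 0;
			resv_huge_pages++;
		}
	}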
Signed-off-by: Joonsoo Kim Cc: Aneesh Kumar Cc: Naoya Horiguchi Cc: Davidlohr Bueso Cc: David Gibson Cc: Wanpeng Li Cc: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5bf6468a8862..06315560bd23 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -572,6 +572,7 @@ retry_cpuset: if (!vma_has_reserves(vma, chg)) break; + SetPagePrivate(page); h->resv_huge_pages--; break; } @@ -629,15 +630,20 @@ static void free_huge_page(struct page *page) int nid = page_to_nid(page); struct hugepage_subpool *spool = (struct hugepage_subpool *)page_private(page); + bool restore_reserve; set_page_private(page, 0); page->mapping = NULL; BUG_ON(page_count(page)); BUG_ON(page_mapcount(page)); + restore_reserve = PagePrivate(page); spin_lock(&hugetlb_lock); hugetlb_cgroup_uncharge_page(hstate_index(h), pages_per_huge_page(h), page); + if (restore_reserve) + h->resv_huge_pages++; + if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { /* remove the page from active list */ list_del(&page->lru); @@ -2636,6 +2642,8 @@ retry_avoidcopy: spin_lock(&mm->page_table_lock); ptep = huge_pte_offset(mm, address & huge_page_mask(h)); if (likely(pte_same(huge_ptep_get(ptep), pte))) { + ClearPagePrivate(new_page); + /* Break COW */ huge_ptep_clear_flush(vma, address, ptep); set_huge_pte_at(mm, address, ptep, @@ -2747,6 +2755,7 @@ retry: goto retry; goto out; } + ClearPagePrivate(page); spin_lock(&inode->i_lock); inode->i_blocks += blocks_per_huge_page(h); @@ -2793,8 +2802,10 @@ retry: if (!huge_pte_none(huge_ptep_get(ptep))) goto backout; - if (anon_rmap) + if (anon_rmap) { + ClearPagePrivate(page); hugepage_add_new_anon_rmap(page, vma, address); + } else page_dup_rmap(page); new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
From 31caf665e666b51fe36efd1e54031ed29e86c0b4 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Sep 2013 14:21:59 -0700 Subject: [PATCH 093/303] mm: migrate: make core migration code aware of hugepage Currently hugepage migration is available only for soft offlining, but it's also useful for some other users of page migration (clearly so, because hugepage users should be able to enjoy the benefits of mempolicy and memory hotplug.) So this patchset tries to extend such users to support hugepage migration. The target of this patchset is to enable hugepage migration for NUMA related system calls (migrate_pages(2), move_pages(2), and mbind(2)), and memory hotplug. This patchset does not add hugepage migration for memory compaction, because users of memory compaction mainly expect to construct thp by arranging raw pages, and there's little or no need to compact hugepages. CMA, another user of page migration, could benefit from hugepage migration, but support is not enabled for now (simply for lack of testing and CMA expertise.) Migration of non-pmd-based hugepages (for example, 1GB hugepages on x86_64, or hugepages on architectures like ia64) is not enabled for now (again, for lack of testing.) As for how these are achieved, I extended the API (migrate_pages()) to handle hugepages (patches 1 and 2) and adjusted the code of each caller to check for and collect movable hugepages (patches 3-7). The remaining two patches are miscellaneous fixes to avoid unexpected behavior: patch 8 makes sure that we only migrate pmd-based hugepages, and patch 9 chooses an appropriate zone for hugepage allocation.
My testing is mainly functional, simply kicking hugepage migration via each entry point and confirming that the migration is done correctly. Test code is available here: git://github.com/Naoya-Horiguchi/test_hugepage_migration_extension.git And I always run the libhugetlbfs tests when changing hugetlbfs code. With this patchset, no regression was found in the tests. This patch (of 9): Before enabling each user of page migration to support hugepage, this patch enables the list of pages for migration to link not only LRU pages, but also hugepages. As a result, putback_movable_pages() and migrate_pages() can handle both LRU pages and hugepages. Signed-off-by: Naoya Horiguchi Acked-by: Andi Kleen Reviewed-by: Wanpeng Li Acked-by: Hillf Danton Cc: Mel Gorman Cc: Hugh Dickins Cc: KOSAKI Motohiro Cc: Michal Hocko Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 4 ++++ mm/hugetlb.c | 23 ++++++++++++++++++++++- mm/migrate.c | 10 +++++++++- 3 files changed, 35 insertions(+), 2 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index c2b1801a160b..bc8d8370cd0d 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -66,6 +66,8 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to, vm_flags_t vm_flags); void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); int dequeue_hwpoisoned_huge_page(struct page *page); +bool isolate_huge_page(struct page *page, struct list_head *list); +void putback_active_hugepage(struct page *page); void copy_huge_page(struct page *dst, struct page *src); #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE @@ -134,6 +136,8 @@ static inline int dequeue_hwpoisoned_huge_page(struct page *page) { return 0; } +#define isolate_huge_page(p, l) false +#define putback_active_hugepage(p) do {} while (0) static inline void copy_huge_page(struct page *dst, struct page *src) { } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 06315560bd23..e51723866fb1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -48,7 +48,8 @@ static unsigned long __initdata default_hstate_max_huge_pages; static unsigned long __initdata default_hstate_size; /* - * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages + * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, + * free_huge_pages, and surplus_huge_pages.
*/ DEFINE_SPINLOCK(hugetlb_lock); @@ -3422,3 +3423,23 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage) return ret; } #endif + +bool isolate_huge_page(struct page *page, struct list_head *list) +{ + VM_BUG_ON(!PageHead(page)); + if (!get_page_unless_zero(page)) + return false; + spin_lock(&hugetlb_lock); + list_move_tail(&page->lru, list); + spin_unlock(&hugetlb_lock); + return true; +} + +void putback_active_hugepage(struct page *page) +{ + VM_BUG_ON(!PageHead(page)); + spin_lock(&hugetlb_lock); + list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist); + spin_unlock(&hugetlb_lock); + put_page(page); +} diff --git a/mm/migrate.c b/mm/migrate.c index 6f0c24438bba..b44a067fee10 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -100,6 +100,10 @@ void putback_movable_pages(struct list_head *l) struct page *page2; list_for_each_entry_safe(page, page2, l, lru) { + if (unlikely(PageHuge(page))) { + putback_active_hugepage(page); + continue; + } list_del(&page->lru); dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); @@ -1025,7 +1029,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, list_for_each_entry_safe(page, page2, from, lru) { cond_resched(); - rc = unmap_and_move(get_new_page, private, + if (PageHuge(page)) + rc = unmap_and_move_huge_page(get_new_page, + private, page, pass > 2, mode); + else + rc = unmap_and_move(get_new_page, private, page, pass > 2, mode); switch(rc) { From b8ec1cee5a4375c1244b85709138a2eac2d89cb6 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Sep 2013 14:22:01 -0700 Subject: [PATCH 094/303] mm: soft-offline: use migrate_pages() instead of migrate_huge_page() Currently migrate_huge_page() takes a pointer to a hugepage to be migrated as an argument, instead of taking a pointer to the list of hugepages to be migrated. This behavior was introduced in commit 189ebff28 ("hugetlb: simplify migrate_huge_page()"), and was OK because until now hugepage migration is enabled only for soft-offlining which migrates only one hugepage in a single call. But the situation will change in the later patches in this series which enable other users of page migration to support hugepage migration. They can kick migration for both of normal pages and hugepages in a single call, so we need to go back to original implementation which uses linked lists to collect the hugepages to be migrated. With this patch, soft_offline_huge_page() switches to use migrate_pages(), and migrate_huge_page() is not used any more. So let's remove it. 
Signed-off-by: Naoya Horiguchi Acked-by: Andi Kleen Reviewed-by: Wanpeng Li Acked-by: Hillf Danton Cc: Mel Gorman Cc: Hugh Dickins Cc: KOSAKI Motohiro Cc: Michal Hocko Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 5 ----- mm/memory-failure.c | 15 ++++++++++++--- mm/migrate.c | 28 ++-------------------------- 3 files changed, 14 insertions(+), 34 deletions(-) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index a405d3dc0f61..6fe521420631 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -41,8 +41,6 @@ extern int migrate_page(struct address_space *, struct page *, struct page *, enum migrate_mode); extern int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, enum migrate_mode mode, int reason); -extern int migrate_huge_page(struct page *, new_page_t x, - unsigned long private, enum migrate_mode mode); extern int fail_migrate_page(struct address_space *, struct page *, struct page *); @@ -62,9 +60,6 @@ static inline void putback_movable_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, enum migrate_mode mode, int reason) { return -ENOSYS; } -static inline int migrate_huge_page(struct page *page, new_page_t x, - unsigned long private, enum migrate_mode mode) - { return -ENOSYS; } static inline int migrate_prep(void) { return -ENOSYS; } static inline int migrate_prep_local(void) { return -ENOSYS; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index d84c5e5331bb..e05ed31c0f61 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1470,6 +1470,7 @@ static int soft_offline_huge_page(struct page *page, int flags) int ret; unsigned long pfn = page_to_pfn(page); struct page *hpage = compound_head(page); + LIST_HEAD(pagelist); /* * This double-check of PageHWPoison is to avoid the race with @@ -1485,12 +1486,20 @@ static int soft_offline_huge_page(struct page *page, int flags) unlock_page(hpage); /* Keep page count to indicate a given hugepage is isolated. */ - ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, - MIGRATE_SYNC); - put_page(hpage); + list_move(&hpage->lru, &pagelist); + ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, + MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { pr_info("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); + /* + * We know that soft_offline_huge_page() tries to migrate + * only one hugepage pointed to by hpage, so we need not + * run through the pagelist here. 
+ */ + putback_active_hugepage(hpage); + if (ret > 0) + ret = -EIO; } else { set_page_hwpoison_huge_page(hpage); dequeue_hwpoisoned_huge_page(hpage); diff --git a/mm/migrate.c b/mm/migrate.c index b44a067fee10..3ec47d3394c8 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -979,6 +979,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, unlock_page(hpage); out: + if (rc != -EAGAIN) + putback_active_hugepage(hpage); put_page(new_hpage); if (result) { if (rc) @@ -1066,32 +1068,6 @@ out: return rc; } -int migrate_huge_page(struct page *hpage, new_page_t get_new_page, - unsigned long private, enum migrate_mode mode) -{ - int pass, rc; - - for (pass = 0; pass < 10; pass++) { - rc = unmap_and_move_huge_page(get_new_page, private, - hpage, pass > 2, mode); - switch (rc) { - case -ENOMEM: - goto out; - case -EAGAIN: - /* try again */ - cond_resched(); - break; - case MIGRATEPAGE_SUCCESS: - goto out; - default: - rc = -EIO; - goto out; - } - } -out: - return rc; -} - #ifdef CONFIG_NUMA /* * Move a list of individual pages From e2d8cf405525d83e6ca42969be460f94b0339798 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Sep 2013 14:22:03 -0700 Subject: [PATCH 095/303] migrate: add hugepage migration code to migrate_pages() Extend check_range() to handle vma with VM_HUGETLB set. We will be able to migrate hugepage with migrate_pages(2) after applying the enablement patch which comes later in this series. Note that for larger hugepages (covered by pud entries, 1GB for x86_64 for example), we simply skip it now. Note that using pmd_huge/pud_huge assumes that hugepages are pointed to by pmd/pud. This is not true in some architectures implementing hugepage with other mechanisms like ia64, but it's OK because pmd_huge/pud_huge simply return 0 in such arch and page walker simply ignores such hugepages. Signed-off-by: Naoya Horiguchi Acked-by: Andi Kleen Reviewed-by: Wanpeng Li Acked-by: Hillf Danton Cc: Mel Gorman Cc: Hugh Dickins Cc: KOSAKI Motohiro Cc: Michal Hocko Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 44 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 27022ca890f8..4626be621e74 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -515,6 +515,30 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, return addr != end; } +static void check_hugetlb_pmd_range(struct vm_area_struct *vma, pmd_t *pmd, + const nodemask_t *nodes, unsigned long flags, + void *private) +{ +#ifdef CONFIG_HUGETLB_PAGE + int nid; + struct page *page; + + spin_lock(&vma->vm_mm->page_table_lock); + page = pte_page(huge_ptep_get((pte_t *)pmd)); + nid = page_to_nid(page); + if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) + goto unlock; + /* With MPOL_MF_MOVE, we migrate only unshared hugepage. 
*/ + if (flags & (MPOL_MF_MOVE_ALL) || + (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) + isolate_huge_page(page, private); +unlock: + spin_unlock(&vma->vm_mm->page_table_lock); +#else + BUG(); +#endif +} + static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, @@ -526,6 +550,13 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + if (!pmd_present(*pmd)) + continue; + if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) { + check_hugetlb_pmd_range(vma, pmd, nodes, + flags, private); + continue; + } split_huge_page_pmd(vma, addr, pmd); if (pmd_none_or_trans_huge_or_clear_bad(pmd)) continue; @@ -547,6 +578,8 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); + if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) + continue; if (pud_none_or_clear_bad(pud)) continue; if (check_pmd_range(vma, pud, addr, next, nodes, @@ -638,9 +671,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, return ERR_PTR(-EFAULT); } - if (is_vm_hugetlb_page(vma)) - goto next; - if (flags & MPOL_MF_LAZY) { change_prot_numa(vma, start, endvma); goto next; } @@ -993,7 +1023,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, static struct page *new_node_page(struct page *page, unsigned long node, int **x) { - return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); + if (PageHuge(page)) + return alloc_huge_page_node(page_hstate(compound_head(page)), + node); + else + return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); } /* @@ -1023,7 +1057,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, err = migrate_pages(&pagelist, new_node_page, dest, MIGRATE_SYNC, MR_SYSCALL); if (err) - putback_lru_pages(&pagelist); + putback_movable_pages(&pagelist); } return err; From e632a938d914d271bec26e570d36c755a1e35e4c Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Sep 2013 14:22:04 -0700 Subject: [PATCH 096/303] mm: migrate: add hugepage migration code to move_pages() Extend move_pages() to handle vmas with VM_HUGETLB set. We will be able to migrate hugepages with move_pages(2) after applying the enablement patch which comes later in this series. We avoid taking a refcount on the tail pages of a hugepage because, unlike THP, a hugepage is not split, so we need not care about races with splitting. And migration of larger (1GB for x86_64) hugepages is not enabled.
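For illustration only (this sketch is not part of the patch; the helper name, node number, and the assumption that addr points into an existing SHM_HUGETLB or hugetlbfs mapping are all hypothetical), the userspace side that this series works towards looks roughly like:

	#include <numaif.h>	/* move_pages(2); link with -lnuma */
	#include <stdio.h>

	/* Hypothetical helper: move the hugepage backing addr to node 1. */
	static int move_one_hugepage(void *addr)
	{
		void *pages[1] = { addr };	/* hugepage-aligned address */
		int nodes[1] = { 1 };		/* assumed destination node */
		int status[1];

		if (move_pages(0 /* self */, 1, pages, nodes, status,
			       MPOL_MF_MOVE) < 0) {
			perror("move_pages");
			return -1;
		}
		printf("page is now on node %d\n", status[0]);
		return 0;
	}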
Signed-off-by: Naoya Horiguchi Acked-by: Andi Kleen Reviewed-by: Wanpeng Li Cc: Hillf Danton Cc: Mel Gorman Cc: Hugh Dickins Cc: KOSAKI Motohiro Cc: Michal Hocko Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 17 +++++++++++++++-- mm/migrate.c | 13 +++++++++++-- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index c1c6d59b2b03..2b73dbde2274 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1481,7 +1481,8 @@ struct page *follow_page_mask(struct vm_area_struct *vma, if (pud_none(*pud)) goto no_page_table; if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { - BUG_ON(flags & FOLL_GET); + if (flags & FOLL_GET) + goto out; page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); goto out; } @@ -1492,8 +1493,20 @@ struct page *follow_page_mask(struct vm_area_struct *vma, if (pmd_none(*pmd)) goto no_page_table; if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { - BUG_ON(flags & FOLL_GET); page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); + if (flags & FOLL_GET) { + /* + * Refcount on tail pages are not well-defined and + * shouldn't be taken. The caller should handle a NULL + * return when trying to follow tail pages. + */ + if (PageHead(page)) + get_page(page); + else { + page = NULL; + goto out; + } + } goto out; } if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) diff --git a/mm/migrate.c b/mm/migrate.c index 3ec47d3394c8..d3137375fa80 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1092,7 +1092,11 @@ static struct page *new_page_node(struct page *p, unsigned long private, *result = &pm->status; - return alloc_pages_exact_node(pm->node, + if (PageHuge(p)) + return alloc_huge_page_node(page_hstate(compound_head(p)), + pm->node); + else + return alloc_pages_exact_node(pm->node, GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); } @@ -1152,6 +1156,11 @@ static int do_move_page_to_node_array(struct mm_struct *mm, !migrate_all) goto put_and_set; + if (PageHuge(page)) { + isolate_huge_page(page, &pagelist); + goto put_and_set; + } + err = isolate_lru_page(page); if (!err) { list_add_tail(&page->lru, &pagelist); @@ -1174,7 +1183,7 @@ set_status: err = migrate_pages(&pagelist, new_page_node, (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); if (err) - putback_lru_pages(&pagelist); + putback_movable_pages(&pagelist); } up_read(&mm->mmap_sem); From 74060e4d78795c7c43805133cb717d82533d4e0d Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Sep 2013 14:22:06 -0700 Subject: [PATCH 097/303] mm: mbind: add hugepage migration code to mbind() Extend do_mbind() to handle vma with VM_HUGETLB set. We will be able to migrate hugepage with mbind(2) after applying the enablement patch which comes later in this series. 
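As a hedged userspace sketch (not part of the patch; the helper name and node choice are assumptions), this is the kind of mbind(2) call that becomes meaningful for hugetlbfs mappings once the enablement patch later in the series is applied:

	#include <numaif.h>

	/* Hypothetical sketch: rebind a hugetlbfs mapping of length len at
	 * addr to node 0, migrating pages that were already faulted in. */
	static int bind_hugepages_to_node0(void *addr, unsigned long len)
	{
		unsigned long nodemask = 1UL << 0;	/* node 0 only */

		return mbind(addr, len, MPOL_BIND, &nodemask,
			     8 * sizeof(nodemask), MPOL_MF_MOVE);
	}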
Signed-off-by: Naoya Horiguchi Acked-by: Andi Kleen Reviewed-by: Wanpeng Li Acked-by: Hillf Danton Cc: Mel Gorman Cc: Hugh Dickins Cc: KOSAKI Motohiro Cc: Michal Hocko Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 3 +++ mm/hugetlb.c | 14 ++++++++++++++ mm/mempolicy.c | 4 +++- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index bc8d8370cd0d..d1db00790a84 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -265,6 +265,8 @@ struct huge_bootmem_page { }; struct page *alloc_huge_page_node(struct hstate *h, int nid); +struct page *alloc_huge_page_noerr(struct vm_area_struct *vma, + unsigned long addr, int avoid_reserve); /* arch callback */ int __init alloc_bootmem_huge_page(struct hstate *h); @@ -378,6 +380,7 @@ static inline pgoff_t basepage_index(struct page *page) #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; #define alloc_huge_page_node(h, nid) NULL +#define alloc_huge_page_noerr(v, a, r) NULL #define alloc_bootmem_huge_page(h) NULL #define hstate_file(f) NULL #define hstate_sizelog(s) NULL diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e51723866fb1..d37b3b95c439 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1207,6 +1207,20 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, return page; } +/* + * alloc_huge_page()'s wrapper which simply returns the page if allocation + * succeeds, otherwise NULL. This function is called from new_vma_page(), + * where no ERR_VALUE is expected to be returned. + */ +struct page *alloc_huge_page_noerr(struct vm_area_struct *vma, + unsigned long addr, int avoid_reserve) +{ + struct page *page = alloc_huge_page(vma, addr, avoid_reserve); + if (IS_ERR(page)) + page = NULL; + return page; +} + int __weak alloc_bootmem_huge_page(struct hstate *h) { struct huge_bootmem_page *m; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4626be621e74..c7c359213ae1 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1192,6 +1192,8 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int * vma = vma->vm_next; } + if (PageHuge(page)) + return alloc_huge_page_noerr(vma, address, 1); /* * if !vma, alloc_page_vma() will use task or system default policy */ @@ -1302,7 +1304,7 @@ static long do_mbind(unsigned long start, unsigned long len, (unsigned long)vma, MIGRATE_SYNC, MR_MEMPOLICY_MBIND); if (nr_failed) - putback_lru_pages(&pagelist); + putback_movable_pages(&pagelist); } if (nr_failed && (flags & MPOL_MF_STRICT)) From 71ea2efb1e936a127690a0a540b3a6162f95e48a Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Sep 2013 14:22:08 -0700 Subject: [PATCH 098/303] mm: migrate: remove VM_HUGETLB from vma flag check in vma_migratable() Enable hugepage migration from migrate_pages(2), move_pages(2), and mbind(2). 
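For illustration (a minimal sketch, not part of the patch; the helper name and node numbers are assumptions), with VM_HUGETLB vmas now passing vma_migratable(), migrate_pages(2) can move a whole process, hugepages included:

	#include <numaif.h>

	/* Hypothetical sketch: migrate all of task pid's pages from
	 * node 0 to node 1. */
	static long migrate_task_to_node1(int pid)
	{
		unsigned long old_nodes = 1UL << 0;
		unsigned long new_nodes = 1UL << 1;

		return migrate_pages(pid, 8 * sizeof(unsigned long),
				     &old_nodes, &new_nodes);
	}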
Signed-off-by: Naoya Horiguchi Acked-by: Hillf Danton Acked-by: Andi Kleen Reviewed-by: Wanpeng Li Cc: Mel Gorman Cc: Hugh Dickins Cc: KOSAKI Motohiro Cc: Michal Hocko Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index b2f897789838..da6716b9e3fe 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -173,7 +173,7 @@ extern int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol); /* Check if a vma is migratable */ static inline int vma_migratable(struct vm_area_struct *vma) { - if (vma->vm_flags & (VM_IO | VM_HUGETLB | VM_PFNMAP)) + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) return 0; /* * Migration allocates pages in the highest zone. If we cannot From c8721bbbdd36382de51cd6b7a56322e0acca2414 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Sep 2013 14:22:09 -0700 Subject: [PATCH 099/303] mm: memory-hotplug: enable memory hotplug to handle hugepage Until now we couldn't offline memory blocks which contain hugepages because a hugepage was considered an unmovable page. But now, with this patch series, a hugepage has become movable, so by using hugepage migration we can offline such memory blocks. What's different from other users of hugepage migration is that we need to decompose all the hugepages inside the target memory block into free buddy pages after hugepage migration, because otherwise free hugepages remaining in the memory block get in the way of memory offlining. For this reason we introduce the new functions dissolve_free_huge_page() and dissolve_free_huge_pages(). Other than that, what this patch does is straightforward: it adds hugepage migration code, that is, hugepage handling to the functions which scan over pfns and collect hugepages to be migrated, and a hugepage allocation function to alloc_migrate_target(). As for larger hugepages (1GB for x86_64), it's not easy to do hotremove over them because they are larger than a memory block. So for now we simply leave them to fail as-is.
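To illustrate the user-visible effect (a sketch only; the block number is made up and the helper is hypothetical, but the sysfs state file is the standard memory-hotplug interface), offlining a memory block that holds hugepages can now succeed: in-use hugepages are migrated away and free ones dissolved.

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	/* Hypothetical sketch: request offlining of memory block `block`. */
	static int offline_memory_block(int block)
	{
		char path[64];
		int fd, ret;

		snprintf(path, sizeof(path),
			 "/sys/devices/system/memory/memory%d/state", block);
		fd = open(path, O_WRONLY);
		if (fd < 0)
			return -1;
		ret = write(fd, "offline", 7) == 7 ? 0 : -1;
		close(fd);
		return ret;
	}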
[yongjun_wei@trendmicro.com.cn: remove duplicated include] Signed-off-by: Naoya Horiguchi Acked-by: Andi Kleen Cc: Hillf Danton Cc: Wanpeng Li Cc: Mel Gorman Cc: Hugh Dickins Cc: KOSAKI Motohiro Cc: Michal Hocko Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Signed-off-by: Wei Yongjun Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 6 ++++ mm/hugetlb.c | 71 +++++++++++++++++++++++++++++++++++++++-- mm/memory_hotplug.c | 42 ++++++++++++++++++++---- mm/page_alloc.c | 11 +++++++ mm/page_isolation.c | 14 ++++++++ 5 files changed, 135 insertions(+), 9 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d1db00790a84..2e02c4ed1035 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -68,6 +68,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); int dequeue_hwpoisoned_huge_page(struct page *page); bool isolate_huge_page(struct page *page, struct list_head *list); void putback_active_hugepage(struct page *page); +bool is_hugepage_active(struct page *page); void copy_huge_page(struct page *dst, struct page *src); #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE @@ -138,6 +139,7 @@ static inline int dequeue_hwpoisoned_huge_page(struct page *page) #define isolate_huge_page(p, l) false #define putback_active_hugepage(p) do {} while (0) +#define is_hugepage_active(x) false static inline void copy_huge_page(struct page *dst, struct page *src) { } @@ -377,6 +379,9 @@ static inline pgoff_t basepage_index(struct page *page) return __basepage_index(page); } +extern void dissolve_free_huge_pages(unsigned long start_pfn, + unsigned long end_pfn); + #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; #define alloc_huge_page_node(h, nid) NULL @@ -403,6 +408,7 @@ static inline pgoff_t basepage_index(struct page *page) { return page->index; } +#define dissolve_free_huge_pages(s, e) do {} while (0) #endif /* CONFIG_HUGETLB_PAGE */ #endif /* _LINUX_HUGETLB_H */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d37b3b95c439..fb4293b93fd0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -522,9 +523,15 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid) { struct page *page; - if (list_empty(&h->hugepage_freelists[nid])) + list_for_each_entry(page, &h->hugepage_freelists[nid], lru) + if (!is_migrate_isolate_page(page)) + break; + /* + * if 'non-isolated free hugepage' not found on the list, + * the allocation fails. + */ + if (&h->hugepage_freelists[nid] == &page->lru) return NULL; - page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); list_move(&page->lru, &h->hugepage_activelist); set_page_refcounted(page); h->free_huge_pages--; @@ -878,6 +885,44 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, return ret; } +/* + * Dissolve a given free hugepage into free buddy pages. This function does + * nothing for in-use (including surplus) hugepages. + */ +static void dissolve_free_huge_page(struct page *page) +{ + spin_lock(&hugetlb_lock); + if (PageHuge(page) && !page_count(page)) { + struct hstate *h = page_hstate(page); + int nid = page_to_nid(page); + list_del(&page->lru); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + update_and_free_page(h, page); + } + spin_unlock(&hugetlb_lock); +} + +/* + * Dissolve free hugepages in a given pfn range. Used by memory hotplug to + * make specified memory blocks removable from the system. 
+ * Note that start_pfn should aligned with (minimum) hugepage size. + */ +void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned int order = 8 * sizeof(void *); + unsigned long pfn; + struct hstate *h; + + /* Set scan step to minimum hugepage size */ + for_each_hstate(h) + if (order > huge_page_order(h)) + order = huge_page_order(h); + VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << order)); + for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << order) + dissolve_free_huge_page(pfn_to_page(pfn)); +} + static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) { struct page *page; @@ -3457,3 +3502,25 @@ void putback_active_hugepage(struct page *page) spin_unlock(&hugetlb_lock); put_page(page); } + +bool is_hugepage_active(struct page *page) +{ + VM_BUG_ON(!PageHuge(page)); + /* + * This function can be called for a tail page because the caller, + * scan_movable_pages, scans through a given pfn-range which typically + * covers one memory block. In systems using gigantic hugepage (1GB + * for x86_64,) a hugepage is larger than a memory block, and we don't + * support migrating such large hugepages for now, so return false + * when called for tail pages. + */ + if (PageTail(page)) + return false; + /* + * Refcount of a hwpoisoned hugepages is 1, but they are not active, + * so we should return false for them. + */ + if (unlikely(PageHWPoison(page))) + return false; + return page_count(page) > 0; +} diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index d595606728f9..0eb1a1df649d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -30,6 +30,7 @@ #include #include #include +#include #include @@ -1230,10 +1231,12 @@ static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) } /* - * Scanning pfn is much easier than scanning lru list. - * Scan pfn from start to end and Find LRU page. + * Scan pfn range [start,end) to find movable/migratable pages (LRU pages + * and hugepages). We scan pfn because it's much easier than scanning over + * linked list. This function returns the pfn of the first found movable + * page if it's found, otherwise 0. 
*/ -static unsigned long scan_lru_pages(unsigned long start, unsigned long end) +static unsigned long scan_movable_pages(unsigned long start, unsigned long end) { unsigned long pfn; struct page *page; @@ -1242,6 +1245,13 @@ page = pfn_to_page(pfn); if (PageLRU(page)) return pfn; + if (PageHuge(page)) { + if (is_hugepage_active(page)) + return pfn; + else + pfn = round_up(pfn + 1, + 1 << compound_order(page)) - 1; + } } } return 0; @@ -1262,6 +1272,19 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (!pfn_valid(pfn)) continue; page = pfn_to_page(pfn); + + if (PageHuge(page)) { + struct page *head = compound_head(page); + pfn = page_to_pfn(head) + (1 << compound_order(head)) - 1; + if (compound_order(head) > PFN_SECTION_SHIFT) { + ret = -EBUSY; + break; + } + if (isolate_huge_page(page, &source)) + move_pages -= 1 << compound_order(head); + continue; + } + if (!get_page_unless_zero(page)) continue; /* @@ -1294,7 +1317,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) } if (!list_empty(&source)) { if (not_managed) { - putback_lru_pages(&source); + putback_movable_pages(&source); goto out; } @@ -1305,7 +1328,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) ret = migrate_pages(&source, alloc_migrate_target, 0, MIGRATE_SYNC, MR_MEMORY_HOTPLUG); if (ret) - putback_lru_pages(&source); + putback_movable_pages(&source); } out: return ret; @@ -1548,8 +1571,8 @@ repeat: drain_all_pages(); } - pfn = scan_lru_pages(start_pfn, end_pfn); - if (pfn) { /* We have page on LRU */ + pfn = scan_movable_pages(start_pfn, end_pfn); + if (pfn) { /* We have movable pages */ ret = do_migrate_range(pfn, end_pfn); if (!ret) { drain = 1; @@ -1568,6 +1591,11 @@ repeat: yield(); /* drain pcp pages, this is synchronous. */ drain_all_pages(); + /* + * dissolve free hugepages in the memory block before doing offlining + * actually in order to make hugetlbfs's object counting consistent. + */ + dissolve_free_huge_pages(start_pfn, end_pfn); /* check again */ offlined_pages = check_pages_isolated(start_pfn, end_pfn); if (offlined_pages < 0) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7c3f8d7e2d8e..f7cc08dad26a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6008,6 +6008,17 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, continue; page = pfn_to_page(check); + + /* + * Hugepages are not in LRU lists, but they're movable. + * We need not scan over tail pages bacause we don't + * handle each tail page individually in migration. + */ + if (PageHuge(page)) { + iter = round_up(iter + 1, 1 << compound_order(page)) - 1; + continue; + } + diff --git a/mm/page_isolation.c b/mm/page_isolation.c --- a/mm/page_isolation.c +++ b/mm/page_isolation.c #include #include #include +#include #include "internal.h" int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages) @@ -252,6 +253,19 @@ struct page *alloc_migrate_target(struct page *page, unsigned long private, { gfp_t gfp_mask = GFP_USER | __GFP_MOVABLE; + /* + * TODO: allocate a destination hugepage from a nearest neighbor node, + * accordance with memory policy of the user process if possible. For + * now as a simple work-around, we use the next node for destination.
+ */ + if (PageHuge(page)) { + nodemask_t src = nodemask_of_node(page_to_nid(page)); + nodemask_t dst; + nodes_complement(dst, src); + return alloc_huge_page_node(page_hstate(compound_head(page)), + next_node(page_to_nid(page), dst)); + } + if (PageHighMem(page)) gfp_mask |= __GFP_HIGHMEM; From 83467efbdb7948146581a56cbd683a22a0684bbb Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Sep 2013 14:22:11 -0700 Subject: [PATCH 100/303] mm: migrate: check movability of hugepage in unmap_and_move_huge_page() Currently hugepage migration works well only for pmd-based hugepages (mainly due to lack of testing,) so we had better not enable migration of other levels of hugepages until we are ready for it. Some users of hugepage migration (mbind, move_pages, and migrate_pages) do page table walk and check pud/pmd_huge() there, so they are safe. But the other users (softoffline and memory hotremove) don't do this, so without this patch they can try to migrate unexpected types of hugepages. To prevent this, we introduce hugepage_migration_support() as an architecture dependent check of whether hugepage are implemented on a pmd basis or not. And on some architecture multiple sizes of hugepages are available, so hugepage_migration_support() also checks hugepage size. Signed-off-by: Naoya Horiguchi Cc: Andi Kleen Cc: Hillf Danton Cc: Wanpeng Li Cc: Mel Gorman Cc: Hugh Dickins Cc: KOSAKI Motohiro Cc: Michal Hocko Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/hugetlbpage.c | 5 +++++ arch/arm64/mm/hugetlbpage.c | 5 +++++ arch/ia64/mm/hugetlbpage.c | 5 +++++ arch/metag/mm/hugetlbpage.c | 5 +++++ arch/mips/mm/hugetlbpage.c | 5 +++++ arch/powerpc/mm/hugetlbpage.c | 10 ++++++++++ arch/s390/mm/hugetlbpage.c | 5 +++++ arch/sh/mm/hugetlbpage.c | 5 +++++ arch/sparc/mm/hugetlbpage.c | 5 +++++ arch/tile/mm/hugetlbpage.c | 5 +++++ arch/x86/mm/hugetlbpage.c | 8 ++++++++ include/linux/hugetlb.h | 12 ++++++++++++ mm/migrate.c | 10 ++++++++++ 13 files changed, 85 insertions(+) diff --git a/arch/arm/mm/hugetlbpage.c b/arch/arm/mm/hugetlbpage.c index 66781bf34077..54ee6163c181 100644 --- a/arch/arm/mm/hugetlbpage.c +++ b/arch/arm/mm/hugetlbpage.c @@ -56,3 +56,8 @@ int pmd_huge(pmd_t pmd) { return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT); } + +int pmd_huge_support(void) +{ + return 1; +} diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 2fc8258bab2d..5e9aec358306 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -54,6 +54,11 @@ int pud_huge(pud_t pud) return !(pud_val(pud) & PUD_TABLE_BIT); } +int pmd_huge_support(void) +{ + return 1; +} + static __init int setup_hugepagesz(char *opt) { unsigned long ps = memparse(opt, &opt); diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index 76069c18ee42..68232db98baa 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -114,6 +114,11 @@ int pud_huge(pud_t pud) return 0; } +int pmd_huge_support(void) +{ + return 0; +} + struct page * follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) { diff --git a/arch/metag/mm/hugetlbpage.c b/arch/metag/mm/hugetlbpage.c index 3c52fa6d0f8e..042431509b56 100644 --- a/arch/metag/mm/hugetlbpage.c +++ b/arch/metag/mm/hugetlbpage.c @@ -110,6 +110,11 @@ int pud_huge(pud_t pud) return 0; } +int pmd_huge_support(void) +{ + return 1; +} + struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) { diff --git 
a/arch/mips/mm/hugetlbpage.c b/arch/mips/mm/hugetlbpage.c index a7fee0dfb7a9..01fda4419ed0 100644 --- a/arch/mips/mm/hugetlbpage.c +++ b/arch/mips/mm/hugetlbpage.c @@ -85,6 +85,11 @@ int pud_huge(pud_t pud) return (pud_val(pud) & _PAGE_HUGE) != 0; } +int pmd_huge_support(void) +{ + return 1; +} + struct page * follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 834ca8eb38f2..d67db4bd672d 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -86,6 +86,11 @@ int pgd_huge(pgd_t pgd) */ return ((pgd_val(pgd) & 0x3) != 0x0); } + +int pmd_huge_support(void) +{ + return 1; +} #else int pmd_huge(pmd_t pmd) { @@ -101,6 +106,11 @@ int pgd_huge(pgd_t pgd) { return 0; } + +int pmd_huge_support(void) +{ + return 0; +} #endif pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index 248445f92604..d261c62e40a6 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -223,6 +223,11 @@ int pud_huge(pud_t pud) return 0; } +int pmd_huge_support(void) +{ + return 1; +} + struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmdp, int write) { diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index d7762349ea48..0d676a41081e 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -83,6 +83,11 @@ int pud_huge(pud_t pud) return 0; } +int pmd_huge_support(void) +{ + return 0; +} + struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) { diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index d2b59441ebdd..96399646570a 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -234,6 +234,11 @@ int pud_huge(pud_t pud) return 0; } +int pmd_huge_support(void) +{ + return 0; +} + struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) { diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c index e514899e1100..0cb3bbaa580c 100644 --- a/arch/tile/mm/hugetlbpage.c +++ b/arch/tile/mm/hugetlbpage.c @@ -166,6 +166,11 @@ int pud_huge(pud_t pud) return !!(pud_val(pud) & _PAGE_HUGE_PAGE); } +int pmd_huge_support(void) +{ + return 1; +} + struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) { diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 7e73e8c69096..9d980d88b747 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -59,6 +59,10 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, return NULL; } +int pmd_huge_support(void) +{ + return 0; +} #else struct page * @@ -77,6 +81,10 @@ int pud_huge(pud_t pud) return !!(pud_val(pud) & _PAGE_PSE); } +int pmd_huge_support(void) +{ + return 1; +} #endif /* x86_64 also uses this file */ diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 2e02c4ed1035..0393270466c3 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -381,6 +381,16 @@ static inline pgoff_t basepage_index(struct page *page) extern void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn); +int pmd_huge_support(void); +/* + * Currently hugepage migration is enabled only for pmd-based hugepage. + * This function will be updated when hugepage migration is more widely + * supported. 
+ */ +static inline int hugepage_migration_support(struct hstate *h) +{ + return pmd_huge_support() && (huge_page_shift(h) == PMD_SHIFT); +} #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; @@ -409,6 +419,8 @@ static inline pgoff_t basepage_index(struct page *page) return page->index; } #define dissolve_free_huge_pages(s, e) do {} while (0) +#define pmd_huge_support() 0 +#define hugepage_migration_support(h) 0 #endif /* CONFIG_HUGETLB_PAGE */ #endif /* _LINUX_HUGETLB_H */ diff --git a/mm/migrate.c b/mm/migrate.c index d3137375fa80..61f14a1923fd 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -949,6 +949,16 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, struct page *new_hpage = get_new_page(hpage, private, &result); struct anon_vma *anon_vma = NULL; + /* + * Movability of hugepages depends on architectures and hugepage size. + * This check is necessary because some callers of hugepage migration + * like soft offline and memory hotremove don't walk through page + * tables or check whether the hugepage is pmd-based or not before + * kicking migration. + */ + if (!hugepage_migration_support(page_hstate(hpage))) + return -ENOSYS; + if (!new_hpage) return -ENOMEM; From 86cdb465cf3a9d81058b517af05074157fa9dcdd Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Sep 2013 14:22:13 -0700 Subject: [PATCH 101/303] mm: prepare to remove /proc/sys/vm/hugepages_treat_as_movable Now hugepage migration is enabled, although restricted on pmd-based hugepages for now (due to lack of testing.) So we should allocate migratable hugepages from ZONE_MOVABLE if possible. This patch makes GFP flags in hugepage allocation dependent on migration support, not only the value of hugepages_treat_as_movable. It provides no change on the behavior for architectures which do not support hugepage migration, Signed-off-by: Naoya Horiguchi Acked-by: Andi Kleen Reviewed-by: Wanpeng Li Cc: Hillf Danton Cc: Mel Gorman Cc: Hugh Dickins Cc: KOSAKI Motohiro Cc: Michal Hocko Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 28 ++++++++++++++++++---------- kernel/sysctl.c | 2 +- mm/hugetlb.c | 32 ++++++++++++++------------------ 3 files changed, 33 insertions(+), 29 deletions(-) diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 36ecc26c7433..79a797eb3e87 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -200,17 +200,25 @@ fragmentation index is <= extfrag_threshold. The default value is 500. hugepages_treat_as_movable -This parameter is only useful when kernelcore= is specified at boot time to -create ZONE_MOVABLE for pages that may be reclaimed or migrated. Huge pages -are not movable so are not normally allocated from ZONE_MOVABLE. A non-zero -value written to hugepages_treat_as_movable allows huge pages to be allocated -from ZONE_MOVABLE. +This parameter controls whether we can allocate hugepages from ZONE_MOVABLE +or not. If set to non-zero, hugepages can be allocated from ZONE_MOVABLE. +ZONE_MOVABLE is created when kernel boot parameter kernelcore= is specified, +so this parameter has no effect if used without kernelcore=. -Once enabled, the ZONE_MOVABLE is treated as an area of memory the huge -pages pool can easily grow or shrink within. Assuming that applications are -not running that mlock() a lot of memory, it is likely the huge pages pool -can grow to the size of ZONE_MOVABLE by repeatedly entering the desired value -into nr_hugepages and triggering page reclaim. 
+Hugepage migration is now available in some situations which depend on the +architecture and/or the hugepage size. If a hugepage supports migration, +allocation from ZONE_MOVABLE is always enabled for the hugepage regardless +of the value of this parameter. +IOW, this parameter affects only non-migratable hugepages. + +Assuming that hugepages are not migratable in your system, one usecase of +this parameter is that users can make hugepage pool more extensible by +enabling the allocation from ZONE_MOVABLE. This is because on ZONE_MOVABLE +page reclaim/migration/compaction work more and you can get contiguous +memory more likely. Note that using ZONE_MOVABLE for non-migratable +hugepages can do harm to other features like memory hotremove (because +memory hotremove expects that memory blocks on ZONE_MOVABLE are always +removable,) so it's a trade-off responsible for the users. ============================================================== diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 07f6fc468e17..dc69093a8ec4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1225,7 +1225,7 @@ static struct ctl_table vm_table[] = { .data = &hugepages_treat_as_movable, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = hugetlb_treat_movable_handler, + .proc_handler = proc_dointvec, }, { .procname = "nr_overcommit_hugepages", diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fb4293b93fd0..b49579c7f2a5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -34,7 +34,6 @@ #include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; -static gfp_t htlb_alloc_mask = GFP_HIGHUSER; unsigned long hugepages_treat_as_movable; int hugetlb_max_hstate __read_mostly; @@ -539,6 +538,15 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid) return page; } +/* Movability of hugepages depends on migration support. 
*/ +static inline gfp_t htlb_alloc_mask(struct hstate *h) +{ + if (hugepages_treat_as_movable || hugepage_migration_support(h)) + return GFP_HIGHUSER_MOVABLE; + else + return GFP_HIGHUSER; +} + static struct page *dequeue_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, int avoid_reserve, @@ -568,11 +576,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, retry_cpuset: cpuset_mems_cookie = get_mems_allowed(); zonelist = huge_zonelist(vma, address, - htlb_alloc_mask, &mpol, &nodemask); + htlb_alloc_mask(h), &mpol, &nodemask); for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { - if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) { + if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask(h))) { page = dequeue_huge_page_node(h, zone_to_nid(zone)); if (page) { if (avoid_reserve) @@ -738,7 +746,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) return NULL; page = alloc_pages_exact_node(nid, - htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| + htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); if (page) { @@ -965,12 +973,12 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) spin_unlock(&hugetlb_lock); if (nid == NUMA_NO_NODE) - page = alloc_pages(htlb_alloc_mask|__GFP_COMP| + page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); else page = alloc_pages_exact_node(nid, - htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| + htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); if (page && arch_prepare_hugepage(page)) { @@ -2117,18 +2125,6 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, } #endif /* CONFIG_NUMA */ -int hugetlb_treat_movable_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *length, loff_t *ppos) -{ - proc_dointvec(table, write, buffer, length, ppos); - if (hugepages_treat_as_movable) - htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; - else - htlb_alloc_mask = GFP_HIGHUSER; - return 0; -} - int hugetlb_overcommit_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) From 98094945785464c657d598291d714d11694c8cd9 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Sep 2013 14:22:14 -0700 Subject: [PATCH 102/303] mm/mempolicy: rename check_*range to queue_pages_*range The function check_range() (and its family) is not well-named, because it does not only check something; it also moves pages from list to list to do page migration for them. So queue_pages_*range is a more suitable name. Signed-off-by: Naoya Horiguchi Cc: Andi Kleen Cc: Wanpeng Li Cc: Hillf Danton Cc: Mel Gorman Cc: Hugh Dickins Cc: KOSAKI Motohiro Cc: Michal Hocko Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c7c359213ae1..9d778637b088 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -476,8 +476,11 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { static void migrate_page_add(struct page *page, struct list_head *pagelist, unsigned long flags); -/* Scan through pages checking if pages follow certain conditions.
*/ -static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, +/* + * Scan through pages checking if pages follow certain conditions, + * and move them to the pagelist if they do. + */ +static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, void *private) @@ -515,8 +518,8 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, return addr != end; } -static void check_hugetlb_pmd_range(struct vm_area_struct *vma, pmd_t *pmd, - const nodemask_t *nodes, unsigned long flags, +static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma, + pmd_t *pmd, const nodemask_t *nodes, unsigned long flags, void *private) { #ifdef CONFIG_HUGETLB_PAGE @@ -539,7 +542,7 @@ unlock: #endif } -static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, +static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, void *private) @@ -553,21 +556,21 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, if (!pmd_present(*pmd)) continue; if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) { - check_hugetlb_pmd_range(vma, pmd, nodes, + queue_pages_hugetlb_pmd_range(vma, pmd, nodes, flags, private); continue; } split_huge_page_pmd(vma, addr, pmd); if (pmd_none_or_trans_huge_or_clear_bad(pmd)) continue; - if (check_pte_range(vma, pmd, addr, next, nodes, + if (queue_pages_pte_range(vma, pmd, addr, next, nodes, flags, private)) return -EIO; } while (pmd++, addr = next, addr != end); return 0; } -static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, +static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, void *private) @@ -582,14 +585,14 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, continue; if (pud_none_or_clear_bad(pud)) continue; - if (check_pmd_range(vma, pud, addr, next, nodes, + if (queue_pages_pmd_range(vma, pud, addr, next, nodes, flags, private)) return -EIO; } while (pud++, addr = next, addr != end); return 0; } -static inline int check_pgd_range(struct vm_area_struct *vma, +static inline int queue_pages_pgd_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, void *private) @@ -602,7 +605,7 @@ static inline int check_pgd_range(struct vm_area_struct *vma, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - if (check_pud_range(vma, pgd, addr, next, nodes, + if (queue_pages_pud_range(vma, pgd, addr, next, nodes, flags, private)) return -EIO; } while (pgd++, addr = next, addr != end); @@ -640,12 +643,14 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma, #endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ /* - * Check if all pages in a range are on a set of nodes. - * If pagelist != NULL then isolate pages from the LRU and - * put them on the pagelist. + * Walk through page tables and collect pages to be migrated. + * + * If pages found in a given range are on a set of nodes (determined by + * @nodes and @flags,) it's isolated and queued to the pagelist which is + * passed via @private.) 
*/ static struct vm_area_struct * -check_range(struct mm_struct *mm, unsigned long start, unsigned long end, +queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, const nodemask_t *nodes, unsigned long flags, void *private) { int err; @@ -680,7 +685,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) && vma_migratable(vma))) { - err = check_pgd_range(vma, start, endvma, nodes, + err = queue_pages_pgd_range(vma, start, endvma, nodes, flags, private); if (err) { first = ERR_PTR(err); @@ -1050,7 +1055,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. */ VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); - check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, + queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) { @@ -1288,7 +1293,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (err) goto mpol_out; - vma = check_range(mm, start, end, nmask, + vma = queue_pages_range(mm, start, end, nmask, flags | MPOL_MF_INVERT, &pagelist); err = PTR_ERR(vma); /* maybe ... */ From 0bf598d863e3c741d47e3178d645f04c9d6c186c Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Sep 2013 14:22:16 -0700 Subject: [PATCH 103/303] mbind: add BUG_ON(!vma) in new_vma_page() new_vma_page() is called only by page migration called from do_mbind(), where pages to be migrated are queued into a pagelist by queue_pages_range(). queue_pages_range() confirms that a queued page belongs to some vma, so the !vma case is not supposed to happen. This patch adds a BUG_ON() to catch this unexpected case. Signed-off-by: Naoya Horiguchi Reported-by: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9d778637b088..04729647f359 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1196,12 +1196,14 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int * break; vma = vma->vm_next; } + /* + * queue_pages_range() confirms that @page belongs to some vma, + * so vma shouldn't be NULL. + */ + BUG_ON(!vma); if (PageHuge(page)) return alloc_huge_page_noerr(vma, address, 1); - /* - * if !vma, alloc_page_vma() will use task or system default policy - */ return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); } #else From e76b63f80d938a1319eb5fb0ae7ea69bddfbae38 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 11 Sep 2013 14:22:17 -0700 Subject: [PATCH 104/303] memblock, numa: binary search node id The current early_pfn_to_nid() on arches that support memblock goes over memblock.memory one entry at a time, so it takes too many tries near the end of the array. We can use the existing memblock_search() to find the node id for a given pfn instead; that can save some time on bigger systems whose memblock.memory arrays have many entries. Here are the timing differences for several machines. In each case, less time was spent in __early_pfn_to_nid() with the patch.

                          3.11-rc5    with patch    difference (%)
                          --------    ----------    --------------
  UV1: 256 nodes  9TB:      411.66        402.47    -9.19 (2.23%)
  UV2: 255 nodes 16TB:     1141.02       1138.12    -2.90 (0.25%)
  UV2:  64 nodes  2TB:      128.15        126.53    -1.62 (1.26%)
  UV2:  32 nodes  2TB:      121.87        121.07    -0.80 (0.66%)

Time in seconds.
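A minimal sketch of the idea (simplified types, not the kernel's actual structures): because memblock.memory is kept sorted and non-overlapping, the region covering an address can be found by bisection rather than a linear walk.

	struct region { unsigned long base, size; int nid; };

	static int search_regions(const struct region *r, int cnt,
				  unsigned long addr)
	{
		int lo = 0, hi = cnt;

		while (lo < hi) {
			int mid = (lo + hi) / 2;

			if (addr < r[mid].base)
				hi = mid;		/* go left */
			else if (addr >= r[mid].base + r[mid].size)
				lo = mid + 1;		/* go right */
			else
				return mid;		/* addr inside r[mid] */
		}
		return -1;				/* memory hole */
	}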
Signed-off-by: Yinghai Lu Cc: Tejun Heo Acked-by: Russ Anderson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 2 ++ mm/memblock.c | 18 ++++++++++++++++++ mm/page_alloc.c | 19 +++++++++---------- 3 files changed, 29 insertions(+), 10 deletions(-) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index f388203db7e8..31e95acddb4d 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -60,6 +60,8 @@ int memblock_reserve(phys_addr_t base, phys_addr_t size); void memblock_trim_memory(phys_addr_t align); #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, + unsigned long *end_pfn); void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, unsigned long *out_end_pfn, int *out_nid); diff --git a/mm/memblock.c b/mm/memblock.c index a847bfe6f3ba..0ac412a0a7ee 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -914,6 +914,24 @@ int __init_memblock memblock_is_memory(phys_addr_t addr) return memblock_search(&memblock.memory, addr) != -1; } +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP +int __init_memblock memblock_search_pfn_nid(unsigned long pfn, + unsigned long *start_pfn, unsigned long *end_pfn) +{ + struct memblock_type *type = &memblock.memory; + int mid = memblock_search(type, (phys_addr_t)pfn << PAGE_SHIFT); + + if (mid == -1) + return -1; + + *start_pfn = type->regions[mid].base >> PAGE_SHIFT; + *end_pfn = (type->regions[mid].base + type->regions[mid].size) + >> PAGE_SHIFT; + + return type->regions[mid].nid; +} +#endif + /** * memblock_is_region_memory - check if a region is a subset of memory * @base: base of region to check diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f7cc08dad26a..22653e34a047 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4306,7 +4306,7 @@ int __meminit init_currently_empty_zone(struct zone *zone, int __meminit __early_pfn_to_nid(unsigned long pfn) { unsigned long start_pfn, end_pfn; - int i, nid; + int nid; /* * NOTE: The following SMP-unsafe globals are only used early in boot * when the kernel is running single-threaded. @@ -4317,15 +4317,14 @@ int __meminit __early_pfn_to_nid(unsigned long pfn) if (last_start_pfn <= pfn && pfn < last_end_pfn) return last_nid; - for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) - if (start_pfn <= pfn && pfn < end_pfn) { - last_start_pfn = start_pfn; - last_end_pfn = end_pfn; - last_nid = nid; - return nid; - } - /* This is a memory hole */ - return -1; + nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn); + if (nid != -1) { + last_start_pfn = start_pfn; + last_end_pfn = end_pfn; + last_nid = nid; + } + + return nid; } #endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ From 90c7a79cc45becc6cdb8c026d55ace19e299a02d Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Wed, 11 Sep 2013 14:22:18 -0700 Subject: [PATCH 105/303] kmemcg: don't allocate extra memory for root memcg_cache_params The memcg_cache_params structure contains the common part and the union, which represents two different types of data: one for root caches and another for child caches. The size of the child data is fixed. The size of the memcg_caches array is calculated at runtime. Currently the size of memcg_cache_params for root caches is calculated incorrectly, because it includes the size of parameters for child caches.
ssize_t size = memcg_caches_array_size(num_groups); size *= sizeof(void *); size += sizeof(struct memcg_cache_params); v2: Fix a typo in calculations Signed-off-by: Andrey Vagin Cc: Glauber Costa Cc: Johannes Weiner Cc: Michal Hocko Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3b83957b6439..5ca1dcf77ce9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3121,7 +3121,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) ssize_t size = memcg_caches_array_size(num_groups); size *= sizeof(void *); - size += sizeof(struct memcg_cache_params); + size += offsetof(struct memcg_cache_params, memcg_caches); s->memcg_params = kzalloc(size, GFP_KERNEL); if (!s->memcg_params) { @@ -3164,13 +3164,16 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, struct kmem_cache *root_cache) { - size_t size = sizeof(struct memcg_cache_params); + size_t size; if (!memcg_kmem_enabled()) return 0; - if (!memcg) + if (!memcg) { + size = offsetof(struct memcg_cache_params, memcg_caches); size += memcg_limited_groups_array_size * sizeof(void *); + } else + size = sizeof(struct memcg_cache_params); s->memcg_params = kzalloc(size, GFP_KERNEL); if (!s->memcg_params) From 3a7200af3d9227767869f451ed747aff07d8df48 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 11 Sep 2013 14:22:19 -0700 Subject: [PATCH 106/303] mm: compaction: do not compact pgdat for order-0 If kswapd was reclaiming for a high order and resets it to 0 due to fragmentation, it will still call compact_pgdat. For the most part this will fail a compaction_suitable() test and not compact, but it is unnecessarily sloppy. It could be fixed in the caller, but fix it in the API instead. [dhillf@gmail.com: pointed out that it was a potential problem] Signed-off-by: Mel Gorman Cc: Hillf Danton Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/compaction.c b/mm/compaction.c index 05ccb4cc0bdb..c43789388cd8 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1131,6 +1131,9 @@ void compact_pgdat(pg_data_t *pgdat, int order) .sync = false, }; + if (!order) + return; + __compact_pgdat(pgdat, &cc); } From 7cb2ef56e6a8b7b368b2e883a0a47d02fed66911 Mon Sep 17 00:00:00 2001 From: Khalid Aziz Date: Wed, 11 Sep 2013 14:22:20 -0700 Subject: [PATCH 107/303] mm: fix aio performance regression for database caused by THP I am working with a tool that simulates an Oracle database I/O workload. This tool (orion to be specific - ) allocates hugetlbfs pages using shmget() with the SHM_HUGETLB flag. It then does aio into these pages from flash disks using various common block sizes used by databases. I am looking at performance with two of the most common block sizes - 1M and 64K. aio performance with these two block sizes plunged after Transparent HugePages was introduced in the kernel. Here are the performance numbers:

             pre-THP      2.6.39       3.11-rc5
  1M read    8384 MB/s    5629 MB/s    6501 MB/s
  64K read   7867 MB/s    4576 MB/s    4251 MB/s

I have narrowed the performance impact down to the overheads introduced by THP in the __get_page_tail() and put_compound_page() routines. perf top shows >40% of cycles being spent in these two routines.
Every time direct I/O to hugetlbfs pages starts, the kernel calls get_page() to grab a reference to the pages and calls put_page() when I/O completes to drop the reference. THP introduced a significant amount of locking overhead to get_page() and put_page() when dealing with compound pages, because hugepages can be split underneath get_page() and put_page(). It added this overhead irrespective of whether it is dealing with hugetlbfs pages or transparent hugepages. This resulted in a 20%-45% drop in aio performance when using hugetlbfs pages. Since hugetlbfs pages cannot be split, there is no reason to go through all the locking overhead for these pages from what I can see. I added code to __get_page_tail() and put_compound_page() to bypass all the locking code when working with hugetlbfs pages. This improved performance significantly. Performance numbers with this patch:

             pre-THP      3.11-rc5     3.11-rc5 + Patch
  1M read    8384 MB/s    6501 MB/s    8371 MB/s
  64K read   7867 MB/s    4251 MB/s    6510 MB/s

Performance with 64K read is still lower than what it was before THP, but still a 53% improvement. It does mean there is more work to be done, but I will take a 53% improvement for now. Please take a look at the following patch and let me know if it looks reasonable. [akpm@linux-foundation.org: tweak comments] Signed-off-by: Khalid Aziz Cc: Pravin B Shelar Cc: Christoph Lameter Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Mel Gorman Cc: Rik van Riel Cc: Minchan Kim Cc: Andi Kleen Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 77 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 52 insertions(+), 25 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index 62b78a6e224f..c899502d3e36 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "internal.h" @@ -81,6 +82,19 @@ static void __put_compound_page(struct page *page) static void put_compound_page(struct page *page) { + /* + * hugetlbfs pages cannot be split from under us. If this is a + * hugetlbfs page, check refcount on head page and release the page if + * the refcount becomes zero. + */ + if (PageHuge(page)) { + page = compound_head(page); + if (put_page_testzero(page)) + __put_compound_page(page); + + return; + } + if (unlikely(PageTail(page))) { /* __split_huge_page_refcount can run under us */ struct page *page_head = compound_trans_head(page); @@ -184,38 +198,51 @@ bool __get_page_tail(struct page *page) * proper PT lock that already serializes against * split_huge_page(). */ - unsigned long flags; bool got = false; - struct page *page_head = compound_trans_head(page); + struct page *page_head; - if (likely(page != page_head && get_page_unless_zero(page_head))) { + /* + * If this is a hugetlbfs page it cannot be split under us. Simply + * increment refcount for the head page. + */ + if (PageHuge(page)) { + page_head = compound_head(page); + atomic_inc(&page_head->_count); + got = true; + } else { + unsigned long flags; - /* Ref to put_compound_page() comment. */ - if (PageSlab(page_head)) { + page_head = compound_trans_head(page); + if (likely(page != page_head && + get_page_unless_zero(page_head))) { + + /* Ref to put_compound_page() comment. */ + if (PageSlab(page_head)) { + if (likely(PageTail(page))) { + __get_page_tail_foll(page, false); + return true; + } else { + put_page(page_head); + return false; + } + } + + /* + * page_head wasn't a dangling pointer but it + * may not be a head page anymore by the time + * we obtain the lock.
That is ok as long as it + * can't be freed from under us. + */ + flags = compound_lock_irqsave(page_head); + /* here __split_huge_page_refcount won't run anymore */ if (likely(PageTail(page))) { __get_page_tail_foll(page, false); - return true; - } else { - put_page(page_head); - return false; + got = true; } + compound_unlock_irqrestore(page_head, flags); + if (unlikely(!got)) + put_page(page_head); } - - /* - * page_head wasn't a dangling pointer but it - * may not be a head page anymore by the time - * we obtain the lock. That is ok as long as it - * can't be freed from under us. - */ - flags = compound_lock_irqsave(page_head); - /* here __split_huge_page_refcount won't run anymore */ - if (likely(PageTail(page))) { - __get_page_tail_foll(page, false); - got = true; - } - compound_unlock_irqrestore(page_head, flags); - if (unlikely(!got)) - put_page(page_head); } return got; } From 47df3ddedd22c3f8e68aff831edb7921937674a2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 11 Sep 2013 14:22:22 -0700 Subject: [PATCH 108/303] writeback: fix occasional slow sync(1) In case when system contains no dirty pages, wakeup_flusher_threads() will submit WB_SYNC_NONE writeback for 0 pages so wb_writeback() exits immediately without doing anything, even though there are dirty inodes in the system. Thus sync(1) will write all the dirty inodes from a WB_SYNC_ALL writeback pass which is slow. Fix the problem by using get_nr_dirty_pages() in wakeup_flusher_threads() instead of calculating number of dirty pages manually. That function also takes number of dirty inodes into account. Signed-off-by: Jan Kara Reported-by: Paul Taysom Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 68851ff2fd41..87d778118027 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1049,10 +1049,8 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) { struct backing_dev_info *bdi; - if (!nr_pages) { - nr_pages = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS); - } + if (!nr_pages) + nr_pages = get_nr_dirty_pages(); rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { From 3b11f0aaae830f0f569cb8fb7fd26f4133ebdabd Mon Sep 17 00:00:00 2001 From: SeungHun Lee Date: Wed, 11 Sep 2013 14:22:23 -0700 Subject: [PATCH 109/303] mm: page_alloc: fix comment get_page_from_freelist cpuset_zone_allowed is changed to cpuset_zone_allowed_softwall and the comment is moved to __cpuset_node_allowed_softwall. So fix this comment. Signed-off-by: SeungHun Lee Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 22653e34a047..7b1b706a1ffa 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1896,7 +1896,7 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, zonelist_scan: /* * Scan zonelist, looking for a zone with enough free. - * See also cpuset_zone_allowed() comment in kernel/cpuset.c. + * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. 
*/ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) { From d9104d1ca9662498339c0de975b4666c30485f4e Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 11 Sep 2013 14:22:24 -0700 Subject: [PATCH 110/303] mm: track vma changes with VM_SOFTDIRTY bit Pavel reported that if a vma area gets unmapped and then mapped (or expanded) in-place, the soft dirty tracker won't be able to recognize this situation, since it works on the pte level and ptes get zapped on unmap, losing the soft dirty bit of course. So to resolve this situation we need to track actions on the vma level, which is where the VM_SOFTDIRTY flag comes in. When a new vma area is created (or an old one is expanded) we set this bit, and keep it there until the application asks for the soft dirty bit to be cleared. Thus when a user space application tracks memory changes it can now detect whether a vma area has been renewed. Reported-by: Pavel Emelyanov Signed-off-by: Cyrill Gorcunov Cc: Andy Lutomirski Cc: Matt Mackall Cc: Xiao Guangrong Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Cc: Stephen Rothwell Cc: Peter Zijlstra Cc: "Aneesh Kumar K.V" Cc: Rob Landley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/soft-dirty.txt | 7 +++++ fs/exec.c | 2 +- fs/proc/task_mmu.c | 46 ++++++++++++++++++++++++++------- include/linux/mm.h | 6 +++++ mm/mmap.c | 12 ++++++++- 5 files changed, 61 insertions(+), 12 deletions(-) diff --git a/Documentation/vm/soft-dirty.txt b/Documentation/vm/soft-dirty.txt index 9a12a5956bc0..55684d11a1e8 100644 --- a/Documentation/vm/soft-dirty.txt +++ b/Documentation/vm/soft-dirty.txt @@ -28,6 +28,13 @@ This is so, since the pages are still mapped to physical memory, and thus all the kernel does is finds this fact out and puts both writable and soft-dirty bits on the PTE. + While in most cases tracking memory changes by #PF-s is more than enough +there is still a scenario when we can lose soft dirty bits -- a task +unmaps a previously mapped memory region and then maps a new one at exactly +the same place. When unmap is called, the kernel internally clears PTE values +including soft dirty bits. To notify user space application about such +memory region renewal the kernel always marks new memory regions (and +expanded regions) as soft dirty. This feature is actively used by the checkpoint-restore project.
You can find more details about it on http://criu.org diff --git a/fs/exec.c b/fs/exec.c index fd774c7cb483..2d1e52a58fe9 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -266,7 +266,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm) BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; - vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; + vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); INIT_LIST_HEAD(&vma->anon_vma_chain); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 107d026f5d6e..09228639b83d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -740,6 +740,9 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, ptent = pte_file_clear_soft_dirty(ptent); } + if (vma->vm_flags & VM_SOFTDIRTY) + vma->vm_flags &= ~VM_SOFTDIRTY; + set_pte_at(vma->vm_mm, addr, pte, ptent); #endif } @@ -949,13 +952,15 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, if (is_migration_entry(entry)) page = migration_entry_to_page(entry); } else { - *pme = make_pme(PM_NOT_PRESENT(pm->v2)); + if (vma->vm_flags & VM_SOFTDIRTY) + flags2 |= __PM_SOFT_DIRTY; + *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); return; } if (page && !PageAnon(page)) flags |= PM_FILE; - if (pte_soft_dirty(pte)) + if ((vma->vm_flags & VM_SOFTDIRTY) || pte_soft_dirty(pte)) flags2 |= __PM_SOFT_DIRTY; *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); @@ -974,7 +979,7 @@ static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *p *pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset) | PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT); else - *pme = make_pme(PM_NOT_PRESENT(pm->v2)); + *pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2)); } #else static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, @@ -997,7 +1002,11 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (vma && pmd_trans_huge_lock(pmd, vma) == 1) { int pmd_flags2; - pmd_flags2 = (pmd_soft_dirty(*pmd) ? 
__PM_SOFT_DIRTY : 0); + if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd)) + pmd_flags2 = __PM_SOFT_DIRTY; + else + pmd_flags2 = 0; + for (; addr != end; addr += PAGE_SIZE) { unsigned long offset; @@ -1015,12 +1024,17 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (pmd_trans_unstable(pmd)) return 0; for (; addr != end; addr += PAGE_SIZE) { + int flags2; /* check to see if we've left 'vma' behind * and need a new, higher one */ if (vma && (addr >= vma->vm_end)) { vma = find_vma(walk->mm, addr); - pme = make_pme(PM_NOT_PRESENT(pm->v2)); + if (vma && (vma->vm_flags & VM_SOFTDIRTY)) + flags2 = __PM_SOFT_DIRTY; + else + flags2 = 0; + pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); } /* check that 'vma' actually covers this address, @@ -1044,13 +1058,15 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, #ifdef CONFIG_HUGETLB_PAGE static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, - pte_t pte, int offset) + pte_t pte, int offset, int flags2) { if (pte_present(pte)) - *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) - | PM_STATUS2(pm->v2, 0) | PM_PRESENT); + *pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) | + PM_STATUS2(pm->v2, flags2) | + PM_PRESENT); else - *pme = make_pme(PM_NOT_PRESENT(pm->v2)); + *pme = make_pme(PM_NOT_PRESENT(pm->v2) | + PM_STATUS2(pm->v2, flags2)); } /* This function walks within one hugetlb entry in the single call */ @@ -1059,12 +1075,22 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, struct mm_walk *walk) { struct pagemapread *pm = walk->private; + struct vm_area_struct *vma; int err = 0; + int flags2; pagemap_entry_t pme; + vma = find_vma(walk->mm, addr); + WARN_ON_ONCE(!vma); + + if (vma && (vma->vm_flags & VM_SOFTDIRTY)) + flags2 = __PM_SOFT_DIRTY; + else + flags2 = 0; + for (; addr != end; addr += PAGE_SIZE) { int offset = (addr & ~hmask) >> PAGE_SHIFT; - huge_pte_to_pagemap_entry(&pme, pm, *pte, offset); + huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2); err = add_to_pagemap(addr, &pme, pm); if (err) return err; diff --git a/include/linux/mm.h b/include/linux/mm.h index d2d59b4149d0..dce24569f8fc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -115,6 +115,12 @@ extern unsigned int kobjsize(const void *objp); #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ +#ifdef CONFIG_MEM_SOFT_DIRTY +# define VM_SOFTDIRTY 0x08000000 /* Not soft dirty clean area */ +#else +# define VM_SOFTDIRTY 0 +#endif + #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ #define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */ #define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ diff --git a/mm/mmap.c b/mm/mmap.c index 13926a5a6901..51958d192a48 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1609,6 +1609,15 @@ out: if (file) uprobe_mmap(vma); + /* + * New (or expanded) vma always get soft dirty status. + * Otherwise user-space soft-dirty page tracker won't + * be able to distinguish situation when vma area unmapped, + * then new mapped in-place (which must be aimed as + * a completely new data area). 
+ */ + vma->vm_flags |= VM_SOFTDIRTY; + return addr; unmap_and_free_vma: @@ -2652,6 +2661,7 @@ out: mm->total_vm += len >> PAGE_SHIFT; if (flags & VM_LOCKED) mm->locked_vm += (len >> PAGE_SHIFT); + vma->vm_flags |= VM_SOFTDIRTY; return addr; } @@ -2916,7 +2926,7 @@ int install_special_mapping(struct mm_struct *mm, vma->vm_start = addr; vma->vm_end = addr + len; - vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND; + vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); vma->vm_ops = &special_mapping_vmops; From 0ec3b74c7f5599c8a4d2b33d430a5470af26ebf6 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 11 Sep 2013 14:22:26 -0700 Subject: [PATCH 111/303] mm: putback_lru_page: remove unnecessary call to page_lru_base_type() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The goal of this patch series is to improve performance of munlock() of large mlocked memory areas on systems without THP. This is motivated by reports of very long crash-recovery times for processes with such areas, where munlock() can take several seconds. See http://lwn.net/Articles/548108/ The work was driven by a simple benchmark (to be included in mmtests) that mmap()s e.g. 56GB with MAP_LOCKED | MAP_POPULATE and measures the time of munlock(). Profiling was performed by attaching operf --pid to the process, sending a signal to trigger the munlock() part, and then notifying back the monitoring wrapper to stop operf, so that only munlock() appears in the profile. The profiles have shown that CPU time is spent mostly on atomic operations and repeated locking of single pages. This series aims to reduce both, starting from simpler to more complex changes. Patch 1 performs a simple cleanup in putback_lru_page() so that the page lru base type is not determined when it is not actually needed. Patch 2 removes an unnecessary call to lru_add_drain(), which drains the per-cpu pagevec after each munlocked page is put there. Patch 3 changes munlock_vma_range() to use an on-stack pagevec for isolating multiple non-THP pages under a single lru_lock instead of locking and processing each page separately. Patch 4 changes the NR_MLOCK accounting to be called only once per pvec introduced by the previous patch. Patch 5 uses the introduced pagevec to also batch the work of putback_lru_page() when possible, bypassing the per-cpu pvec and its associated overhead. Patch 6 removes a redundant get_page/put_page pair, which saves costly atomic operations. Patch 7 avoids calling follow_page_mask() on each individual page, and obtains multiple page references under a single page table lock where possible. Measurements were made using 3.11-rc3 as a baseline. The first set of measurements shows the possibly ideal conditions where batching should help the most. All memory is allocated from a single NUMA node and THP is disabled.
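A minimal sketch of such a timed munlock() measurement might look like this (an illustration only, not the actual mmtests benchmark; the size is reduced from the 56GB used for the reported numbers):

	#include <stdio.h>
	#include <sys/mman.h>
	#include <time.h>

	int main(void)
	{
		size_t size = 1UL << 30;	/* 1GB; the reported runs used 56GB */
		struct timespec t0, t1;
		void *mem;

		/* MAP_LOCKED | MAP_POPULATE mlocks and faults in the whole area */
		mem = mmap(NULL, size, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED | MAP_POPULATE,
			   -1, 0);
		if (mem == MAP_FAILED) {
			perror("mmap");
			return 1;
		}

		clock_gettime(CLOCK_MONOTONIC, &t0);
		munlock(mem, size);	/* the operation being timed */
		clock_gettime(CLOCK_MONOTONIC, &t1);

		printf("munlock: %.3f s\n", (t1.tv_sec - t0.tv_sec) +
		       (t1.tv_nsec - t0.tv_nsec) / 1e9);
		munmap(mem, size);
		return 0;
	}

In the tables that follow, column 0 is the unpatched 3.11-rc3 baseline and columns 1-7 correspond to patches 1-7 of the series.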
timedmunlock   3.11-rc3  3.11-rc3  3.11-rc3  3.11-rc3  3.11-rc3  3.11-rc3  3.11-rc3  3.11-rc3
                      0         1         2         3         4         5         6         7
Elapsed min      3.38 ( 0.00%)  3.39 ( -0.13%)  3.00 ( 11.33%)  2.70 ( 20.20%)  2.67 ( 21.11%)  2.37 ( 29.88%)  2.20 ( 34.91%)  1.91 ( 43.59%)
Elapsed mean     3.39 ( 0.00%)  3.40 ( -0.23%)  3.01 ( 11.33%)  2.70 ( 20.26%)  2.67 ( 21.21%)  2.38 ( 29.88%)  2.21 ( 34.93%)  1.92 ( 43.46%)
Elapsed stddev   0.01 ( 0.00%)  0.01 (-43.09%)  0.01 ( 15.42%)  0.01 ( 23.42%)  0.00 ( 89.78%)  0.01 ( -7.15%)  0.00 ( 76.69%)  0.02 (-91.77%)
Elapsed max      3.41 ( 0.00%)  3.43 ( -0.52%)  3.03 ( 11.29%)  2.72 ( 20.16%)  2.67 ( 21.63%)  2.40 ( 29.50%)  2.21 ( 35.21%)  1.96 ( 42.39%)
Elapsed range    0.03 ( 0.00%)  0.04 (-51.16%)  0.02 ( 6.27%)  0.02 ( 14.67%)  0.00 ( 88.90%)  0.03 (-19.18%)  0.01 ( 73.70%)  0.06 (-113.35%)

The second set of measurements simulates the worst possible conditions for batching by using numactl --interleave, so that there is in fact only one page per pagevec. Even in this case the series seems to improve performance thanks to reduced atomic operations and the removal of lru_add_drain().

timedmunlock   3.11-rc3  3.11-rc3  3.11-rc3  3.11-rc3  3.11-rc3  3.11-rc3  3.11-rc3  3.11-rc3
                      0         1         2         3         4         5         6         7
Elapsed min      4.00 ( 0.00%)  4.04 ( -0.93%)  3.87 ( 3.37%)  3.72 ( 6.94%)  3.81 ( 4.72%)  3.69 ( 7.82%)  3.64 ( 8.92%)  3.41 ( 14.81%)
Elapsed mean     4.17 ( 0.00%)  4.15 ( 0.51%)  4.03 ( 3.49%)  3.89 ( 6.84%)  3.86 ( 7.48%)  3.89 ( 6.69%)  3.70 ( 11.27%)  3.48 ( 16.59%)
Elapsed stddev   0.16 ( 0.00%)  0.08 ( 50.76%)  0.10 ( 41.58%)  0.16 ( 4.59%)  0.05 ( 72.38%)  0.19 (-12.91%)  0.05 ( 68.09%)  0.06 ( 66.03%)
Elapsed max      4.34 ( 0.00%)  4.32 ( 0.56%)  4.19 ( 3.62%)  4.12 ( 5.15%)  3.91 ( 9.88%)  4.12 ( 5.25%)  3.80 ( 12.58%)  3.56 ( 18.08%)
Elapsed range    0.34 ( 0.00%)  0.28 ( 17.91%)  0.32 ( 6.45%)  0.40 (-15.73%)  0.10 ( 70.06%)  0.43 (-24.84%)  0.15 ( 55.32%)  0.15 ( 56.16%)

For completeness, a third set of measurements shows the situation where THP is enabled and allocations are again done on a single NUMA node. Here munlock() is already very fast thanks to huge pages, and this series does not compromise that performance. It seems that the removal of the call to lru_add_drain() still helps a bit.

timedmunlock   3.11-rc3  3.11-rc3  3.11-rc3  3.11-rc3  3.11-rc3  3.11-rc3  3.11-rc3  3.11-rc3
                      0         1         2         3         4         5         6         7
Elapsed min      0.01 ( 0.00%)  0.01 ( -0.11%)  0.01 ( 6.59%)  0.01 ( 5.41%)  0.01 ( 5.45%)  0.01 ( 5.03%)  0.01 ( 6.08%)  0.01 ( 5.20%)
Elapsed mean     0.01 ( 0.00%)  0.01 ( -0.27%)  0.01 ( 6.39%)  0.01 ( 5.30%)  0.01 ( 5.32%)  0.01 ( 5.03%)  0.01 ( 5.97%)  0.01 ( 5.22%)
Elapsed stddev   0.00 ( 0.00%)  0.00 ( -9.59%)  0.00 ( 10.77%)  0.00 ( 3.24%)  0.00 ( 24.42%)  0.00 ( 31.86%)  0.00 ( -7.46%)  0.00 ( 6.11%)
Elapsed max      0.01 ( 0.00%)  0.01 ( -0.01%)  0.01 ( 6.83%)  0.01 ( 5.42%)  0.01 ( 5.79%)  0.01 ( 5.53%)  0.01 ( 6.08%)  0.01 ( 5.26%)
Elapsed range    0.00 ( 0.00%)  0.00 ( 7.30%)  0.00 ( 24.38%)  0.00 ( 6.10%)  0.00 ( 30.79%)  0.00 ( 42.52%)  0.00 ( 6.11%)  0.00 ( 10.07%)

This patch (of 7): In putback_lru_page(), since commit c53954a092 ("mm: remove lru parameter from __lru_cache_add and lru_cache_add_lru") it is no longer necessary to determine the lru list via page_lru_base_type(). This patch replaces it with a simple flag, is_unevictable, which says whether the page was put on the unevictable list. This is the only information that matters in subsequent tests.
Signed-off-by: Vlastimil Babka Reviewed-by: Jörn Engel Acked-by: Mel Gorman Cc: Michel Lespinasse Cc: Hugh Dickins Cc: Rik van Riel Cc: Johannes Weiner Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 758540d3ca83..44c072a7cba2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -545,7 +545,7 @@ int remove_mapping(struct address_space *mapping, struct page *page) */ void putback_lru_page(struct page *page) { - int lru; + bool is_unevictable; int was_unevictable = PageUnevictable(page); VM_BUG_ON(PageLRU(page)); @@ -560,14 +560,14 @@ redo: * unevictable page on [in]active list. * We know how to handle that. */ - lru = page_lru_base_type(page); + is_unevictable = false; lru_cache_add(page); } else { /* * Put unevictable pages directly on zone's unevictable * list. */ - lru = LRU_UNEVICTABLE; + is_unevictable = true; add_page_to_unevictable_list(page); /* * When racing with an mlock or AS_UNEVICTABLE clearing @@ -587,7 +587,7 @@ redo: * page is on unevictable list, it never be freed. To avoid that, * check after we added it to the list, again. */ - if (lru == LRU_UNEVICTABLE && page_evictable(page)) { + if (is_unevictable && page_evictable(page)) { if (!isolate_lru_page(page)) { put_page(page); goto redo; } /* */ } - if (was_unevictable && lru != LRU_UNEVICTABLE) + if (was_unevictable && !is_unevictable) count_vm_event(UNEVICTABLE_PGRESCUED); - else if (!was_unevictable && lru == LRU_UNEVICTABLE) + else if (!was_unevictable && is_unevictable) count_vm_event(UNEVICTABLE_PGCULLED); put_page(page); /* drop ref from isolate */ From 586a32ac1d33ce7a7548a27e4087e98842c3a06f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 11 Sep 2013 14:22:27 -0700 Subject: [PATCH 112/303] mm: munlock: remove unnecessary call to lru_add_drain() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In munlock_vma_range(), lru_add_drain() is currently called in a loop before each munlock_vma_page() call. This is suboptimal for performance when munlocking many pages. The benefits of the per-cpu pagevec for batching the LRU putback are removed, since the pagevec only holds at most one page from the previous loop's iteration. The lru_add_drain() call also does not serve any purpose for correctness - it does not even drain the pagevecs of all CPUs. The munlock code already expects and handles situations where a page cannot be isolated from the LRU (e.g. because it is on some per-cpu pagevec). The history of the (uncommented) call also suggests that it appears there as an oversight rather than intentionally. Before commit ff6a6da6 ("mm: accelerate munlock() treatment of THP pages") the call happened only once upon entering the function. That commit moved the call into the while loop. So while the other changes in the commit improved munlock performance for THP pages, it introduced the above-mentioned suboptimal per-cpu pagevec usage. Further back in history, before commit 408e82b7 ("mm: munlock use follow_page"), munlock_vma_pages_range() was just a wrapper around __mlock_vma_pages_range which performed both mlock and munlock depending on a flag. However, before ba470de4 ("mmap: handle mlocked pages during map, remap, unmap") the function handled only mlock, not munlock.
The lru_add_drain call thus comes from the implementation in commit b291f000 ("mlock: mlocked pages are unevictable") and was intended only for mlocking, not munlocking. The original intention of draining the LRU pagevec at mlock time was to ensure the pages were on the LRU before the lock operation so that they could be placed on the unevictable list immediately. There is very little motivation to do the same in the munlock path, particularly for every single page. This patch therefore removes the call completely. After removing the call, a 10% speedup was measured for munlock() of a 56GB large memory area with THP disabled. Signed-off-by: Vlastimil Babka Reviewed-by: Jörn Engel Acked-by: Mel Gorman Cc: Michel Lespinasse Cc: Hugh Dickins Cc: Rik van Riel Cc: Johannes Weiner Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/mlock.c b/mm/mlock.c index 79b7cf7d1bca..b85f1e827610 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -247,7 +247,6 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, &page_mask); if (page && !IS_ERR(page)) { lock_page(page); - lru_add_drain(); /* * Any THP page found by follow_page_mask() may have * gotten split before reaching munlock_vma_page(), From 7225522bb429a2f7dae6667e533e2d735b4882d0 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 11 Sep 2013 14:22:29 -0700 Subject: [PATCH 113/303] mm: munlock: batch non-THP page isolation and munlock+putback using pagevec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, munlock_vma_range() calls munlock_vma_page on each page in a loop, which results in repeated taking and releasing of the lru_lock spinlock for isolating pages one by one. This patch batches the munlock operations using an on-stack pagevec, so that isolation is done under a single lru_lock. For THP pages, the old behavior is preserved as they might be split while putting them into the pagevec. After this patch, a 9% speedup was measured for munlocking a 56GB large memory area with THP disabled. A new function __munlock_pagevec() is introduced that takes a pagevec and: 1) It clears PageMlocked and isolates all pages under the lru_lock. Zone page stats can also be updated using the variant which assumes disabled interrupts. 2) It finishes the munlock and lru putback on all pages under their lock_page. Note that previously, lock_page also covered the PageMlocked clearing and page isolation, but it is not needed for those operations. Signed-off-by: Vlastimil Babka Reviewed-by: Jörn Engel Acked-by: Mel Gorman Cc: Michel Lespinasse Cc: Hugh Dickins Cc: Rik van Riel Cc: Johannes Weiner Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 196 ++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 156 insertions(+), 40 deletions(-) diff --git a/mm/mlock.c b/mm/mlock.c index b85f1e827610..b3b4a78b7802 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -18,6 +19,8 @@ #include #include #include +#include +#include #include "internal.h" @@ -87,6 +90,47 @@ void mlock_vma_page(struct page *page) } } +/* + * Finish munlock after successful page isolation + * + * Page must be locked. This is a wrapper for try_to_munlock() + * and putback_lru_page() with munlock accounting.
+ */ +static void __munlock_isolated_page(struct page *page) +{ + int ret = SWAP_AGAIN; + + /* + * Optimization: if the page was mapped just once, that's our mapping + * and we don't need to check all the other vmas. + */ + if (page_mapcount(page) > 1) + ret = try_to_munlock(page); + + /* Did try_to_unlock() succeed or punt? */ + if (ret != SWAP_MLOCK) + count_vm_event(UNEVICTABLE_PGMUNLOCKED); + + putback_lru_page(page); +} + +/* + * Accounting for page isolation fail during munlock + * + * Performs accounting when page isolation fails in munlock. There is nothing + * else to do because it means some other task has already removed the page + * from the LRU. putback_lru_page() will take care of removing the page from + * the unevictable list, if necessary. vmscan [page_referenced()] will move + * the page back to the unevictable list if some other vma has it mlocked. + */ +static void __munlock_isolation_failed(struct page *page) +{ + if (PageUnevictable(page)) + count_vm_event(UNEVICTABLE_PGSTRANDED); + else + count_vm_event(UNEVICTABLE_PGMUNLOCKED); +} + /** * munlock_vma_page - munlock a vma page * @page - page to be unlocked @@ -112,37 +156,10 @@ unsigned int munlock_vma_page(struct page *page) unsigned int nr_pages = hpage_nr_pages(page); mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages); page_mask = nr_pages - 1; - if (!isolate_lru_page(page)) { - int ret = SWAP_AGAIN; - - /* - * Optimization: if the page was mapped just once, - * that's our mapping and we don't need to check all the - * other vmas. - */ - if (page_mapcount(page) > 1) - ret = try_to_munlock(page); - /* - * did try_to_unlock() succeed or punt? - */ - if (ret != SWAP_MLOCK) - count_vm_event(UNEVICTABLE_PGMUNLOCKED); - - putback_lru_page(page); - } else { - /* - * Some other task has removed the page from the LRU. - * putback_lru_page() will take care of removing the - * page from the unevictable list, if necessary. - * vmscan [page_referenced()] will move the page back - * to the unevictable list if some other vma has it - * mlocked. - */ - if (PageUnevictable(page)) - count_vm_event(UNEVICTABLE_PGSTRANDED); - else - count_vm_event(UNEVICTABLE_PGMUNLOCKED); - } + if (!isolate_lru_page(page)) + __munlock_isolated_page(page); + else + __munlock_isolation_failed(page); } return page_mask; @@ -209,6 +226,73 @@ static int __mlock_posix_error_return(long retval) return retval; } +/* + * Munlock a batch of pages from the same zone + * + * The work is split to two main phases. First phase clears the Mlocked flag + * and attempts to isolate the pages, all under a single zone lru lock. + * The second phase finishes the munlock only for pages where isolation + * succeeded. + * + * Note that pvec is modified during the process. Before returning + * pagevec_reinit() is called on it. 
+ */ +static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) +{ + int i; + int nr = pagevec_count(pvec); + + /* Phase 1: page isolation */ + spin_lock_irq(&zone->lru_lock); + for (i = 0; i < nr; i++) { + struct page *page = pvec->pages[i]; + + if (TestClearPageMlocked(page)) { + struct lruvec *lruvec; + int lru; + + /* we have disabled interrupts */ + __mod_zone_page_state(zone, NR_MLOCK, -1); + + if (PageLRU(page)) { + lruvec = mem_cgroup_page_lruvec(page, zone); + lru = page_lru(page); + + get_page(page); + ClearPageLRU(page); + del_page_from_lru_list(page, lruvec, lru); + } else { + __munlock_isolation_failed(page); + goto skip_munlock; + } + + } else { +skip_munlock: + /* + * We won't be munlocking this page in the next phase + * but we still need to release the follow_page_mask() + * pin. + */ + pvec->pages[i] = NULL; + put_page(page); + } + } + spin_unlock_irq(&zone->lru_lock); + + /* Phase 2: page munlock and putback */ + for (i = 0; i < nr; i++) { + struct page *page = pvec->pages[i]; + + if (page) { + lock_page(page); + __munlock_isolated_page(page); + unlock_page(page); + put_page(page); /* pin from follow_page_mask() */ + } + } + pagevec_reinit(pvec); +} + /* * munlock_vma_pages_range() - munlock all pages in the vma range.' * @vma - vma containing range to be munlock()ed. @@ -230,11 +314,16 @@ static int __mlock_posix_error_return(long retval) void munlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { + struct pagevec pvec; + struct zone *zone = NULL; + + pagevec_init(&pvec, 0); vma->vm_flags &= ~VM_LOCKED; while (start < end) { struct page *page; unsigned int page_mask, page_increm; + struct zone *pagezone; /* * Although FOLL_DUMP is intended for get_dump_page(), @@ -246,20 +335,47 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP, &page_mask); if (page && !IS_ERR(page)) { - lock_page(page); - /* - * Any THP page found by follow_page_mask() may have - * gotten split before reaching munlock_vma_page(), - * so we need to recompute the page_mask here. - */ - page_mask = munlock_vma_page(page); - unlock_page(page); - put_page(page); + pagezone = page_zone(page); + /* The whole pagevec must be in the same zone */ + if (pagezone != zone) { + if (pagevec_count(&pvec)) + __munlock_pagevec(&pvec, zone); + zone = pagezone; + } + if (PageTransHuge(page)) { + /* + * THP pages are not handled by pagevec due + * to their possible split (see below). + */ + if (pagevec_count(&pvec)) + __munlock_pagevec(&pvec, zone); + lock_page(page); + /* + * Any THP page found by follow_page_mask() may + * have gotten split before reaching + * munlock_vma_page(), so we need to recompute + * the page_mask here. + */ + page_mask = munlock_vma_page(page); + unlock_page(page); + put_page(page); /* follow_page_mask() */ + } else { + /* + * Non-huge pages are handled in batches + * via pagevec. The pin from + * follow_page_mask() prevents them from + * collapsing by THP. 
+ */ + if (pagevec_add(&pvec, page) == 0) + __munlock_pagevec(&pvec, zone); + } } page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); start += page_increm * PAGE_SIZE; cond_resched(); } + if (pagevec_count(&pvec)) + __munlock_pagevec(&pvec, zone); } /* From 1ebb7cc6a58321a4b22c4c9097b4651b0ab859d0 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 11 Sep 2013 14:22:30 -0700 Subject: [PATCH 114/303] mm: munlock: batch NR_MLOCK zone state updates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Building on the previous patch, which introduced batched isolation in munlock_vma_range(), we can also batch the updates of the NR_MLOCK page stats. After the whole pagevec is processed for page isolation, the stats are updated only once with the number of successful isolations. There were, however, no measurable performance gains. Signed-off-by: Vlastimil Babka Reviewed-by: Jörn Engel Acked-by: Mel Gorman Cc: Michel Lespinasse Cc: Hugh Dickins Cc: Rik van Riel Cc: Johannes Weiner Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/mlock.c b/mm/mlock.c index b3b4a78b7802..b1a7c8007c89 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -241,6 +241,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) { int i; int nr = pagevec_count(pvec); + int delta_munlocked = -nr; /* Phase 1: page isolation */ spin_lock_irq(&zone->lru_lock); @@ -251,9 +252,6 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) struct lruvec *lruvec; int lru; - /* we have disabled interrupts */ - __mod_zone_page_state(zone, NR_MLOCK, -1); - if (PageLRU(page)) { lruvec = mem_cgroup_page_lruvec(page, zone); lru = page_lru(page); @@ -275,8 +273,10 @@ skip_munlock: */ pvec->pages[i] = NULL; put_page(page); + delta_munlocked++; } } + __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); spin_unlock_irq(&zone->lru_lock); /* Phase 2: page munlock and putback */ From 56afe477df3cbbcd656682d0355ef7d9eb8bdd81 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 11 Sep 2013 14:22:32 -0700 Subject: [PATCH 115/303] mm: munlock: bypass per-cpu pvec for putback_lru_page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After introducing batching by pagevecs into munlock_vma_range(), we can further improve performance by bypassing the copying into the per-cpu pagevec and the get_page/put_page pair associated with that. Instead we perform LRU putback directly from our pagevec. However, this is possible only for single-mapped pages that are evictable after munlock. Unevictable pages require rechecking after putting them on the unevictable list, so for those we fall back to putback_lru_page(), which handles that. After this patch, a 13% speedup was measured for munlocking a 56GB large memory area with THP disabled.
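The fast path relies on the standard on-stack pagevec idiom; in rough, kernel-style sketch form (an illustration of the pattern, not code from the patch):

	/*
	 * Sketch: gather pages on an on-stack pagevec and hand the whole
	 * batch to the LRU at once, instead of going through the per-cpu
	 * pagevec one page at a time.
	 */
	static void putback_batch(struct page **pages, int nr)
	{
		struct pagevec pvec;
		int i;

		pagevec_init(&pvec, 0);
		for (i = 0; i < nr; i++) {
			/* pagevec_add() returns the space left; 0 means full */
			if (pagevec_add(&pvec, pages[i]) == 0)
				__pagevec_lru_add(&pvec); /* drains and reinits pvec */
		}
		if (pagevec_count(&pvec))
			__pagevec_lru_add(&pvec);
	}

Note that __pagevec_lru_add() consumes the page references via release_pages(), which is why the fast path below does not call put_page() explicitly for the pages it puts back.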
[akpm@linux-foundation.org:clarify comment] Signed-off-by: Vlastimil Babka Reviewed-by: Jörn Engel Acked-by: Mel Gorman Cc: Michel Lespinasse Cc: Hugh Dickins Cc: Rik van Riel Cc: Johannes Weiner Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 69 insertions(+), 4 deletions(-) diff --git a/mm/mlock.c b/mm/mlock.c index b1a7c8007c89..abdc612b042d 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -226,6 +226,52 @@ static int __mlock_posix_error_return(long retval) return retval; } +/* + * Prepare page for fast batched LRU putback via putback_lru_evictable_pagevec() + * + * The fast path is available only for evictable pages with single mapping. + * Then we can bypass the per-cpu pvec and get better performance. + * when mapcount > 1 we need try_to_munlock() which can fail. + * when !page_evictable(), we need the full redo logic of putback_lru_page to + * avoid leaving evictable page in unevictable list. + * + * In case of success, @page is added to @pvec and @pgrescued is incremented + * in case that the page was previously unevictable. @page is also unlocked. + */ +static bool __putback_lru_fast_prepare(struct page *page, struct pagevec *pvec, + int *pgrescued) +{ + VM_BUG_ON(PageLRU(page)); + VM_BUG_ON(!PageLocked(page)); + + if (page_mapcount(page) <= 1 && page_evictable(page)) { + pagevec_add(pvec, page); + if (TestClearPageUnevictable(page)) + (*pgrescued)++; + unlock_page(page); + return true; + } + + return false; +} + +/* + * Putback multiple evictable pages to the LRU + * + * Batched putback of evictable pages that bypasses the per-cpu pvec. Some of + * the pages might have meanwhile become unevictable but that is OK. 
+ */ +static void __putback_lru_fast(struct pagevec *pvec, int pgrescued) +{ + count_vm_events(UNEVICTABLE_PGMUNLOCKED, pagevec_count(pvec)); + /* + *__pagevec_lru_add() calls release_pages() so we don't call + * put_page() explicitly + */ + __pagevec_lru_add(pvec); + count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued); +} + /* * Munlock a batch of pages from the same zone * @@ -242,6 +288,8 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) int i; int nr = pagevec_count(pvec); int delta_munlocked = -nr; + struct pagevec pvec_putback; + int pgrescued = 0; /* Phase 1: page isolation */ spin_lock_irq(&zone->lru_lock); @@ -279,17 +327,34 @@ skip_munlock: __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked); spin_unlock_irq(&zone->lru_lock); - /* Phase 2: page munlock and putback */ + /* Phase 2: page munlock */ + pagevec_init(&pvec_putback, 0); for (i = 0; i < nr; i++) { struct page *page = pvec->pages[i]; if (page) { lock_page(page); - __munlock_isolated_page(page); - unlock_page(page); - put_page(page); /* pin from follow_page_mask() */ + if (!__putback_lru_fast_prepare(page, &pvec_putback, + &pgrescued)) { + /* Slow path */ + __munlock_isolated_page(page); + unlock_page(page); + } } } + + /* Phase 3: page putback for pages that qualified for the fast path */ + if (pagevec_count(&pvec_putback)) + __putback_lru_fast(&pvec_putback, pgrescued); + + /* Phase 4: put_page to return pin from follow_page_mask() */ + for (i = 0; i < nr; i++) { + struct page *page = pvec->pages[i]; + + if (page) + put_page(page); + } + pagevec_reinit(pvec); } From 5b40998ae35cf64561868370e6c9f3d3e94b6bf7 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 11 Sep 2013 14:22:33 -0700 Subject: [PATCH 116/303] mm: munlock: remove redundant get_page/put_page pair on the fast path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The performance of the fast path in munlock_vma_range() can be further improved by avoiding atomic ops of a redundant get_page()/put_page() pair. When calling get_page() during page isolation, we already have the pin from follow_page_mask(). This pin will be then returned by __pagevec_lru_add(), after which we do not reference the pages anymore. After this patch, an 8% speedup was measured for munlocking a 56GB large memory area with THP disabled. Signed-off-by: Vlastimil Babka Reviewed-by: Jörn Engel Acked-by: Mel Gorman Cc: Michel Lespinasse Cc: Hugh Dickins Cc: Rik van Riel Cc: Johannes Weiner Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/mm/mlock.c b/mm/mlock.c index abdc612b042d..19a934dce5d6 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -303,8 +303,10 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) if (PageLRU(page)) { lruvec = mem_cgroup_page_lruvec(page, zone); lru = page_lru(page); - - get_page(page); + /* + * We already have pin from follow_page_mask() + * so we can spare the get_page() here. + */ ClearPageLRU(page); del_page_from_lru_list(page, lruvec, lru); } else { @@ -336,25 +338,25 @@ skip_munlock: lock_page(page); if (!__putback_lru_fast_prepare(page, &pvec_putback, &pgrescued)) { - /* Slow path */ + /* + * Slow path. 
We don't want to lose the last + * pin before unlock_page() + */ + get_page(page); /* for putback_lru_page() */ __munlock_isolated_page(page); unlock_page(page); + put_page(page); /* from follow_page_mask() */ } } } - /* Phase 3: page putback for pages that qualified for the fast path */ + /* + * Phase 3: page putback for pages that qualified for the fast path + * This will also call put_page() to return pin from follow_page_mask() + */ if (pagevec_count(&pvec_putback)) __putback_lru_fast(&pvec_putback, pgrescued); - /* Phase 4: put_page to return pin from follow_page_mask() */ - for (i = 0; i < nr; i++) { - struct page *page = pvec->pages[i]; - - if (page) - put_page(page); - } - pagevec_reinit(pvec); } From 7a8010cd36273ff5f6fea5201ef9232f30cebbd9 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 11 Sep 2013 14:22:35 -0700 Subject: [PATCH 117/303] mm: munlock: manual pte walk in fast path instead of follow_page_mask() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently munlock_vma_pages_range() calls follow_page_mask() to obtain each individual struct page. This entails repeated full page table translations and page table lock taken for each page separately. This patch avoids the costly follow_page_mask() where possible, by iterating over ptes within single pmd under single page table lock. The first pte is obtained by get_locked_pte() for non-THP page acquired by the initial follow_page_mask(). The rest of the on-stack pagevec for munlock is filled up using pte_walk as long as pte_present() and vm_normal_page() are sufficient to obtain the struct page. After this patch, a 14% speedup was measured for munlocking a 56GB large memory area with THP disabled. Signed-off-by: Vlastimil Babka Cc: Jörn Engel Cc: Mel Gorman Cc: Michel Lespinasse Cc: Hugh Dickins Cc: Rik van Riel Cc: Johannes Weiner Cc: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 12 ++--- mm/mlock.c | 110 ++++++++++++++++++++++++++++++++------------- 2 files changed, 85 insertions(+), 37 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index dce24569f8fc..03f84b8d7359 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -643,12 +643,12 @@ static inline enum zone_type page_zonenum(const struct page *page) #endif /* - * The identification function is only used by the buddy allocator for - * determining if two pages could be buddies. We are not really - * identifying a zone since we could be using a the section number - * id if we have not node id available in page flags. - * We guarantee only that it will return the same value for two - * combinable pages in a zone. + * The identification function is mainly used by the buddy allocator for + * determining if two pages could be buddies. We are not really identifying + * the zone since we could be using the section number id if we do not have + * node id available in page flags. + * We only guarantee that it will return the same value for two combinable + * pages in a zone. */ static inline int page_zone_id(struct page *page) { diff --git a/mm/mlock.c b/mm/mlock.c index 19a934dce5d6..d63802663242 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -280,8 +280,7 @@ static void __putback_lru_fast(struct pagevec *pvec, int pgrescued) * The second phase finishes the munlock only for pages where isolation * succeeded. * - * Note that pvec is modified during the process. Before returning - * pagevec_reinit() is called on it. 
+ * Note that the pagevec may be modified during the process. */ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) { @@ -356,8 +355,60 @@ skip_munlock: */ if (pagevec_count(&pvec_putback)) __putback_lru_fast(&pvec_putback, pgrescued); +} - pagevec_reinit(pvec); +/* + * Fill up pagevec for __munlock_pagevec using pte walk + * + * The function expects that the struct page corresponding to @start address is + * a non-TPH page already pinned and in the @pvec, and that it belongs to @zone. + * + * The rest of @pvec is filled by subsequent pages within the same pmd and same + * zone, as long as the pte's are present and vm_normal_page() succeeds. These + * pages also get pinned. + * + * Returns the address of the next page that should be scanned. This equals + * @start + PAGE_SIZE when no page could be added by the pte walk. + */ +static unsigned long __munlock_pagevec_fill(struct pagevec *pvec, + struct vm_area_struct *vma, int zoneid, unsigned long start, + unsigned long end) +{ + pte_t *pte; + spinlock_t *ptl; + + /* + * Initialize pte walk starting at the already pinned page where we + * are sure that there is a pte. + */ + pte = get_locked_pte(vma->vm_mm, start, &ptl); + end = min(end, pmd_addr_end(start, end)); + + /* The page next to the pinned page is the first we will try to get */ + start += PAGE_SIZE; + while (start < end) { + struct page *page = NULL; + pte++; + if (pte_present(*pte)) + page = vm_normal_page(vma, start, *pte); + /* + * Break if page could not be obtained or the page's node+zone does not + * match + */ + if (!page || page_zone_id(page) != zoneid) + break; + + get_page(page); + /* + * Increase the address that will be returned *before* the + * eventual break due to pvec becoming full by adding the page + */ + start += PAGE_SIZE; + if (pagevec_add(pvec, page) == 0) + break; + } + pte_unmap_unlock(pte, ptl); + return start; } /* @@ -381,17 +432,16 @@ skip_munlock: void munlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - struct pagevec pvec; - struct zone *zone = NULL; - - pagevec_init(&pvec, 0); vma->vm_flags &= ~VM_LOCKED; while (start < end) { - struct page *page; + struct page *page = NULL; unsigned int page_mask, page_increm; - struct zone *pagezone; + struct pagevec pvec; + struct zone *zone; + int zoneid; + pagevec_init(&pvec, 0); /* * Although FOLL_DUMP is intended for get_dump_page(), * it just so happens that its special treatment of the @@ -400,22 +450,10 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, * has sneaked into the range, we won't oops here: great). */ page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP, - &page_mask); + &page_mask); + if (page && !IS_ERR(page)) { - pagezone = page_zone(page); - /* The whole pagevec must be in the same zone */ - if (pagezone != zone) { - if (pagevec_count(&pvec)) - __munlock_pagevec(&pvec, zone); - zone = pagezone; - } if (PageTransHuge(page)) { - /* - * THP pages are not handled by pagevec due - * to their possible split (see below). - */ - if (pagevec_count(&pvec)) - __munlock_pagevec(&pvec, zone); lock_page(page); /* * Any THP page found by follow_page_mask() may @@ -428,21 +466,31 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, put_page(page); /* follow_page_mask() */ } else { /* - * Non-huge pages are handled in batches - * via pagevec. The pin from - * follow_page_mask() prevents them from - * collapsing by THP. + * Non-huge pages are handled in batches via + * pagevec. 
The pin from follow_page_mask() + * prevents them from collapsing by THP. */ - if (pagevec_add(&pvec, page) == 0) - __munlock_pagevec(&pvec, zone); + pagevec_add(&pvec, page); + zone = page_zone(page); + zoneid = page_zone_id(page); + + /* + * Try to fill the rest of pagevec using fast + * pte walk. This will also update start to + * the next page to process. Then munlock the + * pagevec. + */ + start = __munlock_pagevec_fill(&pvec, vma, + zoneid, start, end); + __munlock_pagevec(&pvec, zone); + goto next; + } } page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); start += page_increm * PAGE_SIZE; +next: cond_resched(); } - if (pagevec_count(&pvec)) - __munlock_pagevec(&pvec, zone); } /* From 6e543d5780e36ff5ee56c44d7e2e30db3457a7ed Mon Sep 17 00:00:00 2001 From: Lisa Du Date: Wed, 11 Sep 2013 14:22:36 -0700 Subject: [PATCH 118/303] mm: vmscan: fix do_try_to_free_pages() livelock This patch is based on KOSAKI's work, with a little more description added; please refer to https://lkml.org/lkml/2012/6/14/74. I found that the system can enter a state where a zone has lots of free pages but only order-0 and order-1 ones, which means the zone is heavily fragmented; a high-order allocation can then stall for a long time (e.g. 60 seconds) in the direct reclaim path, especially in a no-swap, no-compaction environment. This problem happened on v3.4, but the issue still seems to live in the current tree. The reason is that do_try_to_free_pages() enters a livelock: kswapd will go to sleep if the zones have been fully scanned and are still not balanced, as kswapd thinks there is little point in trying all over again and wants to avoid an infinite loop. Instead it changes the order from high-order to order-0, because kswapd thinks order-0 is the most important. Look at commit 73ce02e9 for the details. If watermarks are ok, kswapd will go back to sleep and may leave zone->all_unreclaimable = 0. It assumes high-order users can still perform direct reclaim if they wish. Direct reclaim continues to reclaim for a high order which is not a COSTLY_ORDER, without invoking the oom-killer, until kswapd turns on zone->all_unreclaimable. This is to avoid a too-early oom-kill. So direct reclaim depends on kswapd to break this loop. In the worst case, direct reclaim may continue page reclaim forever while kswapd sleeps forever, until something like a watchdog detects the situation and finally kills the process. As described in: http://thread.gmane.org/gmane.linux.kernel.mm/103737 We can't turn on zone->all_unreclaimable from the direct reclaim path because the direct reclaim path doesn't take any lock, so that way is racy. Thus this patch removes the zone->all_unreclaimable field completely and recalculates the zone reclaimable state every time. Note: we can't take the approach of having direct reclaim look at zone->pages_scanned directly while kswapd continues to use zone->all_unreclaimable, because that is racy too; commit 929bea7c71 ("vmscan: all_unreclaimable() use zone->all_unreclaimable as a name") describes the details.
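In sketch form, the change of approach looks like this (simplified for illustration; give_up() is a hypothetical placeholder, and the real helpers are in the diff below):

	/*
	 * Before: a cached flag that in practice only kswapd sets. If
	 * kswapd stays asleep, nothing sets it and direct reclaim can
	 * loop forever waiting for it.
	 */
	if (zone->all_unreclaimable)
		give_up();

	/*
	 * After: every reclaimer recomputes the state from live
	 * counters. A zone counts as unreclaimable once it has been
	 * scanned six times over without pages being freed (freeing
	 * resets zone->pages_scanned).
	 */
	if (zone->pages_scanned >= zone_reclaimable_pages(zone) * 6)
		give_up();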
[akpm@linux-foundation.org: uninline zone_reclaimable_pages() and zone_reclaimable()] Cc: Aaditya Kumar Cc: Ying Han Cc: Nick Piggin Acked-by: Rik van Riel Cc: Mel Gorman Cc: KAMEZAWA Hiroyuki Cc: Christoph Lameter Cc: Bob Liu Cc: Neil Zhang Cc: Russell King - ARM Linux Reviewed-by: Michal Hocko Acked-by: Minchan Kim Acked-by: Johannes Weiner Signed-off-by: KOSAKI Motohiro Signed-off-by: Lisa Du Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 1 + include/linux/mmzone.h | 1 - include/linux/vmstat.h | 1 - mm/internal.h | 2 ++ mm/migrate.c | 2 +- mm/page-writeback.c | 3 ++ mm/page_alloc.c | 5 ++- mm/vmscan.c | 66 ++++++++++++++++++--------------------- mm/vmstat.c | 5 ++- 9 files changed, 44 insertions(+), 42 deletions(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 1397ccf81e91..cf55945c83fb 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -2,6 +2,7 @@ #define LINUX_MM_INLINE_H #include +#include /** * page_is_file_cache - should the page be on a file LRU or anon LRU? diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ac1ea796ec0f..bd791e452ad7 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -353,7 +353,6 @@ struct zone { * free areas of different sizes */ spinlock_t lock; - int all_unreclaimable; /* All pages pinned */ #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* Set to true when the PG_migrate_skip bits should be cleared */ bool compact_blockskip_flush; diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 502767f4e4d4..e4b948080d20 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -143,7 +143,6 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone, } extern unsigned long global_reclaimable_pages(void); -extern unsigned long zone_reclaimable_pages(struct zone *zone); #ifdef CONFIG_NUMA /* diff --git a/mm/internal.h b/mm/internal.h index 4390ac6c106e..684f7aa9692a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -85,6 +85,8 @@ extern unsigned long highest_memmap_pfn; */ extern int isolate_lru_page(struct page *page); extern void putback_lru_page(struct page *page); +extern unsigned long zone_reclaimable_pages(struct zone *zone); +extern bool zone_reclaimable(struct zone *zone); /* * in mm/rmap.c: diff --git a/mm/migrate.c b/mm/migrate.c index 61f14a1923fd..b7ded7eafe3a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1471,7 +1471,7 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat, if (!populated_zone(zone)) continue; - if (zone->all_unreclaimable) + if (!zone_reclaimable(zone)) continue; /* Avoid waking kswapd by allocating pages_to_migrate pages. */ diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d374b29296dd..3750431b3cd8 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -36,8 +36,11 @@ #include #include #include +#include #include +#include "internal.h" + /* * Sleep at most 200ms at a time in balance_dirty_pages(). 
*/ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7b1b706a1ffa..ff2782576e39 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include @@ -647,7 +648,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, int to_free = count; spin_lock(&zone->lock); - zone->all_unreclaimable = 0; zone->pages_scanned = 0; while (to_free) { @@ -696,7 +696,6 @@ static void free_one_page(struct zone *zone, struct page *page, int order, int migratetype) { spin_lock(&zone->lock); - zone->all_unreclaimable = 0; zone->pages_scanned = 0; __free_one_page(page, zone, order, migratetype); @@ -3164,7 +3163,7 @@ void show_free_areas(unsigned int filter) K(zone_page_state(zone, NR_FREE_CMA_PAGES)), K(zone_page_state(zone, NR_WRITEBACK_TEMP)), zone->pages_scanned, - (zone->all_unreclaimable ? "yes" : "no") + (!zone_reclaimable(zone) ? "yes" : "no") ); printk("lowmem_reserve[]:"); for (i = 0; i < MAX_NR_ZONES; i++) diff --git a/mm/vmscan.c b/mm/vmscan.c index 44c072a7cba2..fe715daeb8bc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -146,6 +146,25 @@ static bool global_reclaim(struct scan_control *sc) } #endif +unsigned long zone_reclaimable_pages(struct zone *zone) +{ + int nr; + + nr = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_FILE); + + if (get_nr_swap_pages() > 0) + nr += zone_page_state(zone, NR_ACTIVE_ANON) + + zone_page_state(zone, NR_INACTIVE_ANON); + + return nr; +} + +bool zone_reclaimable(struct zone *zone) +{ + return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; +} + static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) { if (!mem_cgroup_disabled()) @@ -1789,7 +1808,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * latencies, so it's better to scan a minimum amount there as * well. */ - if (current_is_kswapd() && zone->all_unreclaimable) + if (current_is_kswapd() && !zone_reclaimable(zone)) force_scan = true; if (!global_reclaim(sc)) force_scan = true; @@ -2244,8 +2263,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) if (global_reclaim(sc)) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; - if (zone->all_unreclaimable && - sc->priority != DEF_PRIORITY) + if (sc->priority != DEF_PRIORITY && + !zone_reclaimable(zone)) continue; /* Let kswapd poll it */ if (IS_ENABLED(CONFIG_COMPACTION)) { /* @@ -2283,11 +2302,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) return aborted_reclaim; } -static bool zone_reclaimable(struct zone *zone) -{ - return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; -} - /* All zones in zonelist are unreclaimable? */ static bool all_unreclaimable(struct zonelist *zonelist, struct scan_control *sc) @@ -2301,7 +2315,7 @@ static bool all_unreclaimable(struct zonelist *zonelist, continue; if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; - if (!zone->all_unreclaimable) + if (zone_reclaimable(zone)) return false; } @@ -2712,7 +2726,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) * DEF_PRIORITY. Effectively, it considers them balanced so * they must be considered balanced here as well! 
*/ - if (zone->all_unreclaimable) { + if (!zone_reclaimable(zone)) { balanced_pages += zone->managed_pages; continue; } @@ -2773,7 +2787,6 @@ static bool kswapd_shrink_zone(struct zone *zone, unsigned long lru_pages, unsigned long *nr_attempted) { - unsigned long nr_slab; int testorder = sc->order; unsigned long balance_gap; struct reclaim_state *reclaim_state = current->reclaim_state; @@ -2818,15 +2831,12 @@ static bool kswapd_shrink_zone(struct zone *zone, shrink_zone(zone, sc); reclaim_state->reclaimed_slab = 0; - nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages); + shrink_slab(&shrink, sc->nr_scanned, lru_pages); sc->nr_reclaimed += reclaim_state->reclaimed_slab; /* Account for the number of pages attempted to reclaim */ *nr_attempted += sc->nr_to_reclaim; - if (nr_slab == 0 && !zone_reclaimable(zone)) - zone->all_unreclaimable = 1; - zone_clear_flag(zone, ZONE_WRITEBACK); /* @@ -2835,7 +2845,7 @@ static bool kswapd_shrink_zone(struct zone *zone, * BDIs but as pressure is relieved, speculatively avoid congestion * waits. */ - if (!zone->all_unreclaimable && + if (zone_reclaimable(zone) && zone_balanced(zone, testorder, 0, classzone_idx)) { zone_clear_flag(zone, ZONE_CONGESTED); zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); @@ -2901,8 +2911,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, if (!populated_zone(zone)) continue; - if (zone->all_unreclaimable && - sc.priority != DEF_PRIORITY) + if (sc.priority != DEF_PRIORITY && + !zone_reclaimable(zone)) continue; /* @@ -2980,8 +2990,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, if (!populated_zone(zone)) continue; - if (zone->all_unreclaimable && - sc.priority != DEF_PRIORITY) + if (sc.priority != DEF_PRIORITY && + !zone_reclaimable(zone)) continue; sc.nr_scanned = 0; @@ -3265,20 +3275,6 @@ unsigned long global_reclaimable_pages(void) return nr; } -unsigned long zone_reclaimable_pages(struct zone *zone) -{ - int nr; - - nr = zone_page_state(zone, NR_ACTIVE_FILE) + - zone_page_state(zone, NR_INACTIVE_FILE); - - if (get_nr_swap_pages() > 0) - nr += zone_page_state(zone, NR_ACTIVE_ANON) + - zone_page_state(zone, NR_INACTIVE_ANON); - - return nr; -} - #ifdef CONFIG_HIBERNATION /* * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of @@ -3576,7 +3572,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) return ZONE_RECLAIM_FULL; - if (zone->all_unreclaimable) + if (!zone_reclaimable(zone)) return ZONE_RECLAIM_FULL; /* diff --git a/mm/vmstat.c b/mm/vmstat.c index d57a09143bf9..9bb314577911 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -19,6 +19,9 @@ #include #include #include +#include + +#include "internal.h" #ifdef CONFIG_VM_EVENT_COUNTERS DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; @@ -1088,7 +1091,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n all_unreclaimable: %u" "\n start_pfn: %lu" "\n inactive_ratio: %u", - zone->all_unreclaimable, + !zone_reclaimable(zone), zone->zone_start_pfn, zone->inactive_ratio); seq_putc(m, '\n'); From 187320932dcece9c4b93f38f56d1f888bd5c325f Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 11 Sep 2013 14:22:38 -0700 Subject: [PATCH 119/303] mm/sparse: introduce alloc_usemap_and_memmap After commit 9bdac9142407 ("sparsemem: Put mem map for one node together."), vmemmap for one node will be allocated together, its logic is similar as memory allocation for pageblock flags. 
This patch introduces alloc_usemap_and_memmap to extract the same logic of memory alloction for pageblock flags and vmemmap. Signed-off-by: Wanpeng Li Cc: Dave Hansen Cc: Rik van Riel Cc: Fengguang Wu Cc: Joonsoo Kim Cc: Johannes Weiner Cc: Tejun Heo Cc: Yasuaki Ishimatsu Cc: David Rientjes Cc: KOSAKI Motohiro Cc: Jiri Kosina Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 147 +++++++++++++++++++++++----------------------------- 1 file changed, 64 insertions(+), 83 deletions(-) diff --git a/mm/sparse.c b/mm/sparse.c index 308d50331bc3..4ac1d7ef548f 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -339,13 +339,14 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) } #endif /* CONFIG_MEMORY_HOTREMOVE */ -static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map, +static void __init sparse_early_usemaps_alloc_node(void *data, unsigned long pnum_begin, unsigned long pnum_end, unsigned long usemap_count, int nodeid) { void *usemap; unsigned long pnum; + unsigned long **usemap_map = (unsigned long **)data; int size = usemap_size(); usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), @@ -430,11 +431,12 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ #ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER -static void __init sparse_early_mem_maps_alloc_node(struct page **map_map, +static void __init sparse_early_mem_maps_alloc_node(void *data, unsigned long pnum_begin, unsigned long pnum_end, unsigned long map_count, int nodeid) { + struct page **map_map = (struct page **)data; sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end, map_count, nodeid); } @@ -460,88 +462,18 @@ void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) { } -/* - * Allocate the accumulated non-linear sections, allocate a mem_map - * for each and record the physical to section mapping. +/** + * alloc_usemap_and_memmap - memory alloction for pageblock flags and vmemmap + * @map: usemap_map for pageblock flags or mmap_map for vmemmap */ -void __init sparse_init(void) +static void __init alloc_usemap_and_memmap(void (*alloc_func) + (void *, unsigned long, unsigned long, + unsigned long, int), void *data) { unsigned long pnum; - struct page *map; - unsigned long *usemap; - unsigned long **usemap_map; - int size; + unsigned long map_count; int nodeid_begin = 0; unsigned long pnum_begin = 0; - unsigned long usemap_count; -#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER - unsigned long map_count; - int size2; - struct page **map_map; -#endif - - /* see include/linux/mmzone.h 'struct mem_section' definition */ - BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section))); - - /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */ - set_pageblock_order(); - - /* - * map is using big page (aka 2M in x86 64 bit) - * usemap is less one page (aka 24 bytes) - * so alloc 2M (with 2M align) and 24 bytes in turn will - * make next 2M slip to one more 2M later. - * then in big system, the memory will have a lot of holes... - * here try to allocate 2M pages continuously. - * - * powerpc need to call sparse_init_one_section right after each - * sparse_early_mem_map_alloc, so allocate usemap_map at first. 
- */ - size = sizeof(unsigned long *) * NR_MEM_SECTIONS; - usemap_map = alloc_bootmem(size); - if (!usemap_map) - panic("can not allocate usemap_map\n"); - - for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { - struct mem_section *ms; - - if (!present_section_nr(pnum)) - continue; - ms = __nr_to_section(pnum); - nodeid_begin = sparse_early_nid(ms); - pnum_begin = pnum; - break; - } - usemap_count = 1; - for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) { - struct mem_section *ms; - int nodeid; - - if (!present_section_nr(pnum)) - continue; - ms = __nr_to_section(pnum); - nodeid = sparse_early_nid(ms); - if (nodeid == nodeid_begin) { - usemap_count++; - continue; - } - /* ok, we need to take cake of from pnum_begin to pnum - 1*/ - sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, pnum, - usemap_count, nodeid_begin); - /* new start, update count etc*/ - nodeid_begin = nodeid; - pnum_begin = pnum; - usemap_count = 1; - } - /* ok, last chunk */ - sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, NR_MEM_SECTIONS, - usemap_count, nodeid_begin); - -#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER - size2 = sizeof(struct page *) * NR_MEM_SECTIONS; - map_map = alloc_bootmem(size2); - if (!map_map) - panic("can not allocate map_map\n"); for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { struct mem_section *ms; @@ -567,16 +499,65 @@ void __init sparse_init(void) continue; } /* ok, we need to take cake of from pnum_begin to pnum - 1*/ - sparse_early_mem_maps_alloc_node(map_map, pnum_begin, pnum, - map_count, nodeid_begin); + alloc_func(data, pnum_begin, pnum, + map_count, nodeid_begin); /* new start, update count etc*/ nodeid_begin = nodeid; pnum_begin = pnum; map_count = 1; } /* ok, last chunk */ - sparse_early_mem_maps_alloc_node(map_map, pnum_begin, NR_MEM_SECTIONS, - map_count, nodeid_begin); + alloc_func(data, pnum_begin, NR_MEM_SECTIONS, + map_count, nodeid_begin); +} + +/* + * Allocate the accumulated non-linear sections, allocate a mem_map + * for each and record the physical to section mapping. + */ +void __init sparse_init(void) +{ + unsigned long pnum; + struct page *map; + unsigned long *usemap; + unsigned long **usemap_map; + int size; +#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER + int size2; + struct page **map_map; +#endif + + /* see include/linux/mmzone.h 'struct mem_section' definition */ + BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section))); + + /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */ + set_pageblock_order(); + + /* + * map is using big page (aka 2M in x86 64 bit) + * usemap is less one page (aka 24 bytes) + * so alloc 2M (with 2M align) and 24 bytes in turn will + * make next 2M slip to one more 2M later. + * then in big system, the memory will have a lot of holes... + * here try to allocate 2M pages continuously. + * + * powerpc need to call sparse_init_one_section right after each + * sparse_early_mem_map_alloc, so allocate usemap_map at first. 
+ */ + size = sizeof(unsigned long *) * NR_MEM_SECTIONS; + usemap_map = alloc_bootmem(size); + if (!usemap_map) + panic("can not allocate usemap_map\n"); + alloc_usemap_and_memmap(sparse_early_usemaps_alloc_node, + (void *)usemap_map); + +#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER + size2 = sizeof(struct page *) * NR_MEM_SECTIONS; + map_map = alloc_bootmem(size2); + if (!map_map) + panic("can not allocate map_map\n"); + alloc_usemap_and_memmap(sparse_early_mem_maps_alloc_node, + (void *)map_map); #endif for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { From 7d9f073b8da45a894bb7148433bd84d21eed6757 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 11 Sep 2013 14:22:40 -0700 Subject: [PATCH 120/303] mm/writeback: make writeback_inodes_wb static It's not used globally, so it can be static. Signed-off-by: Wanpeng Li Cc: Dave Hansen Cc: Rik van Riel Cc: Fengguang Wu Cc: Joonsoo Kim Cc: Johannes Weiner Cc: Tejun Heo Cc: Yasuaki Ishimatsu Cc: David Rientjes Cc: KOSAKI Motohiro Cc: Jiri Kosina Cc: Wanpeng Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 2 +- include/linux/writeback.h | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 87d778118027..54b3c31c2f0d 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -723,7 +723,7 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb, return wrote; } -long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, +static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, enum wb_reason reason) { struct wb_writeback_work work = { diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 4e198ca1f685..021b8a319b9e 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -98,8 +98,6 @@ int try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason); int try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr, enum wb_reason reason); void sync_inodes_sb(struct super_block *); -long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, - enum wb_reason reason); void wakeup_flusher_threads(long nr_pages, enum wb_reason reason); void inode_wait_for_writeback(struct inode *inode); From 762216ab4e175f49d17bc7ad778c57b9028184e6 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 11 Sep 2013 14:22:42 -0700 Subject: [PATCH 121/303] mm/vmalloc: use wrapper function get_vm_area_size to calculate size of vm area Use the wrapper function get_vm_area_size() to calculate the size of a vm area.
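For reference, the wrapper being switched to is essentially the following (as it stood in include/linux/vmalloc.h around this time); the point of the conversion is that the guard-page subtraction lives in one place instead of being open-coded as area->size - PAGE_SIZE at each call site:

	static inline size_t get_vm_area_size(const struct vm_struct *area)
	{
		/* vm_struct::size includes the trailing guard page */
		return area->size - PAGE_SIZE;
	}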
Signed-off-by: Wanpeng Li Cc: Dave Hansen Cc: Rik van Riel Cc: Fengguang Wu Cc: Joonsoo Kim Cc: Johannes Weiner Cc: Tejun Heo Cc: Yasuaki Ishimatsu Cc: David Rientjes Cc: KOSAKI Motohiro Cc: Jiri Kosina Cc: Wanpeng Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 93d3182c3300..107454312d5e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1258,7 +1258,7 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) { unsigned long addr = (unsigned long)area->addr; - unsigned long end = addr + area->size - PAGE_SIZE; + unsigned long end = addr + get_vm_area_size(area); int err; err = vmap_page_range(addr, end, prot, *pages); @@ -1553,7 +1553,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, unsigned int nr_pages, array_size, i; gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; - nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT; + nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; array_size = (nr_pages * sizeof(struct page *)); area->nr_pages = nr_pages; @@ -1985,7 +1985,7 @@ long vread(char *buf, char *addr, unsigned long count) vm = va->vm; vaddr = (char *) vm->addr; - if (addr >= vaddr + vm->size - PAGE_SIZE) + if (addr >= vaddr + get_vm_area_size(vm)) continue; while (addr < vaddr) { if (count == 0) @@ -1995,7 +1995,7 @@ long vread(char *buf, char *addr, unsigned long count) addr++; count--; } - n = vaddr + vm->size - PAGE_SIZE - addr; + n = vaddr + get_vm_area_size(vm) - addr; if (n > count) n = count; if (!(vm->flags & VM_IOREMAP)) @@ -2067,7 +2067,7 @@ long vwrite(char *buf, char *addr, unsigned long count) vm = va->vm; vaddr = (char *) vm->addr; - if (addr >= vaddr + vm->size - PAGE_SIZE) + if (addr >= vaddr + get_vm_area_size(vm)) continue; while (addr < vaddr) { if (count == 0) @@ -2076,7 +2076,7 @@ long vwrite(char *buf, char *addr, unsigned long count) addr++; count--; } - n = vaddr + vm->size - PAGE_SIZE - addr; + n = vaddr + get_vm_area_size(vm) - addr; if (n > count) n = count; if (!(vm->flags & VM_IOREMAP)) { From 1ecfd533f4c528b0b4cc5bc115c4c47f0b5e4828 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Wed, 11 Sep 2013 14:22:43 -0700 Subject: [PATCH 122/303] mm/mremap.c: call pud_free() after fail calling pmd_alloc() In alloc_new_pmd(), if pud_alloc() was called successfully, but pmd_alloc() fails, avoid leaking `pud'. Signed-off-by: Chen Gang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mremap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/mremap.c b/mm/mremap.c index 0843feb66f3d..91b13d6a16d4 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -25,6 +25,7 @@ #include #include #include +#include #include "internal.h" @@ -62,8 +63,10 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, return NULL; pmd = pmd_alloc(mm, pud, addr); - if (!pmd) + if (!pmd) { + pud_free(mm, pud); return NULL; + } VM_BUG_ON(pmd_trans_huge(*pmd)); From 4c3bffc272755c98728c2b58b1a8148cf9e9fd1f Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Wed, 11 Sep 2013 14:22:44 -0700 Subject: [PATCH 123/303] mm/backing-dev.c: check user buffer length before copying data to the related user buffer '*lenp' may be less than "sizeof(kbuf)" so we must check this before the next copy_to_user(). 
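To make the failure mode concrete, here is a minimal sketch of the handler pattern (hypothetical handler name, simplified from the real code): without the *lenp bound, a 1-byte user buffer would be asked to receive sizeof(kbuf) == 3 bytes.

	/*
	 * Sketch (hypothetical handler): a sysctl read handler that
	 * returns a fixed string must bound its copy by *lenp, the
	 * caller-supplied buffer length.
	 */
	static int dummy_proc_handler(struct ctl_table *table, int write,
				      void __user *buffer, size_t *lenp,
				      loff_t *ppos)
	{
		char kbuf[] = "0\n";

		if (*ppos || *lenp < sizeof(kbuf)) {
			*lenp = 0;	/* nothing (more) to report */
			return 0;
		}
		if (copy_to_user(buffer, kbuf, sizeof(kbuf)))
			return -EFAULT;

		*lenp = sizeof(kbuf);
		*ppos += *lenp;
		return 0;
	}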
pdflush_proc_obsolete() is called by sysctl for the 'procname' "nr_pdflush_threads"; if the user passes a buffer length less than sizeof(kbuf), it will cause an issue. Signed-off-by: Chen Gang Reviewed-by: Jan Kara Cc: Tejun Heo Cc: Jeff Moyer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/backing-dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 37d9edcd14cf..ce682f7a4f29 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -652,7 +652,7 @@ int pdflush_proc_obsolete(struct ctl_table *table, int write, { char kbuf[] = "0\n"; - if (*ppos) { + if (*ppos || *lenp < sizeof(kbuf)) { *lenp = 0; return 0; } From 5a53748568f79641eaf40e41081a2f4987f005c2 Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Wed, 11 Sep 2013 14:22:46 -0700 Subject: [PATCH 124/303] mm/page-writeback.c: add strictlimit feature The feature prevents mistrusted filesystems (i.e. FUSE mounts created by unprivileged users) from growing a large number of dirty pages before throttling. For such filesystems balance_dirty_pages always checks bdi counters against bdi limits. I.e. even if global "nr_dirty" is under "freerun", it's not allowed to skip bdi checks. The only use case for now is fuse: it sets bdi max_ratio to 1% by default and system administrators are supposed to expect that this limit won't be exceeded. The feature is on if a BDI is marked with the BDI_CAP_STRICTLIMIT flag. A filesystem may set the flag when it initializes its BDI. The problematic scenario comes from the fact that nobody pays attention to the NR_WRITEBACK_TEMP counter (i.e. the number of pages under fuse writeback). The implementation of fuse writeback releases the original page (by calling end_page_writeback) almost immediately. A fuse request queued for real processing bears a copy of the original page. Hence, if the userspace fuse daemon doesn't finalize write requests in a timely manner, an aggressive mmap writer can pollute virtually all memory with those temporary fuse page copies. They are carefully accounted in NR_WRITEBACK_TEMP, but nobody cares. To make further explanations shorter, let me use "NR_WRITEBACK_TEMP problem" as a shortcut for "the possibility of uncontrolled growth of the amount of RAM consumed by temporary pages allocated by kernel fuse to process writeback". The problem was very easy to reproduce. There is a trivial example filesystem implementation in the fuse userspace distribution: fusexmp_fh.c. I added "sleep(1);" to the write methods, then recompiled and mounted it. Then I created a huge file on the mount point and ran a simple program which mmap-ed the file to a memory region, then wrote data to the region. An hour later I observed almost all RAM consumed by fuse writeback. Since then some unrelated changes in kernel fuse have made it more difficult to reproduce, but it is still possible now. Putting this theoretical happens-in-the-lab thing aside, there is another thing that really hurts real-world (FUSE) users. This is the write-through page cache policy FUSE currently uses. I.e. when handling write(2), kernel fuse populates the page cache and flushes user data to the server synchronously. This is excessively suboptimal. Pavel Emelyanov's patches ("writeback cache policy") solve the problem, but they also make resolving the NR_WRITEBACK_TEMP problem absolutely necessary. Otherwise, simply copying a huge file to a fuse mount would result in memory starvation. Miklos, the maintainer of FUSE, believes the strictlimit feature is the way to go.
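To show how a filesystem opts in, here is a minimal sketch of BDI initialization (mirroring the fuse hunk in this patch; "myfs" and the surrounding function are hypothetical):

	static int myfs_bdi_init(struct backing_dev_info *bdi)
	{
		bdi->name = "myfs";
		/* opt this BDI into per-bdi strict dirty limiting */
		bdi->capabilities |= BDI_CAP_STRICTLIMIT;
		return bdi_init(bdi);
	}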
And eventually putting FUSE topics aside, there is one more use-case for strictlimit feature. Using a slow USB stick (mass storage) in a machine with huge amount of RAM installed is a well-known pain. Let's make simple computations. Assuming 64GB of RAM installed, existing implementation of balance_dirty_pages will start throttling only after 9.6GB of RAM becomes dirty (freerun == 15% of total RAM). So, the command "cp 9GB_file /media/my-usb-storage/" may return in a few seconds, but subsequent "umount /media/my-usb-storage/" will take more than two hours if effective throughput of the storage is, to say, 1MB/sec. After inclusion of strictlimit feature, it will be trivial to add a knob (e.g. /sys/devices/virtual/bdi/x:y/strictlimit) to enable it on demand. Manually or via udev rule. May be I'm wrong, but it seems to be quite a natural desire to limit the amount of dirty memory for some devices we are not fully trust (in the sense of sustainable throughput). [akpm@linux-foundation.org: fix warning in page-writeback.c] Signed-off-by: Maxim Patlasov Cc: Jan Kara Cc: Miklos Szeredi Cc: Wu Fengguang Cc: Pavel Emelyanov Cc: James Bottomley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/inode.c | 2 +- include/linux/backing-dev.h | 3 + mm/page-writeback.c | 265 +++++++++++++++++++++++++++--------- 3 files changed, 207 insertions(+), 63 deletions(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index e0fe703ee3d6..84434594e80e 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -930,7 +930,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) fc->bdi.name = "fuse"; fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; /* fuse does it's own writeback accounting */ - fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB; + fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT; err = bdi_init(&fc->bdi); if (err) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index c3881553f7d1..5f66d519a726 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -243,6 +243,8 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); * BDI_CAP_EXEC_MAP: Can be mapped for execution * * BDI_CAP_SWAP_BACKED: Count shmem/tmpfs objects as swap-backed. + * + * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold. 
*/ #define BDI_CAP_NO_ACCT_DIRTY 0x00000001 #define BDI_CAP_NO_WRITEBACK 0x00000002 @@ -254,6 +256,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); #define BDI_CAP_NO_ACCT_WB 0x00000080 #define BDI_CAP_SWAP_BACKED 0x00000100 #define BDI_CAP_STABLE_WRITES 0x00000200 +#define BDI_CAP_STRICTLIMIT 0x00000400 #define BDI_CAP_VMFLAGS \ (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3750431b3cd8..6c7b0187be8e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -584,6 +584,37 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) return bdi_dirty; } +/* + * setpoint - dirty 3 + * f(dirty) := 1.0 + (----------------) + * limit - setpoint + * + * it's a 3rd order polynomial that subjects to + * + * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast + * (2) f(setpoint) = 1.0 => the balance point + * (3) f(limit) = 0 => the hard limit + * (4) df/dx <= 0 => negative feedback control + * (5) the closer to setpoint, the smaller |df/dx| (and the reverse) + * => fast response on large errors; small oscillation near setpoint + */ +static inline long long pos_ratio_polynom(unsigned long setpoint, + unsigned long dirty, + unsigned long limit) +{ + long long pos_ratio; + long x; + + x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, + limit - setpoint + 1); + pos_ratio = x; + pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; + pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; + pos_ratio += 1 << RATELIMIT_CALC_SHIFT; + + return clamp(pos_ratio, 0LL, 2LL << RATELIMIT_CALC_SHIFT); +} + /* * Dirty position control. * @@ -682,26 +713,80 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, /* * global setpoint * - * setpoint - dirty 3 - * f(dirty) := 1.0 + (----------------) - * limit - setpoint - * - * it's a 3rd order polynomial that subjects to - * - * (1) f(freerun) = 2.0 => rampup dirty_ratelimit reasonably fast - * (2) f(setpoint) = 1.0 => the balance point - * (3) f(limit) = 0 => the hard limit - * (4) df/dx <= 0 => negative feedback control - * (5) the closer to setpoint, the smaller |df/dx| (and the reverse) - * => fast response on large errors; small oscillation near setpoint + * See comment for pos_ratio_polynom(). */ setpoint = (freerun + limit) / 2; - x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT, - limit - setpoint + 1); - pos_ratio = x; - pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; - pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT; - pos_ratio += 1 << RATELIMIT_CALC_SHIFT; + pos_ratio = pos_ratio_polynom(setpoint, dirty, limit); + + /* + * The strictlimit feature is a tool preventing mistrusted filesystems + * from growing a large number of dirty pages before throttling. For + * such filesystems balance_dirty_pages always checks bdi counters + * against bdi limits. Even if global "nr_dirty" is under "freerun". + * This is especially important for fuse which sets bdi->max_ratio to + * 1% by default. Without strictlimit feature, fuse writeback may + * consume arbitrary amount of RAM because it is accounted in + * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty". + * + * Here, in bdi_position_ratio(), we calculate pos_ratio based on + * two values: bdi_dirty and bdi_thresh. Let's consider an example: + * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global + * limits are set by default to 10% and 20% (background and throttle). 
+ * Then bdi_thresh is 1% of 20% of 16GB. This amounts to ~8K pages. + * bdi_dirty_limit(bdi, bg_thresh) is about ~4K pages. bdi_setpoint is + * about ~6K pages (as the average of background and throttle bdi + * limits). The 3rd order polynomial will provide positive feedback if + * bdi_dirty is under bdi_setpoint and vice versa. + * + * Note, that we cannot use global counters in these calculations + * because we want to throttle process writing to a strictlimit BDI + * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB + * in the example above). + */ + if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) { + long long bdi_pos_ratio; + unsigned long bdi_bg_thresh; + + if (bdi_dirty < 8) + return min_t(long long, pos_ratio * 2, + 2 << RATELIMIT_CALC_SHIFT); + + if (bdi_dirty >= bdi_thresh) + return 0; + + bdi_bg_thresh = div_u64((u64)bdi_thresh * bg_thresh, thresh); + bdi_setpoint = dirty_freerun_ceiling(bdi_thresh, + bdi_bg_thresh); + + if (bdi_setpoint == 0 || bdi_setpoint == bdi_thresh) + return 0; + + bdi_pos_ratio = pos_ratio_polynom(bdi_setpoint, bdi_dirty, + bdi_thresh); + + /* + * Typically, for strictlimit case, bdi_setpoint << setpoint + * and pos_ratio >> bdi_pos_ratio. In the other words global + * state ("dirty") is not limiting factor and we have to + * make decision based on bdi counters. But there is an + * important case when global pos_ratio should get precedence: + * global limits are exceeded (e.g. due to activities on other + * BDIs) while given strictlimit BDI is below limit. + * + * "pos_ratio * bdi_pos_ratio" would work for the case above, + * but it would look too non-natural for the case of all + * activity in the system coming from a single strictlimit BDI + * with bdi->max_ratio == 100%. + * + * Note that min() below somewhat changes the dynamics of the + * control system. Normally, pos_ratio value can be well over 3 + * (when globally we are at freerun and bdi is well below bdi + * setpoint). Now the maximum pos_ratio in the same situation + * is 2. We might want to tweak this if we observe the control + * system is too slow to adapt. + */ + return min(pos_ratio, bdi_pos_ratio); + } /* * We have computed basic pos_ratio above based on global situation. If @@ -994,6 +1079,27 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, * keep that period small to reduce time lags). */ step = 0; + + /* + * For strictlimit case, calculations above were based on bdi counters + * and limits (starting from pos_ratio = bdi_position_ratio() and up to + * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate). + * Hence, to calculate "step" properly, we have to use bdi_dirty as + * "dirty" and bdi_setpoint as "setpoint". + * + * We rampup dirty_ratelimit forcibly if bdi_dirty is low because + * it's possible that bdi_thresh is close to zero due to inactivity + * of backing device (see the implementation of bdi_dirty_limit()). + */ + if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) { + dirty = bdi_dirty; + if (bdi_dirty < 8) + setpoint = bdi_dirty + 1; + else + setpoint = (bdi_thresh + + bdi_dirty_limit(bdi, bg_thresh)) / 2; + } + if (dirty < setpoint) { x = min(bdi->balanced_dirty_ratelimit, min(balanced_dirty_ratelimit, task_ratelimit)); @@ -1198,6 +1304,56 @@ static long bdi_min_pause(struct backing_dev_info *bdi, return pages >= DIRTY_POLL_THRESH ? 
1 + t / 2 : t; } +static inline void bdi_dirty_limits(struct backing_dev_info *bdi, + unsigned long dirty_thresh, + unsigned long background_thresh, + unsigned long *bdi_dirty, + unsigned long *bdi_thresh, + unsigned long *bdi_bg_thresh) +{ + unsigned long bdi_reclaimable; + + /* + * bdi_thresh is not treated as some limiting factor as + * dirty_thresh, due to reasons + * - in JBOD setup, bdi_thresh can fluctuate a lot + * - in a system with HDD and USB key, the USB key may somehow + * go into state (bdi_dirty >> bdi_thresh) either because + * bdi_dirty starts high, or because bdi_thresh drops low. + * In this case we don't want to hard throttle the USB key + * dirtiers for 100 seconds until bdi_dirty drops under + * bdi_thresh. Instead the auxiliary bdi control line in + * bdi_position_ratio() will let the dirtier task progress + * at some rate <= (write_bw / 2) for bringing down bdi_dirty. + */ + *bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); + + if (bdi_bg_thresh) + *bdi_bg_thresh = div_u64((u64)*bdi_thresh * + background_thresh, + dirty_thresh); + + /* + * In order to avoid the stacked BDI deadlock we need + * to ensure we accurately count the 'dirty' pages when + * the threshold is low. + * + * Otherwise it would be possible to get thresh+n pages + * reported dirty, even though there are thresh-m pages + * actually dirty; with m+n sitting in the percpu + * deltas. + */ + if (*bdi_thresh < 2 * bdi_stat_error(bdi)) { + bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); + *bdi_dirty = bdi_reclaimable + + bdi_stat_sum(bdi, BDI_WRITEBACK); + } else { + bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); + *bdi_dirty = bdi_reclaimable + + bdi_stat(bdi, BDI_WRITEBACK); + } +} + /* * balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force @@ -1209,13 +1365,9 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long pages_dirtied) { unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */ - unsigned long bdi_reclaimable; unsigned long nr_dirty; /* = file_dirty + writeback + unstable_nfs */ - unsigned long bdi_dirty; - unsigned long freerun; unsigned long background_thresh; unsigned long dirty_thresh; - unsigned long bdi_thresh; long period; long pause; long max_pause; @@ -1226,10 +1378,16 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long dirty_ratelimit; unsigned long pos_ratio; struct backing_dev_info *bdi = mapping->backing_dev_info; + bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; unsigned long start_time = jiffies; for (;;) { unsigned long now = jiffies; + unsigned long uninitialized_var(bdi_thresh); + unsigned long thresh; + unsigned long uninitialized_var(bdi_dirty); + unsigned long dirty; + unsigned long bg_thresh; /* * Unstable writes are a feature of certain networked @@ -1243,61 +1401,44 @@ static void balance_dirty_pages(struct address_space *mapping, global_dirty_limits(&background_thresh, &dirty_thresh); + if (unlikely(strictlimit)) { + bdi_dirty_limits(bdi, dirty_thresh, background_thresh, + &bdi_dirty, &bdi_thresh, &bg_thresh); + + dirty = bdi_dirty; + thresh = bdi_thresh; + } else { + dirty = nr_dirty; + thresh = dirty_thresh; + bg_thresh = background_thresh; + } + /* * Throttle it only when the background writeback cannot * catch-up. This avoids (excessively) small writeouts - * when the bdi limits are ramping up. + * when the bdi limits are ramping up in case of !strictlimit. 
+ * + * In strictlimit case make decision based on the bdi counters + * and limits. Small writeouts when the bdi limits are ramping + * up are the price we consciously pay for strictlimit-ing. */ - freerun = dirty_freerun_ceiling(dirty_thresh, - background_thresh); - if (nr_dirty <= freerun) { + if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh)) { current->dirty_paused_when = now; current->nr_dirtied = 0; current->nr_dirtied_pause = - dirty_poll_interval(nr_dirty, dirty_thresh); + dirty_poll_interval(dirty, thresh); break; } if (unlikely(!writeback_in_progress(bdi))) bdi_start_background_writeback(bdi); - /* - * bdi_thresh is not treated as some limiting factor as - * dirty_thresh, due to reasons - * - in JBOD setup, bdi_thresh can fluctuate a lot - * - in a system with HDD and USB key, the USB key may somehow - * go into state (bdi_dirty >> bdi_thresh) either because - * bdi_dirty starts high, or because bdi_thresh drops low. - * In this case we don't want to hard throttle the USB key - * dirtiers for 100 seconds until bdi_dirty drops under - * bdi_thresh. Instead the auxiliary bdi control line in - * bdi_position_ratio() will let the dirtier task progress - * at some rate <= (write_bw / 2) for bringing down bdi_dirty. - */ - bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); - - /* - * In order to avoid the stacked BDI deadlock we need - * to ensure we accurately count the 'dirty' pages when - * the threshold is low. - * - * Otherwise it would be possible to get thresh+n pages - * reported dirty, even though there are thresh-m pages - * actually dirty; with m+n sitting in the percpu - * deltas. - */ - if (bdi_thresh < 2 * bdi_stat_error(bdi)) { - bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); - bdi_dirty = bdi_reclaimable + - bdi_stat_sum(bdi, BDI_WRITEBACK); - } else { - bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); - bdi_dirty = bdi_reclaimable + - bdi_stat(bdi, BDI_WRITEBACK); - } + if (!strictlimit) + bdi_dirty_limits(bdi, dirty_thresh, background_thresh, + &bdi_dirty, &bdi_thresh, NULL); dirty_exceeded = (bdi_dirty > bdi_thresh) && - (nr_dirty > dirty_thresh); + ((nr_dirty > dirty_thresh) || strictlimit); if (dirty_exceeded && !bdi->dirty_exceeded) bdi->dirty_exceeded = 1; From fa0f281cf9de8e6877e6536f18a3fc77368df64d Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 11 Sep 2013 14:22:47 -0700 Subject: [PATCH 125/303] mm: make sure _PAGE_SWP_SOFT_DIRTY bit is not set on present pte _PAGE_SOFT_DIRTY bit should never be set on present pte so add VM_BUG_ON to catch any potential future abuse. Also add a comment on _PAGE_SWP_SOFT_DIRTY definition explaining scope of its usage. 
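For illustration, the intended call pattern looks like the sketch below (simplified; the helper name is hypothetical): the swap variant of the soft-dirty bit is only ever applied to a pte that is already a non-present swap entry, which is exactly the invariant the new VM_BUG_ON()s assert.

	/*
	 * Sketch: transfer soft-dirty from a present pte into the swap
	 * pte that replaces it during unmap (hypothetical helper).
	 */
	static pte_t make_swp_pte(swp_entry_t entry, pte_t old_pte)
	{
		/* non-present by construction */
		pte_t swp_pte = swp_entry_to_pte(entry);

		if (pte_soft_dirty(old_pte))
			swp_pte = pte_swp_mksoft_dirty(swp_pte);
		return swp_pte;
	}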
Signed-off-by: Cyrill Gorcunov Acked-by: Pavel Emelyanov Acked-by: Jan Beulich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable.h | 34 ++++++++++++++++------------ arch/x86/include/asm/pgtable_types.h | 3 +++ 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 8d16befdec88..3d1999458709 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -315,21 +315,6 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY); } -static inline pte_t pte_swp_mksoft_dirty(pte_t pte) -{ - return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); -} - -static inline int pte_swp_soft_dirty(pte_t pte) -{ - return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; -} - -static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) -{ - return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); -} - static inline pte_t pte_file_clear_soft_dirty(pte_t pte) { return pte_clear_flags(pte, _PAGE_SOFT_DIRTY); @@ -446,6 +431,7 @@ pte_t *populate_extra_pte(unsigned long vaddr); #ifndef __ASSEMBLY__ #include +#include #include static inline int pte_none(pte_t pte) @@ -864,6 +850,24 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, { } +static inline pte_t pte_swp_mksoft_dirty(pte_t pte) +{ + VM_BUG_ON(pte_present(pte)); + return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); +} + +static inline int pte_swp_soft_dirty(pte_t pte) +{ + VM_BUG_ON(pte_present(pte)); + return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; +} + +static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) +{ + VM_BUG_ON(pte_present(pte)); + return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); +} + #include #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index f4843e031131..0ecac257fb26 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -75,6 +75,9 @@ * with swap entry format. On x86 bits 6 and 7 are *not* involved * into swap entry computation, but bit 6 is used for nonlinear * file mapping, so we borrow bit 7 for soft dirty tracking. + * + * Please note that this bit must be treated as swap dirty page + * mark if and only if the PTE has present bit clear! */ #ifdef CONFIG_MEM_SOFT_DIRTY #define _PAGE_SWP_SOFT_DIRTY _PAGE_PSE From cf6fe945389e674130dc7564392930cf7fb9f5e8 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Wed, 11 Sep 2013 14:22:48 -0700 Subject: [PATCH 126/303] mm: correct the comment about the value for buddy _mapcount Set _mapcount PAGE_BUDDY_MAPCOUNT_VALUE to make the page buddy. Not the magic number -2. Signed-off-by: Wang Sheng-Hui Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ff2782576e39..0ee638f76ebe 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -489,8 +489,10 @@ __find_buddy_index(unsigned long page_idx, unsigned int order) * (c) a page and its buddy have the same order && * (d) a page and its buddy are in the same zone. * - * For recording whether a page is in the buddy system, we set ->_mapcount -2. - * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock. + * For recording whether a page is in the buddy system, we set ->_mapcount + * PAGE_BUDDY_MAPCOUNT_VALUE. + * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is + * serialized by zone->lock. 
* * For recording page's order, we use page_private(page). */ @@ -528,8 +530,9 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, * as necessary, plus some accounting needed to play nicely with other * parts of the VM system. * At each level, we keep a list of pages, which are heads of continuous - * free pages of length of (1 << order) and marked with _mapcount -2. Page's - * order is recorded in page_private(page) field. + * free pages of length of (1 << order) and marked with _mapcount + * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page) + * field. * So when we are allocating or freeing one, we can derive the state of the * other. That is, if we allocate a small block, and both were * free, the remainder of the region must be split into blocks. From 0d6fdbdb2a651f0c9bb979e1d92b1e15dadffc4f Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Sep 2013 14:22:49 -0700 Subject: [PATCH 127/303] hwpoison: always unset MIGRATE_ISOLATE before returning from soft_offline_page() Soft offline code expects that MIGRATE_ISOLATE is set on the target page only during soft offlining work. But currently it doesn't work as expected when get_any_page() fails and returns a negative value. As a result, end users can have unexpectedly isolated pages. This patch just fixes it. Signed-off-by: Naoya Horiguchi Reviewed-by: Wanpeng Li Cc: Andi Kleen Cc: Fengguang Wu Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index e05ed31c0f61..c8cc57ed7dcd 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1553,7 +1553,7 @@ int soft_offline_page(struct page *page, int flags) ret = get_any_page(page, pfn, flags); if (ret < 0) - return ret; + goto unset; if (ret) { /* for in-use pages */ if (PageHuge(page)) ret = soft_offline_huge_page(page, flags); @@ -1570,6 +1570,7 @@ int soft_offline_page(struct page *page, int flags) atomic_long_inc(&num_poisoned_pages); } } +unset: unset_migratetype_isolate(page, MIGRATE_MOVABLE); return ret; } From 841fcc583f81c632d20a27e17beccb20320530a1 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 11 Sep 2013 14:22:50 -0700 Subject: [PATCH 128/303] mm/hwpoison: fix loss of PG_dirty for errors on mlocked pages memory_failure() stores the page flags of the error page before doing the unmap, and (only) if the first check against the live page flags decides the error page is unknown, it does a second check against the stored page flags; the snapshot is needed because memory_failure() unmaps the error page before calling page_action(). This unmapping changes the page state; in particular, page_remove_rmap() (called from try_to_unmap_one()) clears PG_mlocked, so page_action() can't catch mlocked pages after that. However, memory_failure() can't handle memory errors on dirty mlocked pages correctly: try_to_unmap_one() moves the dirty bit from the pte to the physical page, and the second check loses it because it consults the stored page flags. This patch fixes it by restoring the PG_dirty flag into the stored page flags if the page is dirty.
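In sketch form, the resulting classification flow inside memory_failure() (simplified, not a literal excerpt):

	/* simplified flow, not a literal excerpt from memory_failure() */
	page_flags = p->flags;		/* snapshot before unmap */
	hwpoison_user_mappings(p, pfn, trapno, flags);
					/* unmap may clear PG_mlocked on p and
					 * transfer the pte dirty bit to PG_dirty */
	page_flags |= (p->flags & (1UL << PG_dirty));
					/* fold the transferred dirty bit into the
					 * snapshot used by the second check */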
Testcase: #define _GNU_SOURCE #include #include #include #include #include #define PAGES_TO_TEST 2 #define PAGE_SIZE 4096 int main(void) { char *mem; int i; mem = mmap(NULL, PAGES_TO_TEST * PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_LOCKED, 0, 0); for (i = 0; i < PAGES_TO_TEST; i++) mem[i * PAGE_SIZE] = 'a'; if (madvise(mem, PAGES_TO_TEST * PAGE_SIZE, MADV_HWPOISON) == -1) return -1; return 0; } Before patch: [ 912.839247] Injecting memory failure for page 7dfb8 at 7f6b4e37b000 [ 912.839257] MCE 0x7dfb8: clean mlocked LRU page recovery: Recovered [ 912.845550] MCE 0x7dfb8: clean mlocked LRU page still referenced by 1 users [ 912.852586] Injecting memory failure for page 7e6aa at 7f6b4e37c000 [ 912.852594] MCE 0x7e6aa: clean mlocked LRU page recovery: Recovered [ 912.858936] MCE 0x7e6aa: clean mlocked LRU page still referenced by 1 users After patch: [ 163.590225] Injecting memory failure for page 91bc2f at 7f9f5b0e5000 [ 163.590264] MCE 0x91bc2f: dirty mlocked LRU page recovery: Recovered [ 163.596680] MCE 0x91bc2f: dirty mlocked LRU page still referenced by 1 users [ 163.603831] Injecting memory failure for page 91cdd3 at 7f9f5b0e6000 [ 163.603852] MCE 0x91cdd3: dirty mlocked LRU page recovery: Recovered [ 163.610305] MCE 0x91cdd3: dirty mlocked LRU page still referenced by 1 users Signed-off-by: Wanpeng Li Reviewed-by: Naoya Horiguchi Cc: Andi Kleen Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index c8cc57ed7dcd..ec9ad5270d32 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1204,6 +1204,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags) for (ps = error_states;; ps++) if ((p->flags & ps->mask) == ps->res) break; + + page_flags |= (p->flags & (1UL << PG_dirty)); + if (!ps->mask) for (ps = error_states;; ps++) if ((page_flags & ps->mask) == ps->res From f9121153fdfbfaa930bf65077a5597e20d3ac608 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 11 Sep 2013 14:22:52 -0700 Subject: [PATCH 129/303] mm/hwpoison: don't need to hold compound lock for hugetlbfs page The compound lock was introduced by commit e9da73d67 ("thp: compound_lock."); it is used to serialize put_page() against __split_huge_page_refcount(). In addition, transparent hugepages are split in the hwpoison handler and just one subpage is poisoned. It is therefore unnecessary to hold the compound lock for a hugetlbfs page. This patch replaces compound_trans_order() with compound_order() in the places where the page is a hugetlbfs page.
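The contrast, in sketch form (simplified): compound_trans_order(), removed below, took the compound lock so the order could not change underneath it due to a concurrent THP split; for hugetlbfs pages no such split is possible, so a plain read suffices.

	/*
	 * Sketch: for a hugetlbfs page the compound order is stable, so
	 * the number of subpages can be read without the compound lock
	 * (hypothetical helper name).
	 */
	static unsigned long huge_nr_pages(struct page *hpage)
	{
		return 1UL << compound_order(compound_head(hpage));
	}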
Signed-off-by: Wanpeng Li Reviewed-by: Naoya Horiguchi Cc: Andi Kleen Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 14 -------------- mm/memory-failure.c | 12 ++++++------ 2 files changed, 6 insertions(+), 20 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 03f84b8d7359..caf543c7eaa7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -495,20 +495,6 @@ static inline int compound_order(struct page *page) return (unsigned long)page[1].lru.prev; } -static inline int compound_trans_order(struct page *page) -{ - int order; - unsigned long flags; - - if (!PageHead(page)) - return 0; - - flags = compound_lock_irqsave(page); - order = compound_order(page); - compound_unlock_irqrestore(page, flags); - return order; -} - static inline void set_compound_order(struct page *page, unsigned long order) { page[1].lru.prev = (void *)order; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ec9ad5270d32..7b5d32507c35 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -206,7 +206,7 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno, #ifdef __ARCH_SI_TRAPNO si.si_trapno = trapno; #endif - si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT; + si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; if ((flags & MF_ACTION_REQUIRED) && t == current) { si.si_code = BUS_MCEERR_AR; @@ -983,7 +983,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, static void set_page_hwpoison_huge_page(struct page *hpage) { int i; - int nr_pages = 1 << compound_trans_order(hpage); + int nr_pages = 1 << compound_order(hpage); for (i = 0; i < nr_pages; i++) SetPageHWPoison(hpage + i); } @@ -991,7 +991,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage) static void clear_page_hwpoison_huge_page(struct page *hpage) { int i; - int nr_pages = 1 << compound_trans_order(hpage); + int nr_pages = 1 << compound_order(hpage); for (i = 0; i < nr_pages; i++) ClearPageHWPoison(hpage + i); } @@ -1342,7 +1342,7 @@ int unpoison_memory(unsigned long pfn) return 0; } - nr_pages = 1 << compound_trans_order(page); + nr_pages = 1 << compound_order(page); if (!get_page_unless_zero(page)) { /* @@ -1506,7 +1506,7 @@ static int soft_offline_huge_page(struct page *page, int flags) } else { set_page_hwpoison_huge_page(hpage); dequeue_hwpoisoned_huge_page(hpage); - atomic_long_add(1 << compound_trans_order(hpage), + atomic_long_add(1 << compound_order(hpage), &num_poisoned_pages); } return ret; @@ -1566,7 +1566,7 @@ int soft_offline_page(struct page *page, int flags) if (PageHuge(page)) { set_page_hwpoison_huge_page(hpage); dequeue_hwpoisoned_huge_page(hpage); - atomic_long_add(1 << compound_trans_order(hpage), + atomic_long_add(1 << compound_order(hpage), &num_poisoned_pages); } else { SetPageHWPoison(page); From 0cea3fdc416d593072c602725ed2ca02b889f31b Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 11 Sep 2013 14:22:53 -0700 Subject: [PATCH 130/303] mm/hwpoison: fix race against poison thp There is a race between poisoning and unpoisoning a page: memory_failure() sets PG_hwpoison and increases num_poisoned_pages without holding the page lock, and only one page is accounted in num_poisoned_pages even for a thp. However, unpoison can occur before memory_failure() takes the page lock and splits the transparent hugepage; unpoison then decreases num_poisoned_pages by 1 << compound_order, since memory_failure() has not yet split the transparent hugepage under the page lock.
That means we account one page for hwpoison and 1 << compound_order for unpoison. This patch fixes it by inserting a PageTransHuge check before TestClearPageHWPoison: unpoison now fails without clearing PageHWPoison or decreasing num_poisoned_pages. A B memory_failure TestSetPageHWPoison(p); if (PageHuge(p)) nr_pages = 1 << compound_order(hpage); else nr_pages = 1; atomic_long_add(nr_pages, &num_poisoned_pages); unpoison_memory nr_pages = 1 << compound_trans_order(page); if (TestClearPageHWPoison(p)) atomic_long_sub(nr_pages, &num_poisoned_pages); lock page if (!PageHWPoison(p)) unlock page and return hwpoison_user_mappings if (PageTransHuge(hpage)) split_huge_page(hpage); Signed-off-by: Wanpeng Li Suggested-by: Naoya Horiguchi Cc: Andi Kleen Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 7b5d32507c35..32351ec32048 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1342,6 +1342,16 @@ int unpoison_memory(unsigned long pfn) return 0; } + /* + * unpoison_memory() can encounter thp only when the thp is being + * worked by memory_failure() and the page lock is not held yet. + * In such case, we yield to memory_failure() and make unpoison fail. + */ + if (PageTransHuge(page)) { + pr_info("MCE: Memory failure is now running on %#lx\n", pfn); + return 0; + } + nr_pages = 1 << compound_order(page); if (!get_page_unless_zero(page)) { From dd9538a597f9ccd9a65be1cc3f71059a12b5b4ff Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 11 Sep 2013 14:22:54 -0700 Subject: [PATCH 131/303] mm/hwpoison: replace atomic_long_sub() with atomic_long_dec() Replace atomic_long_sub() with atomic_long_dec() since the page is a normal page rather than a hugetlbfs page or thp. Signed-off-by: Wanpeng Li Reviewed-by: Naoya Horiguchi Cc: Andi Kleen Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 32351ec32048..c69217c07faa 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1366,7 +1366,7 @@ int unpoison_memory(unsigned long pfn) return 0; } if (TestClearPageHWPoison(p)) - atomic_long_sub(nr_pages, &num_poisoned_pages); + atomic_long_dec(&num_poisoned_pages); pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); return 0; } From 0be35096a145290cc0771d52adb3b241dca22604 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 11 Sep 2013 14:22:55 -0700 Subject: [PATCH 132/303] mm/hwpoison: don't set migration type twice to avoid holding heavily contended zone->lock Setting the pageblock migration type takes zone->lock, which is heavily contended on some systems, to avoid races. However, soft offlining a page sets the pageblock migration type twice during get page if the page is in use, is not a hugetlbfs page, and is not on the LRU list. It is unnecessary to set the pageblock migration type and take the heavily contended zone->lock again if the first round of get page has already set the pageblock to the right migration type. The trick here is that the migration type is MIGRATE_ISOLATE. Only two other places besides hwpoison can change a pageblock to MIGRATE_ISOLATE. One is memory hotplug; however, we hold lock_memory_hotplug(), which avoids the race. The second is CMA, which unmovable page allocation requests cannot fall back to. So it's safe here.
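A sketch of the cost being avoided (simplified; not the exact kernel path): the isolation transition serializes on zone->lock, so checking the current migratetype first skips a redundant trip through a heavily contended lock.

	static void isolate_pageblock_sketch(struct page *page)
	{
		struct zone *zone = page_zone(page);
		unsigned long flags;

		/* already isolated: nothing to do, skip the hot lock */
		if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE)
			return;

		spin_lock_irqsave(&zone->lock, flags);	/* heavily contended */
		set_pageblock_migratetype(page, MIGRATE_ISOLATE);
		spin_unlock_irqrestore(&zone->lock, flags);
	}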
Signed-off-by: Wanpeng Li Cc: Naoya Horiguchi Cc: Andi Kleen Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index c69217c07faa..784a1e17c905 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1429,7 +1429,8 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags) * was free. This flag should be kept set until the source page * is freed and PG_hwpoison on it is set. */ - set_migratetype_isolate(p, true); + if (get_pageblock_migratetype(p) != MIGRATE_ISOLATE) + set_migratetype_isolate(p, true); /* * When the target page is a free hugepage, just remove it * from free hugepage list. From 86e057734bd1c460c48ae69f8fcc3ed90eb40d59 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 11 Sep 2013 14:22:56 -0700 Subject: [PATCH 133/303] mm/hwpoison: drop forward reference declarations __soft_offline_page() Drop forward reference declarations __soft_offline_page. Signed-off-by: Wanpeng Li Reviewed-by: Naoya Horiguchi Cc: Andi Kleen Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 130 ++++++++++++++++++++++---------------------- 1 file changed, 64 insertions(+), 66 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 784a1e17c905..d04f99004c9f 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1523,72 +1523,6 @@ static int soft_offline_huge_page(struct page *page, int flags) return ret; } -static int __soft_offline_page(struct page *page, int flags); - -/** - * soft_offline_page - Soft offline a page. - * @page: page to offline - * @flags: flags. Same as memory_failure(). - * - * Returns 0 on success, otherwise negated errno. - * - * Soft offline a page, by migration or invalidation, - * without killing anything. This is for the case when - * a page is not corrupted yet (so it's still valid to access), - * but has had a number of corrected errors and is better taken - * out. - * - * The actual policy on when to do that is maintained by - * user space. - * - * This should never impact any application or cause data loss, - * however it might take some time. - * - * This is not a 100% solution for all memory, but tries to be - * ``good enough'' for the majority of memory. 
- */ -int soft_offline_page(struct page *page, int flags) -{ - int ret; - unsigned long pfn = page_to_pfn(page); - struct page *hpage = compound_trans_head(page); - - if (PageHWPoison(page)) { - pr_info("soft offline: %#lx page already poisoned\n", pfn); - return -EBUSY; - } - if (!PageHuge(page) && PageTransHuge(hpage)) { - if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { - pr_info("soft offline: %#lx: failed to split THP\n", - pfn); - return -EBUSY; - } - } - - ret = get_any_page(page, pfn, flags); - if (ret < 0) - goto unset; - if (ret) { /* for in-use pages */ - if (PageHuge(page)) - ret = soft_offline_huge_page(page, flags); - else - ret = __soft_offline_page(page, flags); - } else { /* for free pages */ - if (PageHuge(page)) { - set_page_hwpoison_huge_page(hpage); - dequeue_hwpoisoned_huge_page(hpage); - atomic_long_add(1 << compound_order(hpage), - &num_poisoned_pages); - } else { - SetPageHWPoison(page); - atomic_long_inc(&num_poisoned_pages); - } - } -unset: - unset_migratetype_isolate(page, MIGRATE_MOVABLE); - return ret; -} - static int __soft_offline_page(struct page *page, int flags) { int ret; @@ -1675,3 +1609,67 @@ static int __soft_offline_page(struct page *page, int flags) } return ret; } + +/** + * soft_offline_page - Soft offline a page. + * @page: page to offline + * @flags: flags. Same as memory_failure(). + * + * Returns 0 on success, otherwise negated errno. + * + * Soft offline a page, by migration or invalidation, + * without killing anything. This is for the case when + * a page is not corrupted yet (so it's still valid to access), + * but has had a number of corrected errors and is better taken + * out. + * + * The actual policy on when to do that is maintained by + * user space. + * + * This should never impact any application or cause data loss, + * however it might take some time. + * + * This is not a 100% solution for all memory, but tries to be + * ``good enough'' for the majority of memory. + */ +int soft_offline_page(struct page *page, int flags) +{ + int ret; + unsigned long pfn = page_to_pfn(page); + struct page *hpage = compound_trans_head(page); + + if (PageHWPoison(page)) { + pr_info("soft offline: %#lx page already poisoned\n", pfn); + return -EBUSY; + } + if (!PageHuge(page) && PageTransHuge(hpage)) { + if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) { + pr_info("soft offline: %#lx: failed to split THP\n", + pfn); + return -EBUSY; + } + } + + ret = get_any_page(page, pfn, flags); + if (ret < 0) + goto unset; + if (ret) { /* for in-use pages */ + if (PageHuge(page)) + ret = soft_offline_huge_page(page, flags); + else + ret = __soft_offline_page(page, flags); + } else { /* for free pages */ + if (PageHuge(page)) { + set_page_hwpoison_huge_page(hpage); + dequeue_hwpoisoned_huge_page(hpage); + atomic_long_add(1 << compound_order(hpage), + &num_poisoned_pages); + } else { + SetPageHWPoison(page); + atomic_long_inc(&num_poisoned_pages); + } + } +unset: + unset_migratetype_isolate(page, MIGRATE_MOVABLE); + return ret; +} From b194b8cdb83daafd2405fb902193b8e904107614 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 11 Sep 2013 14:22:57 -0700 Subject: [PATCH 134/303] mm/hwpoison: add '#' to madvise_hwpoison Add '#' to madvise_hwpoison. 
Before patch: [ 95.892866] Injecting memory failure for page 19d0 at b7786000 [ 95.893151] MCE 0x19d0: non LRU page recovery: Ignored After patch: [ 95.892866] Injecting memory failure for page 0x19d0 at 0xb7786000 [ 95.893151] MCE 0x19d0: non LRU page recovery: Ignored Signed-off-by: Wanpeng Li Reviewed-by: Naoya Horiguchi Cc: Andi Kleen Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 936799f042cc..9b1c7be182d7 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -353,14 +353,14 @@ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) if (ret != 1) return ret; if (bhv == MADV_SOFT_OFFLINE) { - printk(KERN_INFO "Soft offlining page %lx at %lx\n", + pr_info("Soft offlining page %#lx at %#lx\n", page_to_pfn(p), start); ret = soft_offline_page(p, MF_COUNT_INCREASED); if (ret) break; continue; } - printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", + pr_info("Injecting memory failure for page %#lx at %#lx\n", page_to_pfn(p), start); /* Ignore return value for now */ memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); From 29b4eedee67b449534214058e1bcb36307a7f1dc Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 11 Sep 2013 14:22:59 -0700 Subject: [PATCH 135/303] mm/hwpoison.c: fix held reference count after unpoisoning empty zero page madvise hwpoison inject poisons the read-only empty zero page if there is no write access before the poisoning. The empty zero page's reference count is increased for hwpoison; a subsequent attempt to poison the zero page returns directly since the page has already been marked PG_hwpoison, yet the page reference count is still increased by get_user_pages_fast. The unpoison process unpoisons the empty zero page and decreases the reference count successfully the first time; however, subsequent unpoisons of the empty zero page return directly since the page has already been unpoisoned, without decreasing the page reference count of the empty zero page. This patch fixes it by making madvise_hwpoison() put the page and return immediately (without calling memory_failure() or soft_offline_page()) when the page is already hwpoisoned.
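The reference counting being balanced, in sketch form (a simplified fragment of the loop body this patch changes, not a literal excerpt): every iteration takes one reference via get_user_pages_fast(), so an early-exit path that hands the page to nobody must drop that reference itself.

	ret = get_user_pages_fast(start, 1, 0, &p);	/* takes one page ref */
	if (ret != 1)
		return ret;
	if (PageHWPoison(p)) {
		put_page(p);	/* balance the reference; nobody else will */
		continue;	/* already poisoned: skip this page */
	}
	/* otherwise memory_failure()/soft_offline_page() consume the ref */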
Testcase: #define _GNU_SOURCE #include #include #include #include #include #include #include #define PAGES_TO_TEST 3 #define PAGE_SIZE 4096 int main(void) { char *mem; int i; mem = mmap(NULL, PAGES_TO_TEST * PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); if (madvise(mem, PAGES_TO_TEST * PAGE_SIZE, MADV_HWPOISON) == -1) return -1; munmap(mem, PAGES_TO_TEST * PAGE_SIZE); return 0; } Add printk to dump page reference count: [ 93.075959] Injecting memory failure for page 0x19d0 at 0xb77d8000 [ 93.076207] MCE 0x19d0: non LRU page recovery: Ignored [ 93.076209] pfn 0x19d0, page count = 1 after memory failure [ 93.076220] Injecting memory failure for page 0x19d0 at 0xb77d9000 [ 93.076221] MCE 0x19d0: already hardware poisoned [ 93.076222] pfn 0x19d0, page count = 2 after memory failure [ 93.076224] Injecting memory failure for page 0x19d0 at 0xb77da000 [ 93.076224] MCE 0x19d0: already hardware poisoned [ 93.076225] pfn 0x19d0, page count = 3 after memory failure Signed-off-by: Wanpeng Li Suggested-by: Naoya Horiguchi Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/madvise.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/madvise.c b/mm/madvise.c index 9b1c7be182d7..30293ab95b06 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -352,6 +352,10 @@ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) int ret = get_user_pages_fast(start, 1, 0, &p); if (ret != 1) return ret; + if (PageHWPoison(p)) { + put_page(p); + continue; + } if (bhv == MADV_SOFT_OFFLINE) { pr_info("Soft offlining page %#lx at %#lx\n", page_to_pfn(p), start); From 2d1e8b3f1acc36101dbe6c5fc14e88e3f6af0b1c Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 11 Sep 2013 14:23:00 -0700 Subject: [PATCH 136/303] mm/hwpoison-inject.c: change permission of corrupt-pfn/unpoison-pfn to 0200 Hwpoison injection doesn't implement read method for corrupt-pfn/unpoison-pfn attributes: # cat /sys/kernel/debug/hwpoison/corrupt-pfn cat: /sys/kernel/debug/hwpoison/corrupt-pfn: Permission denied # cat /sys/kernel/debug/hwpoison/unpoison-pfn cat: /sys/kernel/debug/hwpoison/unpoison-pfn: Permission denied This patch changes the permission of corrupt-pfn/unpoison-pfn to 0200. Signed-off-by: Wanpeng Li Reviewed-by: Naoya Horiguchi Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hwpoison-inject.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 3a61efc518d5..afc2daa91c60 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -88,12 +88,12 @@ static int pfn_inject_init(void) * hardware status change, hence do not require hardware support. * They are mainly for testing hwpoison in software level. 
 */
-	dentry = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir,
+	dentry = debugfs_create_file("corrupt-pfn", 0200, hwpoison_dir,
 				    NULL, &hwpoison_fops);
 	if (!dentry)
 		goto fail;
-	dentry = debugfs_create_file("unpoison-pfn", 0600, hwpoison_dir,
+	dentry = debugfs_create_file("unpoison-pfn", 0200, hwpoison_dir,
 				    NULL, &unpoison_fops);
 	if (!dentry)
 		goto fail;

From 3ba5eebc40a9839226e5f0d81a3e9f8fcfb8ebae Mon Sep 17 00:00:00 2001
From: Wanpeng Li
Date: Wed, 11 Sep 2013 14:23:01 -0700
Subject: [PATCH 137/303] mm/memory-failure.c: fix bug triggered by unpoisoning empty zero page

Injecting memory failure for page 0x19d0 at 0xb77d2000
MCE 0x19d0: non LRU page recovery: Ignored
MCE: Software-unpoisoned page 0x19d0
BUG: Bad page state in process bash pfn:019d0
page:f3461a00 count:0 mapcount:0 mapping: (null) index:0x0
page flags: 0x40000404(referenced|reserved)
Modules linked in: nfsd auth_rpcgss i915 nfs_acl nfs lockd video drm_kms_helper drm bnep rfcomm sunrpc bluetooth psmouse parport_pc ppdev lp serio_raw fscache parport gpio_ich lpc_ich mac_hid i2c_algo_bit tpm_tis wmi usb_storage hid_generic usbhid hid e1000e firewire_ohci firewire_core ahci ptp libahci pps_core crc_itu_t
CPU: 3 PID: 2123 Comm: bash Not tainted 3.11.0-rc6+ #12
Hardware name: LENOVO 7034DD7/ , BIOS 9HKT47AUS 01//2012
 00000000 00000000 e9625ea0 c15ec49b f3461a00 e9625eb8 c15ea119 c17cbf18
 ef084314 000019d0 f3461a00 e9625ed8 c110dc8a f3461a00 00000001 00000000
 f3461a00 40000404 00000000 e9625ef8 c110dcc1 f3461a00 f3461a00 000019d0
Call Trace:
  dump_stack+0x41/0x52
  bad_page+0xcf/0xeb
  free_pages_prepare+0x12a/0x140
  free_hot_cold_page+0x21/0x110
  __put_single_page+0x21/0x30
  put_page+0x25/0x40
  unpoison_memory+0x107/0x200
  hwpoison_unpoison+0x20/0x30
  simple_attr_write+0xb6/0xd0
  vfs_write+0xa0/0x1b0
  SyS_write+0x4f/0x90
  sysenter_do_call+0x12/0x22
Disabling lock debugging due to kernel taint

Testcase:

#define _GNU_SOURCE
#include
#include
#include
#include
#include
#include
#include

#define PAGES_TO_TEST 1
#define PAGE_SIZE 4096

int main(void)
{
	char *mem;

	mem = mmap(NULL, PAGES_TO_TEST * PAGE_SIZE,
		   PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);

	if (madvise(mem, PAGES_TO_TEST * PAGE_SIZE, MADV_HWPOISON) == -1)
		return -1;

	munmap(mem, PAGES_TO_TEST * PAGE_SIZE);

	return 0;
}

There is one page reference count for the default empty zero page;
madvise_hwpoison adds another one via get_user_pages_fast.
memory_hwpoison drops one page reference count since it is a non-LRU
page. unpoison_memory then releases the last page reference count and
frees the empty zero page to the buddy system, which is not correct
since the empty zero page has the PG_reserved flag. This patch fixes it
by not reducing the page reference count below 1 for the empty zero
page.
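As a minimal sketch of the reference discipline involved (illustrative
only, not part of the patch; the helper name is hypothetical), every
successful get_user_pages_fast() reference must be balanced by a
put_page() on each exit path:

static int touch_one_user_page(unsigned long start)
{
	struct page *p;
	int ret = get_user_pages_fast(start, 1, 0, &p);

	if (ret != 1)
		return ret;	/* no reference was taken */

	/* ... inspect or poison p ... */

	put_page(p);		/* balance the gup reference */
	return 0;
}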
Signed-off-by: Wanpeng Li
Reviewed-by: Naoya Horiguchi
Cc: Andi Kleen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory-failure.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index d04f99004c9f..d472e14c6808 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1388,7 +1388,7 @@ int unpoison_memory(unsigned long pfn)
 	unlock_page(page);
 
 	put_page(page);
-	if (freeit)
+	if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
 		put_page(page);
 
 	return 0;

From 8302423b8e85ad6caa8687f06157d43f684a42e2 Mon Sep 17 00:00:00 2001
From: Wanpeng Li
Date: Wed, 11 Sep 2013 14:23:02 -0700
Subject: [PATCH 138/303] mm/madvise.c: fix return value of madvise_hwpoison()

The return value outside the for loop is always zero, which means
madvise_hwpoison() returns success; however, this is not true for
soft_offline_page() with a failure return value.

Signed-off-by: Wanpeng Li
Reviewed-by: Naoya Horiguchi
Cc: Andi Kleen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/madvise.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index 30293ab95b06..51bffa414027 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -361,7 +361,7 @@ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
 				page_to_pfn(p), start);
 			ret = soft_offline_page(p, MF_COUNT_INCREASED);
 			if (ret)
-				break;
+				return ret;
 			continue;
 		}
 		pr_info("Injecting memory failure for page %#lx at %#lx\n",

From 325c4ef5c4b17372c3222d896040d7848e67fbdb Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Wed, 11 Sep 2013 14:23:03 -0700
Subject: [PATCH 139/303] mm/madvise.c:madvise_hwpoison(): remove local `ret'

madvise_hwpoison() has two locals called "ret". Fix it all up.

Cc: Wanpeng Li
Cc: Naoya Horiguchi
Cc: Andi Kleen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/madvise.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index 51bffa414027..6975bc812542 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -343,15 +343,16 @@ static long madvise_remove(struct vm_area_struct *vma,
  */
 static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
 {
-	int ret = 0;
-
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	for (; start < end; start += PAGE_SIZE) {
 		struct page *p;
-		int ret = get_user_pages_fast(start, 1, 0, &p);
+		int ret;
+
+		ret = get_user_pages_fast(start, 1, 0, &p);
 		if (ret != 1)
 			return ret;
+
 		if (PageHWPoison(p)) {
 			put_page(p);
 			continue;
@@ -369,7 +370,7 @@ static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
 		/* Ignore return value for now */
 		memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
 	}
-	return ret;
+	return 0;
 }
 #endif

From 146d7009b45cdb45ec3be8ad73177dae58f4bc91 Mon Sep 17 00:00:00 2001
From: Junxiao Bi
Date: Wed, 11 Sep 2013 14:23:04 -0700
Subject: [PATCH 140/303] writeback: fix race that cause writeback hung

There is a race between marking an inode dirty and the writeback
thread; see the following scenario. In this case, the writeback thread
will not run even though there is dirty IO.

__mark_inode_dirty()                          bdi_writeback_workfn()
    ...                                           ...
    spin_lock(&inode->i_lock);
    ...
    if (bdi_cap_writeback_dirty(bdi)) {
        <<< assume wb has dirty_io, so wakeup_bdi is false.
        <<< the following inode_dirty also has wakeup_bdi false.
        if (!wb_has_dirty_io(&bdi->wb))
            wakeup_bdi = true;
    }
    spin_unlock(&inode->i_lock);
                                                  <<< assume last dirty_io is removed here.
                                                  pages_written = wb_do_writeback(wb);
                                                  ...
                                                  <<< work_list empty and wb has no dirty_io,
                                                  <<< delayed_work will not be queued.
                                                  if (!list_empty(&bdi->work_list) ||
                                                      (wb_has_dirty_io(wb) && dirty_writeback_interval))
                                                      queue_delayed_work(bdi_wq, &wb->dwork,
                                                          msecs_to_jiffies(dirty_writeback_interval * 10));
    spin_lock(&bdi->wb.list_lock);
    inode->dirtied_when = jiffies;
    <<< new dirty_io is added.
    list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
    spin_unlock(&bdi->wb.list_lock);

    <<< though there is dirty_io, wakeup_bdi is false,
    <<< so the writeback thread will not be woken up and
    <<< the new dirty_io will not be flushed.
    if (wakeup_bdi)
        bdi_wakeup_thread_delayed(bdi);

Writeback will not run until a new flush work is queued. This may cause
a lot of dirty pages to stay in memory for a long time.

Signed-off-by: Junxiao Bi
Reviewed-by: Jan Kara
Cc: Fengguang Wu
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/fs-writeback.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 54b3c31c2f0d..30f6f27d5a59 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1171,6 +1171,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 			bool wakeup_bdi = false;
 			bdi = inode_to_bdi(inode);
 
+			spin_unlock(&inode->i_lock);
+			spin_lock(&bdi->wb.list_lock);
 			if (bdi_cap_writeback_dirty(bdi)) {
 				WARN(!test_bit(BDI_registered, &bdi->state),
 				     "bdi-%s not registered\n", bdi->name);
@@ -1185,8 +1187,6 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 				wakeup_bdi = true;
 		}
 
-		spin_unlock(&inode->i_lock);
-		spin_lock(&bdi->wb.list_lock);
 		inode->dirtied_when = jiffies;
 		list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
 		spin_unlock(&bdi->wb.list_lock);

From 2d8a17813ec817fa58addd2c92b4ca8cae5bafbb Mon Sep 17 00:00:00 2001
From: Yanchuan Nian
Date: Wed, 11 Sep 2013 14:23:05 -0700
Subject: [PATCH 141/303] mm/mmap: remove unnecessary assignment

pgoff is not used after the statement "pgoff = vma->vm_pgoff;", so the
assignment is redundant.

Signed-off-by: Yanchuan Nian
Acked-by: David Rientjes
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/mmap.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index 51958d192a48..9d548512ff8a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1566,7 +1566,6 @@ munmap_back:
 		WARN_ON_ONCE(addr != vma->vm_start);
 
 		addr = vma->vm_start;
-		pgoff = vma->vm_pgoff;
 		vm_flags = vma->vm_flags;
 	} else if (vm_flags & VM_SHARED) {
 		error = shmem_zero_setup(vma);

From ade34a35722fab0c8a1d162a15b919d20373a894 Mon Sep 17 00:00:00 2001
From: Joe Perches
Date: Wed, 11 Sep 2013 14:23:06 -0700
Subject: [PATCH 142/303] lib/genalloc.c: convert kmalloc_node(...GFP_ZERO...) to kzalloc_node(...)

Use the helper function instead of __GFP_ZERO.

Signed-off-by: Joe Perches
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 lib/genalloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/genalloc.c b/lib/genalloc.c
index 2a39bf62d8c1..c522facfa3e5 100644
--- a/lib/genalloc.c
+++ b/lib/genalloc.c
@@ -187,7 +187,7 @@ int gen_pool_add_virt(struct gen_pool *pool, unsigned long virt, phys_addr_t phy
 	int nbytes = sizeof(struct gen_pool_chunk) +
 				BITS_TO_LONGS(nbits) * sizeof(long);
 
-	chunk = kmalloc_node(nbytes, GFP_KERNEL | __GFP_ZERO, nid);
+	chunk = kzalloc_node(nbytes, GFP_KERNEL, nid);
 	if (unlikely(chunk == NULL))
 		return -ENOMEM;

From 7b5219db00d0afaf3d2b0e8c443ffa892455ba75 Mon Sep 17 00:00:00 2001
From: Joe Perches
Date: Wed, 11 Sep 2013 14:23:07 -0700
Subject: [PATCH 143/303] mm/mempool.c: convert kmalloc_node(...GFP_ZERO...)
 to kzalloc_node(...)

Use the helper function instead of __GFP_ZERO.

Signed-off-by: Joe Perches
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/mempool.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/mempool.c b/mm/mempool.c
index 54990476c049..659aa42bad16 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -73,7 +73,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
 			       gfp_t gfp_mask, int node_id)
 {
 	mempool_t *pool;
-	pool = kmalloc_node(sizeof(*pool), gfp_mask | __GFP_ZERO, node_id);
+	pool = kzalloc_node(sizeof(*pool), gfp_mask, node_id);
 	if (!pool)
 		return NULL;
 	pool->elements = kmalloc_node(min_nr * sizeof(void *),

From 2bff24a3707093c435ab3241c47dcdb5f16e432b Mon Sep 17 00:00:00 2001
From: Greg Thelen
Date: Wed, 11 Sep 2013 14:23:08 -0700
Subject: [PATCH 144/303] memcg: fix multiple large threshold notifications

A memory cgroup with (1) multiple threshold notifications and (2) at
least one threshold >=2G was not reliable. Specifically the
notifications would either not fire or would not fire in the proper
order.

The __mem_cgroup_threshold() signaling logic depends on keeping 64 bit
thresholds in sorted order. mem_cgroup_usage_register_event() sorts
them with compare_thresholds(), which returns the difference of two 64
bit thresholds as an int. If the difference is positive but has
bit[31] set, then sort() treats the difference as negative and breaks
sort order.

This fix compares the two arbitrary 64 bit thresholds returning the
classic -1, 0, 1 result.

The test below sets two notifications (at 0x1000 and 0x81001000):

  cd /sys/fs/cgroup/memory
  mkdir x
  for x in 4096 2164264960; do
    cgroup_event_listener x/memory.usage_in_bytes $x |
      sed "s/^/$x listener:/" &
  done
  echo $$ > x/cgroup.procs
  anon_leaker 500M

v3.11-rc7 fails to signal the 4096 event listener:
  Leaking...
  Done leaking pages.

Patched v3.11-rc7 properly notifies:
  Leaking...
  4096 listener:2013:8:31:14:13:36
  Done leaking pages.

The fixed bug is old. It appears to date back to the introduction of
memcg threshold notifications in v2.6.34-rc1-116-g2e72b6347c94 "memcg:
implement memory thresholds"

Signed-off-by: Greg Thelen
Acked-by: Michal Hocko
Acked-by: Kirill A. Shutemov
Acked-by: Johannes Weiner
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memcontrol.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5ca1dcf77ce9..c6bd28edd533 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5591,7 +5591,13 @@ static int compare_thresholds(const void *a, const void *b)
 	const struct mem_cgroup_threshold *_a = a;
 	const struct mem_cgroup_threshold *_b = b;
 
-	return _a->threshold - _b->threshold;
+	if (_a->threshold > _b->threshold)
+		return 1;
+
+	if (_a->threshold < _b->threshold)
+		return -1;
+
+	return 0;
 }
 
 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)

From 729377d559607ea40d714e8f7092f40f643cf01f Mon Sep 17 00:00:00 2001
From: Shuah Khan
Date: Wed, 11 Sep 2013 14:23:09 -0700
Subject: [PATCH 145/303] pnp: change pnp bus pm_ops to invoke pnp driver dev_pm_ops if specified
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

pnp_bus_suspend() and pnp_bus_resume() invoke the legacy pm_ops from
the pnp_driver. Change pnp_bus_suspend() and pnp_bus_resume() to check
whether the pnp driver has dev_pm_ops and, if so, call those; only if
dev_pm_ops don't exist do they fall back to the legacy pm_ops. Without
this change, a pnp_driver's dev_pm_ops will not get called.
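As a hedged illustration of what this enables (the driver and symbol
names below are hypothetical, not taken from the patch set), a pnp
driver can then publish dev_pm_ops and drop its legacy callbacks:

#ifdef CONFIG_PM_SLEEP
static int foo_suspend(struct device *dev)
{
	/* ... quiesce the device ... */
	return 0;
}

static int foo_resume(struct device *dev)
{
	/* ... reinitialize the device ... */
	return 0;
}
#endif

static SIMPLE_DEV_PM_OPS(foo_pm_ops, foo_suspend, foo_resume);

static struct pnp_driver foo_pnp_driver = {
	.name	= "foo",
	.driver	= {
		.pm = &foo_pm_ops,	/* found by the new bus check */
	},
	/* legacy .suspend/.resume callbacks no longer needed */
};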
In addition to the pnp driver bus pm_ops change to invoke driver
dev_pm_ops, this patch set contains changes to the rtc-cmos, tpm_tis,
and apple-gmux pnp drivers to convert them from legacy pm_ops to
dev_pm_ops.

This patch (of 4):

pnp_bus_suspend() and pnp_bus_resume() invoke the legacy pm_ops from
the pnp_driver. Change pnp_bus_suspend() and pnp_bus_resume() to check
whether the pnp driver has dev_pm_ops and, if so, call those; only if
dev_pm_ops don't exist do they fall back to the legacy pm_ops. Without
this change, a pnp_driver's dev_pm_ops will not get called.

Signed-off-by: Shuah Khan
Cc: Matthew Garrett
Cc: Leonidas Da Silva Barbosa
Cc: Ashley Lai
Cc: Rajiv Andrade
Cc: Marcel Selhorst
Cc: Sirrix AG
Cc: Alessandro Zummo
Cc: "Rafael J. Wysocki"
Cc: Bjorn Helgaas
Cc: Grant Likely
Cc: Rob Herring
Cc: Peter Hüwe
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/pnp/driver.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/drivers/pnp/driver.c b/drivers/pnp/driver.c
index 12adb43a0693..a39ee38a9414 100644
--- a/drivers/pnp/driver.c
+++ b/drivers/pnp/driver.c
@@ -163,6 +163,13 @@ static int __pnp_bus_suspend(struct device *dev, pm_message_t state)
 	if (!pnp_drv)
 		return 0;
 
+	if (pnp_drv->driver.pm && pnp_drv->driver.pm->suspend) {
+		error = pnp_drv->driver.pm->suspend(dev);
+		suspend_report_result(pnp_drv->driver.pm->suspend, error);
+		if (error)
+			return error;
+	}
+
 	if (pnp_drv->suspend) {
 		error = pnp_drv->suspend(pnp_dev, state);
 		if (error)
@@ -211,6 +218,12 @@ static int pnp_bus_resume(struct device *dev)
 			return error;
 	}
 
+	if (pnp_drv->driver.pm && pnp_drv->driver.pm->resume) {
+		error = pnp_drv->driver.pm->resume(dev);
+		if (error)
+			return error;
+	}
+
 	if (pnp_drv->resume) {
 		error = pnp_drv->resume(pnp_dev);
 		if (error)

From a8a3808b43a077fbc738b26dc84d18b5db3044f9 Mon Sep 17 00:00:00 2001
From: Shuah Khan
Date: Wed, 11 Sep 2013 14:23:11 -0700
Subject: [PATCH 146/303] rtc: convert rtc-cmos to dev_pm_ops from legacy pm_ops
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Convert drivers/rtc/rtc-cmos to use dev_pm_ops instead of legacy
pm_ops. This patch depends on the pnp driver bus ops change to invoke
pnp_driver dev_pm_ops.

Signed-off-by: Shuah Khan
Cc: Matthew Garrett
Cc: Leonidas Da Silva Barbosa
Cc: Ashley Lai
Cc: Rajiv Andrade
Cc: Marcel Selhorst
Cc: Sirrix AG
Cc: Alessandro Zummo
Cc: "Rafael J.
Wysocki" Cc: Bjorn Helgaas Cc: Grant Likely Cc: Rob Herring Cc: Peter Hüwe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-cmos.c | 24 +++++------------------- 1 file changed, 5 insertions(+), 19 deletions(-) diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index be06d7150de5..24e733c98f8b 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -1018,23 +1018,6 @@ static void __exit cmos_pnp_remove(struct pnp_dev *pnp) cmos_do_remove(&pnp->dev); } -#ifdef CONFIG_PM - -static int cmos_pnp_suspend(struct pnp_dev *pnp, pm_message_t mesg) -{ - return cmos_suspend(&pnp->dev); -} - -static int cmos_pnp_resume(struct pnp_dev *pnp) -{ - return cmos_resume(&pnp->dev); -} - -#else -#define cmos_pnp_suspend NULL -#define cmos_pnp_resume NULL -#endif - static void cmos_pnp_shutdown(struct pnp_dev *pnp) { if (system_state == SYSTEM_POWER_OFF && !cmos_poweroff(&pnp->dev)) @@ -1060,8 +1043,11 @@ static struct pnp_driver cmos_pnp_driver = { /* flag ensures resume() gets called, and stops syslog spam */ .flags = PNP_DRIVER_RES_DO_NOT_CHANGE, - .suspend = cmos_pnp_suspend, - .resume = cmos_pnp_resume, +#ifdef CONFIG_PM_SLEEP + .driver = { + .pm = &cmos_pm_ops, + }, +#endif }; #endif /* CONFIG_PNP */ From a2fa3fb0d9a0169b10789ea3e5ea7168494df93c Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 11 Sep 2013 14:23:13 -0700 Subject: [PATCH 147/303] tpm: convert tpm_tis driver to use dev_pm_ops from legacy pm_ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert drivers/char/tpm/tpm_tis.c to use dev_pm_ops instead of legacy pm_ops. This patch depends on pnp driver bus ops change to invoke pnp_driver dev_pm_ops. Signed-off-by: Shuah Khan Cc: Matthew Garrett Cc: Leonidas Da Silva Barbosa Cc: Ashley Lai Cc: Rajiv Andrade Cc: Marcel Selhorst Cc: Sirrix AG Cc: Alessandro Zummo Cc: "Rafael J. 
Wysocki" Cc: Bjorn Helgaas Cc: Grant Likely Cc: Rob Herring Cc: Peter Hüwe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/tpm/tpm_tis.c | 60 +++++++++++++++----------------------- 1 file changed, 24 insertions(+), 36 deletions(-) diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c index 4519cb332987..5796d0157ce0 100644 --- a/drivers/char/tpm/tpm_tis.c +++ b/drivers/char/tpm/tpm_tis.c @@ -766,6 +766,25 @@ static void tpm_tis_reenable_interrupts(struct tpm_chip *chip) } #endif +#ifdef CONFIG_PM_SLEEP +static int tpm_tis_resume(struct device *dev) +{ + struct tpm_chip *chip = dev_get_drvdata(dev); + int ret; + + if (chip->vendor.irq) + tpm_tis_reenable_interrupts(chip); + + ret = tpm_pm_resume(dev); + if (!ret) + tpm_do_selftest(chip); + + return ret; +} +#endif + +static SIMPLE_DEV_PM_OPS(tpm_tis_pm, tpm_pm_suspend, tpm_tis_resume); + #ifdef CONFIG_PNP static int tpm_tis_pnp_init(struct pnp_dev *pnp_dev, const struct pnp_device_id *pnp_id) @@ -787,26 +806,6 @@ static int tpm_tis_pnp_init(struct pnp_dev *pnp_dev, return tpm_tis_init(&pnp_dev->dev, start, len, irq); } -static int tpm_tis_pnp_suspend(struct pnp_dev *dev, pm_message_t msg) -{ - return tpm_pm_suspend(&dev->dev); -} - -static int tpm_tis_pnp_resume(struct pnp_dev *dev) -{ - struct tpm_chip *chip = pnp_get_drvdata(dev); - int ret; - - if (chip->vendor.irq) - tpm_tis_reenable_interrupts(chip); - - ret = tpm_pm_resume(&dev->dev); - if (!ret) - tpm_do_selftest(chip); - - return ret; -} - static struct pnp_device_id tpm_pnp_tbl[] = { {"PNP0C31", 0}, /* TPM */ {"ATM1200", 0}, /* Atmel */ @@ -835,9 +834,12 @@ static struct pnp_driver tis_pnp_driver = { .name = "tpm_tis", .id_table = tpm_pnp_tbl, .probe = tpm_tis_pnp_init, - .suspend = tpm_tis_pnp_suspend, - .resume = tpm_tis_pnp_resume, .remove = tpm_tis_pnp_remove, +#ifdef CONFIG_PM_SLEEP + .driver = { + .pm = &tpm_tis_pm, + }, +#endif }; #define TIS_HID_USR_IDX sizeof(tpm_pnp_tbl)/sizeof(struct pnp_device_id) -2 @@ -846,20 +848,6 @@ module_param_string(hid, tpm_pnp_tbl[TIS_HID_USR_IDX].id, MODULE_PARM_DESC(hid, "Set additional specific HID for this driver to probe"); #endif -#ifdef CONFIG_PM_SLEEP -static int tpm_tis_resume(struct device *dev) -{ - struct tpm_chip *chip = dev_get_drvdata(dev); - - if (chip->vendor.irq) - tpm_tis_reenable_interrupts(chip); - - return tpm_pm_resume(dev); -} -#endif - -static SIMPLE_DEV_PM_OPS(tpm_tis_pm, tpm_pm_suspend, tpm_tis_resume); - static struct platform_driver tis_drv = { .driver = { .name = "tpm_tis", From 8aa6c2166b5184fb2344062cf2fa229b197c1f84 Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 11 Sep 2013 14:23:15 -0700 Subject: [PATCH 148/303] platform: convert apple-gmux driver to dev_pm_ops from legacy pm_ops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convert drivers/platform/x86/apple-gmux to use dev_pm_ops instead of legacy pm_ops. This patch depends on pnp driver bus ops change to invoke pnp_driver dev_pm_ops. Signed-off-by: Shuah Khan Cc: Matthew Garrett Cc: Leonidas Da Silva Barbosa Cc: Ashley Lai Cc: Rajiv Andrade Cc: Marcel Selhorst Cc: Sirrix AG Cc: Alessandro Zummo Cc: "Rafael J. 
Wysocki" Cc: Bjorn Helgaas Cc: Grant Likely Cc: Rob Herring Cc: Peter Hüwe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/platform/x86/apple-gmux.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/platform/x86/apple-gmux.c b/drivers/platform/x86/apple-gmux.c index f74bfcbb7bad..8eea2efbbb6d 100644 --- a/drivers/platform/x86/apple-gmux.c +++ b/drivers/platform/x86/apple-gmux.c @@ -393,17 +393,21 @@ static void gmux_notify_handler(acpi_handle device, u32 value, void *context) complete(&gmux_data->powerchange_done); } -static int gmux_suspend(struct pnp_dev *pnp, pm_message_t state) +static int gmux_suspend(struct device *dev) { + struct pnp_dev *pnp = to_pnp_dev(dev); struct apple_gmux_data *gmux_data = pnp_get_drvdata(pnp); + gmux_data->resume_client_id = gmux_active_client(gmux_data); gmux_disable_interrupts(gmux_data); return 0; } -static int gmux_resume(struct pnp_dev *pnp) +static int gmux_resume(struct device *dev) { + struct pnp_dev *pnp = to_pnp_dev(dev); struct apple_gmux_data *gmux_data = pnp_get_drvdata(pnp); + gmux_enable_interrupts(gmux_data); gmux_switchto(gmux_data->resume_client_id); if (gmux_data->power_state == VGA_SWITCHEROO_OFF) @@ -605,13 +609,19 @@ static const struct pnp_device_id gmux_device_ids[] = { {"", 0} }; +static const struct dev_pm_ops gmux_dev_pm_ops = { + .suspend = gmux_suspend, + .resume = gmux_resume, +}; + static struct pnp_driver gmux_pnp_driver = { .name = "apple-gmux", .probe = gmux_probe, .remove = gmux_remove, .id_table = gmux_device_ids, - .suspend = gmux_suspend, - .resume = gmux_resume + .driver = { + .pm = &gmux_dev_pm_ops, + }, }; static int __init apple_gmux_init(void) From 20d0e57017b69e7e4ae7166c43f3a3f023ab9702 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Wed, 11 Sep 2013 14:23:17 -0700 Subject: [PATCH 149/303] drivers/firmware/google/gsmi.c: replace strict_strtoul() with kstrtoul() The use of strict_strtoul() is not preferred, because strict_strtoul() is obsolete. Thus, kstrtoul() should be used. Signed-off-by: Jingoo Han Cc: Matt Fleming Cc: Tom Gundersen Cc: Mike Waychison Acked-by: Mike Waychison Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/firmware/google/gsmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/google/gsmi.c b/drivers/firmware/google/gsmi.c index acba0b9f4406..6eb535ffeddc 100644 --- a/drivers/firmware/google/gsmi.c +++ b/drivers/firmware/google/gsmi.c @@ -525,7 +525,7 @@ static ssize_t gsmi_clear_eventlog_store(struct kobject *kobj, u32 data_type; } param; - rc = strict_strtoul(buf, 0, &val); + rc = kstrtoul(buf, 0, &val); if (rc) return rc; From 3ddc5b46a8e90f3c9251338b60191d0a804b0d92 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 11 Sep 2013 14:23:18 -0700 Subject: [PATCH 150/303] kernel-wide: fix missing validations on __get/__put/__copy_to/__copy_from_user() I found the following pattern that leads in to interesting findings: grep -r "ret.*|=.*__put_user" * grep -r "ret.*|=.*__get_user" * grep -r "ret.*|=.*__copy" * The __put_user() calls in compat_ioctl.c, ptrace compat, signal compat, since those appear in compat code, we could probably expect the kernel addresses not to be reachable in the lower 32-bit range, so I think they might not be exploitable. For the "__get_user" cases, I don't think those are exploitable: the worse that can happen is that the kernel will copy kernel memory into in-kernel buffers, and will fail immediately afterward. 
The alpha csum_partial_copy_from_user() seems to be missing the access_ok() check entirely. The fix is inspired from x86. This could lead to information leak on alpha. I also noticed that many architectures map csum_partial_copy_from_user() to csum_partial_copy_generic(), but I wonder if the latter is performing the access checks on every architectures. Signed-off-by: Mathieu Desnoyers Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Jens Axboe Cc: Oleg Nesterov Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/lib/csum_partial_copy.c | 5 +++ arch/sparc/kernel/sys_sparc32.c | 12 +++---- block/compat_ioctl.c | 2 +- kernel/signal.c | 4 +-- net/socket.c | 50 +++++++++++++++--------------- 5 files changed, 39 insertions(+), 34 deletions(-) diff --git a/arch/alpha/lib/csum_partial_copy.c b/arch/alpha/lib/csum_partial_copy.c index 40736da9bea8..ffb19b7da999 100644 --- a/arch/alpha/lib/csum_partial_copy.c +++ b/arch/alpha/lib/csum_partial_copy.c @@ -338,6 +338,11 @@ csum_partial_copy_from_user(const void __user *src, void *dst, int len, unsigned long doff = 7 & (unsigned long) dst; if (len) { + if (!access_ok(VERIFY_READ, src, len)) { + *errp = -EFAULT; + memset(dst, 0, len); + return sum; + } if (!doff) { if (!soff) checksum = csum_partial_cfu_aligned( diff --git a/arch/sparc/kernel/sys_sparc32.c b/arch/sparc/kernel/sys_sparc32.c index 3d0ddbc005fe..71368850dfc0 100644 --- a/arch/sparc/kernel/sys_sparc32.c +++ b/arch/sparc/kernel/sys_sparc32.c @@ -169,10 +169,10 @@ COMPAT_SYSCALL_DEFINE5(rt_sigaction, int, sig, new_ka.ka_restorer = restorer; ret = get_user(u_handler, &act->sa_handler); new_ka.sa.sa_handler = compat_ptr(u_handler); - ret |= __copy_from_user(&set32, &act->sa_mask, sizeof(compat_sigset_t)); + ret |= copy_from_user(&set32, &act->sa_mask, sizeof(compat_sigset_t)); sigset_from_compat(&new_ka.sa.sa_mask, &set32); - ret |= __get_user(new_ka.sa.sa_flags, &act->sa_flags); - ret |= __get_user(u_restorer, &act->sa_restorer); + ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags); + ret |= get_user(u_restorer, &act->sa_restorer); new_ka.sa.sa_restorer = compat_ptr(u_restorer); if (ret) return -EFAULT; @@ -183,9 +183,9 @@ COMPAT_SYSCALL_DEFINE5(rt_sigaction, int, sig, if (!ret && oact) { sigset_to_compat(&set32, &old_ka.sa.sa_mask); ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler); - ret |= __copy_to_user(&oact->sa_mask, &set32, sizeof(compat_sigset_t)); - ret |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags); - ret |= __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer); + ret |= copy_to_user(&oact->sa_mask, &set32, sizeof(compat_sigset_t)); + ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags); + ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer); if (ret) ret = -EFAULT; } diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 7e5d474dc6ba..fbd5a67cb773 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -70,7 +70,7 @@ static int compat_hdio_getgeo(struct gendisk *disk, struct block_device *bdev, return ret; ret = copy_to_user(ugeo, &geo, 4); - ret |= __put_user(geo.start, &ugeo->start); + ret |= put_user(geo.start, &ugeo->start); if (ret) ret = -EFAULT; diff --git a/kernel/signal.c b/kernel/signal.c index 50e41075ac77..ded28b91fa53 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3394,7 +3394,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, new_ka.sa.sa_restorer = compat_ptr(restorer); #endif ret |= copy_from_user(&mask, &act->sa_mask, 
sizeof(mask)); - ret |= __get_user(new_ka.sa.sa_flags, &act->sa_flags); + ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags); if (ret) return -EFAULT; sigset_from_compat(&new_ka.sa.sa_mask, &mask); @@ -3406,7 +3406,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler); ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask)); - ret |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags); + ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags); #ifdef __ARCH_HAS_SA_RESTORER ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer); diff --git a/net/socket.c b/net/socket.c index b2d7c629eeb9..0ceaa5cb9ead 100644 --- a/net/socket.c +++ b/net/socket.c @@ -3072,12 +3072,12 @@ static int compat_sioc_ifmap(struct net *net, unsigned int cmd, uifmap32 = &uifr32->ifr_ifru.ifru_map; err = copy_from_user(&ifr, uifr32, sizeof(ifr.ifr_name)); - err |= __get_user(ifr.ifr_map.mem_start, &uifmap32->mem_start); - err |= __get_user(ifr.ifr_map.mem_end, &uifmap32->mem_end); - err |= __get_user(ifr.ifr_map.base_addr, &uifmap32->base_addr); - err |= __get_user(ifr.ifr_map.irq, &uifmap32->irq); - err |= __get_user(ifr.ifr_map.dma, &uifmap32->dma); - err |= __get_user(ifr.ifr_map.port, &uifmap32->port); + err |= get_user(ifr.ifr_map.mem_start, &uifmap32->mem_start); + err |= get_user(ifr.ifr_map.mem_end, &uifmap32->mem_end); + err |= get_user(ifr.ifr_map.base_addr, &uifmap32->base_addr); + err |= get_user(ifr.ifr_map.irq, &uifmap32->irq); + err |= get_user(ifr.ifr_map.dma, &uifmap32->dma); + err |= get_user(ifr.ifr_map.port, &uifmap32->port); if (err) return -EFAULT; @@ -3088,12 +3088,12 @@ static int compat_sioc_ifmap(struct net *net, unsigned int cmd, if (cmd == SIOCGIFMAP && !err) { err = copy_to_user(uifr32, &ifr, sizeof(ifr.ifr_name)); - err |= __put_user(ifr.ifr_map.mem_start, &uifmap32->mem_start); - err |= __put_user(ifr.ifr_map.mem_end, &uifmap32->mem_end); - err |= __put_user(ifr.ifr_map.base_addr, &uifmap32->base_addr); - err |= __put_user(ifr.ifr_map.irq, &uifmap32->irq); - err |= __put_user(ifr.ifr_map.dma, &uifmap32->dma); - err |= __put_user(ifr.ifr_map.port, &uifmap32->port); + err |= put_user(ifr.ifr_map.mem_start, &uifmap32->mem_start); + err |= put_user(ifr.ifr_map.mem_end, &uifmap32->mem_end); + err |= put_user(ifr.ifr_map.base_addr, &uifmap32->base_addr); + err |= put_user(ifr.ifr_map.irq, &uifmap32->irq); + err |= put_user(ifr.ifr_map.dma, &uifmap32->dma); + err |= put_user(ifr.ifr_map.port, &uifmap32->port); if (err) err = -EFAULT; } @@ -3167,25 +3167,25 @@ static int routing_ioctl(struct net *net, struct socket *sock, struct in6_rtmsg32 __user *ur6 = argp; ret = copy_from_user(&r6.rtmsg_dst, &(ur6->rtmsg_dst), 3 * sizeof(struct in6_addr)); - ret |= __get_user(r6.rtmsg_type, &(ur6->rtmsg_type)); - ret |= __get_user(r6.rtmsg_dst_len, &(ur6->rtmsg_dst_len)); - ret |= __get_user(r6.rtmsg_src_len, &(ur6->rtmsg_src_len)); - ret |= __get_user(r6.rtmsg_metric, &(ur6->rtmsg_metric)); - ret |= __get_user(r6.rtmsg_info, &(ur6->rtmsg_info)); - ret |= __get_user(r6.rtmsg_flags, &(ur6->rtmsg_flags)); - ret |= __get_user(r6.rtmsg_ifindex, &(ur6->rtmsg_ifindex)); + ret |= get_user(r6.rtmsg_type, &(ur6->rtmsg_type)); + ret |= get_user(r6.rtmsg_dst_len, &(ur6->rtmsg_dst_len)); + ret |= get_user(r6.rtmsg_src_len, &(ur6->rtmsg_src_len)); + ret |= get_user(r6.rtmsg_metric, &(ur6->rtmsg_metric)); + ret |= get_user(r6.rtmsg_info, &(ur6->rtmsg_info)); + ret |= get_user(r6.rtmsg_flags, &(ur6->rtmsg_flags)); + ret |= 
get_user(r6.rtmsg_ifindex, &(ur6->rtmsg_ifindex)); r = (void *) &r6; } else { /* ipv4 */ struct rtentry32 __user *ur4 = argp; ret = copy_from_user(&r4.rt_dst, &(ur4->rt_dst), 3 * sizeof(struct sockaddr)); - ret |= __get_user(r4.rt_flags, &(ur4->rt_flags)); - ret |= __get_user(r4.rt_metric, &(ur4->rt_metric)); - ret |= __get_user(r4.rt_mtu, &(ur4->rt_mtu)); - ret |= __get_user(r4.rt_window, &(ur4->rt_window)); - ret |= __get_user(r4.rt_irtt, &(ur4->rt_irtt)); - ret |= __get_user(rtdev, &(ur4->rt_dev)); + ret |= get_user(r4.rt_flags, &(ur4->rt_flags)); + ret |= get_user(r4.rt_metric, &(ur4->rt_metric)); + ret |= get_user(r4.rt_mtu, &(ur4->rt_mtu)); + ret |= get_user(r4.rt_window, &(ur4->rt_window)); + ret |= get_user(r4.rt_irtt, &(ur4->rt_irtt)); + ret |= get_user(rtdev, &(ur4->rt_dev)); if (rtdev) { ret |= copy_from_user(devname, compat_ptr(rtdev), 15); r4.rt_dev = (char __user __force *)devname; From 54a33b1b1470ada14fa2998e8b48ad4a0ef6a916 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 11 Sep 2013 14:23:19 -0700 Subject: [PATCH 151/303] kernel/modsign_pubkey.c: fix init const for module signing code const has to use __initconst, not __initdata Signed-off-by: Andi Kleen Acked-by: David Howells Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/modsign_pubkey.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c index 2b6e69909c39..7cbd4507a7e6 100644 --- a/kernel/modsign_pubkey.c +++ b/kernel/modsign_pubkey.c @@ -18,14 +18,14 @@ struct key *modsign_keyring; -extern __initdata const u8 modsign_certificate_list[]; -extern __initdata const u8 modsign_certificate_list_end[]; +extern __initconst const u8 modsign_certificate_list[]; +extern __initconst const u8 modsign_certificate_list_end[]; /* * We need to make sure ccache doesn't cache the .o file as it doesn't notice * if modsign.pub changes. */ -static __initdata const char annoy_ccache[] = __TIME__ "foo"; +static __initconst const char annoy_ccache[] = __TIME__ "foo"; /* * Load the compiled-in keys From a6b088875b5cfc2be95242826f31523214c083a7 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 11 Sep 2013 14:23:20 -0700 Subject: [PATCH 152/303] lto, watchdog/hpwdt.c: make assembler label global We cannot assume that the inline assembler code always ends up in the same file as the original C file. 
So make global any assembler labels that are referenced from C via
"extern".

Signed-off-by: Andi Kleen
Cc: Wim Van Sebroeck
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/watchdog/hpwdt.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c
index de7e4f497222..5be5e3d14f79 100644
--- a/drivers/watchdog/hpwdt.c
+++ b/drivers/watchdog/hpwdt.c
@@ -162,7 +162,8 @@ extern asmlinkage void asminline_call(struct cmn_registers *pi86Regs,
 #define HPWDT_ARCH 32
 
 asm(".text \n\t"
-    ".align 4 \n"
+    ".align 4 \n\t"
+    ".globl asminline_call \n"
     "asminline_call: \n\t"
     "pushl %ebp \n\t"
     "movl %esp, %ebp \n\t"
@@ -352,7 +353,8 @@ static int detect_cru_service(void)
 #define HPWDT_ARCH 64
 
 asm(".text \n\t"
-    ".align 4 \n"
+    ".align 4 \n\t"
+    ".globl asminline_call \n"
     "asminline_call: \n\t"
     "pushq %rbp \n\t"
     "movq %rsp, %rbp \n\t"

From bc5c8f0783a4a2b43d05155782e71a22a91b26a5 Mon Sep 17 00:00:00 2001
From: Gu Zheng
Date: Wed, 11 Sep 2013 14:23:21 -0700
Subject: [PATCH 153/303] fs/bio-integrity: fix a potential mem leak

Free bio_integrity_pool in the failure path of biovec_create_pool() in
bioset_integrity_create().

Signed-off-by: Gu Zheng
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/bio-integrity.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 8fb42916d8a2..60250847929f 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -716,14 +716,15 @@ int bioset_integrity_create(struct bio_set *bs, int pool_size)
 		return 0;
 
 	bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab);
-
-	bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size);
-	if (!bs->bvec_integrity_pool)
-		return -1;
-
 	if (!bs->bio_integrity_pool)
 		return -1;
 
+	bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size);
+	if (!bs->bvec_integrity_pool) {
+		mempool_destroy(bs->bio_integrity_pool);
+		return -1;
+	}
+
 	return 0;
 }
 EXPORT_SYMBOL(bioset_integrity_create);

From 60c323699bb308404dcb60e8808531e02651578a Mon Sep 17 00:00:00 2001
From: Chen Gang
Date: Wed, 11 Sep 2013 14:23:22 -0700
Subject: [PATCH 154/303] kernel/smp.c: free related resources when failure occurs in hotplug_cfd()

When a failure occurs in hotplug_cfd(), the resources already
allocated must be released; otherwise memory is leaked.
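The patch releases the partial allocations inline; the same unwind
ordering is often written with goto labels instead. A sketch of that
idiom over the same allocation sequence (illustrative only, not the
patch itself):

static int cfd_alloc(struct call_function_data *cfd, int cpu)
{
	if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
				     cpu_to_node(cpu)))
		return -ENOMEM;

	if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL,
				     cpu_to_node(cpu)))
		goto free_cpumask;

	cfd->csd = alloc_percpu(struct call_single_data);
	if (!cfd->csd)
		goto free_cpumask_ipi;

	return 0;

free_cpumask_ipi:
	free_cpumask_var(cfd->cpumask_ipi);	/* unwind in reverse order */
free_cpumask:
	free_cpumask_var(cfd->cpumask);
	return -ENOMEM;
}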
Signed-off-by: Chen Gang Acked-by: Wang YanQing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/kernel/smp.c b/kernel/smp.c index 449b707fc20d..3bb6ae533cdf 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -48,10 +48,13 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) cpu_to_node(cpu))) return notifier_from_errno(-ENOMEM); if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, - cpu_to_node(cpu))) + cpu_to_node(cpu))) { + free_cpumask_var(cfd->cpumask); return notifier_from_errno(-ENOMEM); + } cfd->csd = alloc_percpu(struct call_single_data); if (!cfd->csd) { + free_cpumask_var(cfd->cpumask_ipi); free_cpumask_var(cfd->cpumask); return notifier_from_errno(-ENOMEM); } From c14c338cb05c700a260480c197cfd6da8f8b7d2e Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 11 Sep 2013 14:23:23 -0700 Subject: [PATCH 155/303] kernel/spinlock.c: add default arch_*_relax definitions for GENERIC_LOCKBREAK When running with GENERIC_LOCKBREAK=y, the locking implementations emit calls to arch_{read,write,spin}_relax when spinning on a contended lock in order to allow architectures to favour the CPU owning the lock if possible. In reality, everybody apart from PowerPC and S390 just does cpu_relax() here, so make that the default behaviour and allow it to be overridden if required. Signed-off-by: Will Deacon Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/spinlock.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 5cdd8065a3ce..4b082b5cac9e 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -34,6 +34,20 @@ #else #define raw_read_can_lock(l) read_can_lock(l) #define raw_write_can_lock(l) write_can_lock(l) + +/* + * Some architectures can relax in favour of the CPU owning the lock. + */ +#ifndef arch_read_relax +# define arch_read_relax(l) cpu_relax() +#endif +#ifndef arch_write_relax +# define arch_write_relax(l) cpu_relax() +#endif +#ifndef arch_spin_relax +# define arch_spin_relax(l) cpu_relax() +#endif + /* * We build the __lock_function inlines here. They are too large for * inlining all over the place, but here is only one user per function From fa688207c9db48b64ab6538abc3fcdf26110b9ec Mon Sep 17 00:00:00 2001 From: David Daney Date: Wed, 11 Sep 2013 14:23:24 -0700 Subject: [PATCH 156/303] smp: quit unconditionally enabling irq in on_each_cpu_mask and on_each_cpu_cond As in commit f21afc25f9ed ("smp.h: Use local_irq_{save,restore}() in !SMP version of on_each_cpu()"), we don't want to enable irqs if they are not already enabled. There are currently no known problematical callers of these functions, but since it is a known failure pattern, we preemptively fix them. Since they are not trivial functions, make them non-inline by moving them to up.c. This also makes it so we don't have to fix #include dependancies for preempt_{disable,enable}. 
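A hypothetical caller of the cond variant, to show the intended use of
the interface described above (all names below are illustrative, not
from the patch):

static DEFINE_PER_CPU(int, pending_count);

static bool cpu_has_pending(int cpu, void *info)
{
	return per_cpu(pending_count, cpu) != 0;
}

static void flush_pending(void *info)
{
	__this_cpu_write(pending_count, 0);
	/* ... flush CPU-local state ... */
}

static void flush_all_pending(void)
{
	/* Runs flush_pending() only on CPUs where cpu_has_pending()
	 * returns true, possibly including the local CPU. */
	on_each_cpu_cond(cpu_has_pending, flush_pending, NULL, true,
			 GFP_KERNEL);
}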
Signed-off-by: David Daney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/smp.h | 62 ++++++++++++--------------------------------- kernel/up.c | 39 ++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 46 deletions(-) diff --git a/include/linux/smp.h b/include/linux/smp.h index c8488763277f..3724a9070907 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -29,6 +29,22 @@ extern unsigned int total_cpus; int smp_call_function_single(int cpuid, smp_call_func_t func, void *info, int wait); +/* + * Call a function on processors specified by mask, which might include + * the local one. + */ +void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, + void *info, bool wait); + +/* + * Call a function on each processor for which the supplied function + * cond_func returns a positive value. This may include the local + * processor. + */ +void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), + smp_call_func_t func, void *info, bool wait, + gfp_t gfp_flags); + #ifdef CONFIG_SMP #include @@ -100,22 +116,6 @@ static inline void call_function_init(void) { } */ int on_each_cpu(smp_call_func_t func, void *info, int wait); -/* - * Call a function on processors specified by mask, which might include - * the local one. - */ -void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, - void *info, bool wait); - -/* - * Call a function on each processor for which the supplied function - * cond_func returns a positive value. This may include the local - * processor. - */ -void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), - smp_call_func_t func, void *info, bool wait, - gfp_t gfp_flags); - /* * Mark the boot cpu "online" so that it can call console drivers in * printk() and can access its per-cpu storage. @@ -151,36 +151,6 @@ static inline int on_each_cpu(smp_call_func_t func, void *info, int wait) return 0; } -/* - * Note we still need to test the mask even for UP - * because we actually can get an empty mask from - * code that on SMP might call us without the local - * CPU in the mask. - */ -#define on_each_cpu_mask(mask, func, info, wait) \ - do { \ - if (cpumask_test_cpu(0, (mask))) { \ - local_irq_disable(); \ - (func)(info); \ - local_irq_enable(); \ - } \ - } while (0) -/* - * Preemption is disabled here to make sure the cond_func is called under the - * same condtions in UP and SMP. - */ -#define on_each_cpu_cond(cond_func, func, info, wait, gfp_flags)\ - do { \ - void *__info = (info); \ - preempt_disable(); \ - if ((cond_func)(0, __info)) { \ - local_irq_disable(); \ - (func)(__info); \ - local_irq_enable(); \ - } \ - preempt_enable(); \ - } while (0) - static inline void smp_send_reschedule(int cpu) { } #define smp_prepare_boot_cpu() do {} while (0) #define smp_call_function_many(mask, func, info, wait) \ diff --git a/kernel/up.c b/kernel/up.c index c54c75e9faf7..144e57255234 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -19,3 +19,42 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, return 0; } EXPORT_SYMBOL(smp_call_function_single); + +/* + * Note we still need to test the mask even for UP + * because we actually can get an empty mask from + * code that on SMP might call us without the local + * CPU in the mask. 
+ */ +void on_each_cpu_mask(const struct cpumask *mask, + smp_call_func_t func, void *info, bool wait) +{ + unsigned long flags; + + if (cpumask_test_cpu(0, mask)) { + local_irq_save(flags); + func(info); + local_irq_restore(flags); + } +} +EXPORT_SYMBOL(on_each_cpu_mask); + +/* + * Preemption is disabled here to make sure the cond_func is called under the + * same condtions in UP and SMP. + */ +void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), + smp_call_func_t func, void *info, bool wait, + gfp_t gfp_flags) +{ + unsigned long flags; + + preempt_disable(); + if (cond_func(0, info)) { + local_irq_save(flags); + func(info); + local_irq_restore(flags); + } + preempt_enable(); +} +EXPORT_SYMBOL(on_each_cpu_cond); From 081192b25c2d4620b5f5838620624d3daee94b66 Mon Sep 17 00:00:00 2001 From: David Daney Date: Wed, 11 Sep 2013 14:23:25 -0700 Subject: [PATCH 157/303] up.c: use local_irq_{save,restore}() in smp_call_function_single. The SMP version of this function doesn't unconditionally enable irqs, so neither should this !SMP version. There are no know problems caused by this, but we make the change for consistency's sake. Signed-off-by: David Daney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/up.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/up.c b/kernel/up.c index 144e57255234..b1cf036255f3 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -10,11 +10,13 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, int wait) { + unsigned long flags; + WARN_ON(cpu != 0); - local_irq_disable(); - (func)(info); - local_irq_enable(); + local_irq_save(flags); + func(info); + local_irq_restore(flags); return 0; } From bff2dc42bcafdd75c0296987747f782965d691a0 Mon Sep 17 00:00:00 2001 From: David Daney Date: Wed, 11 Sep 2013 14:23:26 -0700 Subject: [PATCH 158/303] smp.h: move !SMP version of on_each_cpu() out-of-line All of the other non-trivial !SMP versions of functions in smp.h are out-of-line in up.c. Move on_each_cpu() there as well. This allows us to get rid of the #include . The drawback is that this makes both the x86_64 and i386 defconfig !SMP kernels about 200 bytes larger each. Signed-off-by: David Daney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/smp.h | 21 +++++---------------- kernel/up.c | 11 +++++++++++ 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/include/linux/smp.h b/include/linux/smp.h index 3724a9070907..cfb7ca094b38 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -11,7 +11,6 @@ #include #include #include -#include extern void cpu_idle(void); @@ -29,6 +28,11 @@ extern unsigned int total_cpus; int smp_call_function_single(int cpuid, smp_call_func_t func, void *info, int wait); +/* + * Call a function on all processors + */ +int on_each_cpu(smp_call_func_t func, void *info, int wait); + /* * Call a function on processors specified by mask, which might include * the local one. @@ -111,11 +115,6 @@ void generic_smp_call_function_single_interrupt(void); static inline void call_function_init(void) { } #endif -/* - * Call a function on all processors - */ -int on_each_cpu(smp_call_func_t func, void *info, int wait); - /* * Mark the boot cpu "online" so that it can call console drivers in * printk() and can access its per-cpu storage. 
@@ -141,16 +140,6 @@ static inline int up_smp_call_function(smp_call_func_t func, void *info) #define smp_call_function(func, info, wait) \ (up_smp_call_function(func, info)) -static inline int on_each_cpu(smp_call_func_t func, void *info, int wait) -{ - unsigned long flags; - - local_irq_save(flags); - func(info); - local_irq_restore(flags); - return 0; -} - static inline void smp_send_reschedule(int cpu) { } #define smp_prepare_boot_cpu() do {} while (0) #define smp_call_function_many(mask, func, info, wait) \ diff --git a/kernel/up.c b/kernel/up.c index b1cf036255f3..630d72bf7e41 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -22,6 +22,17 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, } EXPORT_SYMBOL(smp_call_function_single); +int on_each_cpu(smp_call_func_t func, void *info, int wait) +{ + unsigned long flags; + + local_irq_save(flags); + func(info); + local_irq_restore(flags); + return 0; +} +EXPORT_SYMBOL(on_each_cpu); + /* * Note we still need to test the mask even for UP * because we actually can get an empty mask from From e656a634118285142063527b2cd40c749036de82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Wed, 11 Sep 2013 14:23:27 -0700 Subject: [PATCH 159/303] extable: skip sorting if the table is empty MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At least on ARM no-MMU the extable is empty and so there is nothing to sort. So add a check for the table to be empty which effectively only changes that the misleading pr_notice is suppressed. Signed-off-by: Uwe Kleine-König Cc: Ingo Molnar Cc: David Daney Cc: "H. Peter Anvin" Cc: Borislav Petkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/extable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/extable.c b/kernel/extable.c index 67460b93b1a1..832cb28105bb 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -41,7 +41,7 @@ u32 __initdata main_extable_sort_needed = 1; /* Sort the kernel's built-in exception table */ void __init sort_main_extable(void) { - if (main_extable_sort_needed) { + if (main_extable_sort_needed && __stop___ex_table > __start___ex_table) { pr_notice("Sorting __ex_table...\n"); sort_extable(__start___ex_table, __stop___ex_table); } From f9597f24c089dcbddbd2d9e99fbf00df57fb70c6 Mon Sep 17 00:00:00 2001 From: Sergei Trofimovich Date: Wed, 11 Sep 2013 14:23:28 -0700 Subject: [PATCH 160/303] syscalls.h: add forward declarations for inplace syscall wrappers Unclutter -Wmissing-prototypes warning types (enabled at make W=1) linux/include/linux/syscalls.h:190:18: warning: no previous prototype for 'SyS_semctl' [-Wmissing-prototypes] asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ ^ linux/include/linux/syscalls.h:183:2: note: in expansion of macro '__SYSCALL_DEFINEx' __SYSCALL_DEFINEx(x, sname, __VA_ARGS__) ^ by adding forward declarations right before definitions. Signed-off-by: Sergei Trofimovich Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compat.h | 1 + include/linux/syscalls.h | 1 + 2 files changed, 2 insertions(+) diff --git a/include/linux/compat.h b/include/linux/compat.h index ec1aee4aec9c..345da00a86e0 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -43,6 +43,7 @@ #define COMPAT_SYSCALL_DEFINEx(x, name, ...) 
\ asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ static inline long C_SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ + asmlinkage long compat_SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));\ asmlinkage long compat_SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))\ { \ return C_SYSC##name(__MAP(x,__SC_DELOUSE,__VA_ARGS__)); \ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 84662ecc7b51..7fac04e7ff6e 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -186,6 +186,7 @@ extern struct trace_event_functions exit_syscall_print_funcs; #define __SYSCALL_DEFINEx(x, name, ...) \ asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ + asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ { \ long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \ From 202da400570d991bacda4a06e878cb901e96a783 Mon Sep 17 00:00:00 2001 From: David Daney Date: Wed, 11 Sep 2013 14:23:29 -0700 Subject: [PATCH 161/303] kernel/smp.c: quit unconditionally enabling irqs in on_each_cpu_mask(). As in commit f21afc25f9ed ("smp.h: Use local_irq_{save,restore}() in !SMP version of on_each_cpu()"), we don't want to enable irqs if they are not already enabled. I don't know of any bugs currently caused by this unconditional local_irq_enable(), but I want to use this function in MIPS/OCTEON early boot (when we have early_boot_irqs_disabled). This also makes this function have similar semantics to on_each_cpu() which is good in itself. Signed-off-by: David Daney Cc: Gilad Ben-Yossef Cc: Christoph Lameter Cc: Chris Metcalf Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smp.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/smp.c b/kernel/smp.c index 3bb6ae533cdf..0564571dcdf7 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -575,8 +575,10 @@ EXPORT_SYMBOL(on_each_cpu); * * If @wait is true, then returns once @func has returned. * - * You must not call this function with disabled interrupts or - * from a hardware interrupt handler or from a bottom half handler. + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. The + * exception is that it may be used during early boot while + * early_boot_irqs_disabled is set. */ void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait) @@ -585,9 +587,10 @@ void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, smp_call_function_many(mask, func, info, wait); if (cpumask_test_cpu(cpu, mask)) { - local_irq_disable(); + unsigned long flags; + local_irq_save(flags); func(info); - local_irq_enable(); + local_irq_restore(flags); } put_cpu(); } From 205e550a0fb469ae73f91a903f27f4f63e774037 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:23:30 -0700 Subject: [PATCH 162/303] task_work: minor cleanups Trivial. Remove the unnecessary "work = NULL" initialization and turn read_barrier_depends() into smp_read_barrier_depends() in task_work_cancel(). 
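For reference, a hedged usage sketch of the task_work interface being
cleaned up here (names are illustrative; the callback runs when the
target task next returns to user mode, or when it exits):

static void my_twork_func(struct callback_head *head)
{
	/* executes in the context of the target task */
}

static int queue_my_twork(struct task_struct *task,
			  struct callback_head *twork)
{
	init_task_work(twork, my_twork_func);
	return task_work_add(task, twork, true);  /* -ESRCH if exiting */
}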
Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/task_work.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/task_work.c b/kernel/task_work.c index 65bd3c92d6f3..6ee09856f725 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -25,7 +25,7 @@ struct callback_head * task_work_cancel(struct task_struct *task, task_work_func_t func) { struct callback_head **pprev = &task->task_works; - struct callback_head *work = NULL; + struct callback_head *work; unsigned long flags; /* * If cmpxchg() fails we continue without updating pprev. @@ -35,7 +35,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func) */ raw_spin_lock_irqsave(&task->pi_lock, flags); while ((work = ACCESS_ONCE(*pprev))) { - read_barrier_depends(); + smp_read_barrier_depends(); if (work->func != func) pprev = &work->next; else if (cmpxchg(pprev, work, work->next) == work) From 892f6668f3a7088c7e30049c3d8e1844531602dc Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:23:31 -0700 Subject: [PATCH 163/303] task_work: documentation No functional changes, just comments. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/task_work.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/kernel/task_work.c b/kernel/task_work.c index 6ee09856f725..8727032e3a6f 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -4,6 +4,23 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ +/** + * task_work_add - ask the @task to execute @work->func() + * @task: the task which should run the callback + * @work: the callback to run + * @notify: send the notification if true + * + * Queue @work for task_work_run() below and notify the @task if @notify. + * Fails if the @task is exiting/exited and thus it can't process this @work. + * Otherwise @work->func() will be called when the @task returns from kernel + * mode or exits. + * + * This is like the signal handler which runs in kernel mode, but it doesn't + * try to wake up the @task. + * + * RETURNS: + * 0 if succeeds or -ESRCH. + */ int task_work_add(struct task_struct *task, struct callback_head *work, bool notify) { @@ -21,6 +38,17 @@ task_work_add(struct task_struct *task, struct callback_head *work, bool notify) return 0; } +/** + * task_work_cancel - cancel a pending work added by task_work_add() + * @task: the task which should execute the work + * @func: identifies the work to remove + * + * Find the last queued pending work with ->func == @func and remove + * it from queue. + * + * RETURNS: + * The found work or NULL if not found. + */ struct callback_head * task_work_cancel(struct task_struct *task, task_work_func_t func) { @@ -46,6 +74,14 @@ task_work_cancel(struct task_struct *task, task_work_func_t func) return work; } +/** + * task_work_run - execute the works added by task_work_add() + * + * Flush the pending works. Should be used by the core kernel code. + * Called before the task returns to the user-mode or stops, or when + * it exits. In the latter case task_work_add() can no longer add the + * new work after task_work_run() returns. 
+ */ void task_work_run(void) { struct task_struct *task = current; From 9cedc3d51f30a7695264d0c5629b584b45e3938d Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:32 -0700 Subject: [PATCH 164/303] MAINTAINERS: EXYNOS: remove board files Commit ca9143501c30 ("ARM: EXYNOS: Remove unused board files") removed the files, remove the patterns too. Signed-off-by: Joe Perches Cc: Tomasz Figa Acked-by: Kyungmin Park Cc: Kukjin Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 -- 1 file changed, 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index d721af119ff9..c281e0f7cf80 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1179,8 +1179,6 @@ L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: arch/arm/mach-s5pv210/mach-aquila.c F: arch/arm/mach-s5pv210/mach-goni.c -F: arch/arm/mach-exynos/mach-universal_c210.c -F: arch/arm/mach-exynos/mach-nuri.c ARM/SAMSUNG S5P SERIES 2D GRAPHICS ACCELERATION (G2D) SUPPORT M: Kyungmin Park From 34b273ecb60c48eb8d9da129696ed216e60071a8 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:33 -0700 Subject: [PATCH 165/303] MAINTAINERS: ARM: OMAP2/3: remove unused clockdomain files Commit 4bd5259e53ac ("ARM: OMAP2/3: clockdomain/PRM/CM: move the low-level clockdomain functions into PRM/CM") deleted the files, update the pattern. Identical to a patch earlier sent by Cesar Eduardo Barros. Signed-off-by: Joe Perches Cc: Paul Walmsley Cc: Rajendra Nayak Cc: Santosh Shilimkar Cc: Cesar Eduardo Barros Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 -- 1 file changed, 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index c281e0f7cf80..930182bf3576 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5968,8 +5968,6 @@ L: linux-omap@vger.kernel.org S: Maintained F: arch/arm/mach-omap2/powerdomain2xxx_3xxx.c F: arch/arm/mach-omap2/powerdomain44xx.c -F: arch/arm/mach-omap2/clockdomain2xxx_3xxx.c -F: arch/arm/mach-omap2/clockdomain44xx.c OMAP AUDIO SUPPORT M: Peter Ujfalusi From d21db568d78cedc059d4f7a706a8502e688341e7 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:35 -0700 Subject: [PATCH 166/303] MAINTAINERS: OMAP POWERDOMAIN, update patterns Commit 498153995b9f ("ARM: OMAP2+: powerdomain/PRM: move the low-level powerdomain") renamed the files, update the patterns. Identical to a patch earlier sent by Cesar Eduardo Barros. Signed-off-by: Joe Perches Cc: Paul Walmsley Cc: Rajendra Nayak Cc: Santosh Shilimkar Cc: Cesar Eduardo Barros Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 930182bf3576..440d5b112377 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5961,13 +5961,12 @@ S: Maintained F: arch/arm/*omap*/*pm* F: drivers/cpufreq/omap-cpufreq.c -OMAP POWERDOMAIN/CLOCKDOMAIN SOC ADAPTATION LAYER SUPPORT +OMAP POWERDOMAIN SOC ADAPTATION LAYER SUPPORT M: Rajendra Nayak M: Paul Walmsley L: linux-omap@vger.kernel.org S: Maintained -F: arch/arm/mach-omap2/powerdomain2xxx_3xxx.c -F: arch/arm/mach-omap2/powerdomain44xx.c +F: arch/arm/mach-omap2/prm* OMAP AUDIO SUPPORT M: Peter Ujfalusi From 15dba38737af134ddcac3270448c5e15eff2e323 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:36 -0700 Subject: [PATCH 167/303] MAINTAINERS: ARM: S3C2410: update patterns Commit 85fd6d63bf29 ("ARM: S3C2410: move mach-s3c2410/* into mach-s3c24xx/") moved the files, update the patterns. 
Signed-off-by: Joe Perches Acked-by: Kukjin Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 440d5b112377..342fe744ac96 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7540,9 +7540,9 @@ P: Vincent Sanders M: Simtec Linux Team W: http://www.simtec.co.uk/products/EB2410ITX/ S: Supported -F: arch/arm/mach-s3c2410/mach-bast.c -F: arch/arm/mach-s3c2410/bast-ide.c -F: arch/arm/mach-s3c2410/bast-irq.c +F: arch/arm/mach-s3c24xx/mach-bast.c +F: arch/arm/mach-s3c24xx/bast-ide.c +F: arch/arm/mach-s3c24xx/bast-irq.c TI DAVINCI MACHINE SUPPORT M: Sekhar Nori From 281e192f5e4db2a0fefcc04de2fabaa7a16294c8 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:37 -0700 Subject: [PATCH 168/303] MAINTAINERS: ARM: spear: consolidate sections Commit a7ed099ffc8e ("ARM: spear: move all files to mach-spear") moved all the files into a single directory, delete the now unnecessary duplicate sections and update the pattern. Signed-off-by: Joe Perches Cc: Arnd Bergmann Acked-by: Viresh Kumar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 342fe744ac96..3f83a6de463e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7815,35 +7815,7 @@ L: spear-devel@list.st.com L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) W: http://www.st.com/spear S: Maintained -F: arch/arm/plat-spear/ - -SPEAR13XX MACHINE SUPPORT -M: Viresh Kumar -M: Shiraz Hashim -L: spear-devel@list.st.com -L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -W: http://www.st.com/spear -S: Maintained -F: arch/arm/mach-spear13xx/ - -SPEAR3XX MACHINE SUPPORT -M: Viresh Kumar -M: Shiraz Hashim -L: spear-devel@list.st.com -L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -W: http://www.st.com/spear -S: Maintained -F: arch/arm/mach-spear3xx/ - -SPEAR6XX MACHINE SUPPORT -M: Rajeev Kumar -M: Shiraz Hashim -M: Viresh Kumar -L: spear-devel@list.st.com -L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -W: http://www.st.com/spear -S: Maintained -F: arch/arm/mach-spear6xx/ +F: arch/arm/mach-spear/ SPEAR CLOCK FRAMEWORK SUPPORT M: Viresh Kumar From 5173413a15775d65982ab8a5f8e905a2aa529a87 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:38 -0700 Subject: [PATCH 169/303] MAINTAINERS: ARM: plat-nomadik: update patterns Commit 694e33a7f42d ("ARM: plat-nomadik: move MTU, kill plat-nomadik") moved the files, update the patterns. 
Signed-off-by: Joe Perches Reviewed-by: Linus Walleij Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 3f83a6de463e..f1d176cd68e0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1048,7 +1048,6 @@ M: STEricsson L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: arch/arm/mach-nomadik/ -F: arch/arm/plat-nomadik/ F: drivers/i2c/busses/i2c-nomadik.c T: git git://git.kernel.org/pub/scm/linux/kernel/git/linusw/linux-nomadik.git From c54ec9d3698b006fad5e942ae169fb5f079ccfc4 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:39 -0700 Subject: [PATCH 170/303] MAINTAINERS: ARM: S3C24XX: remove plat-s3c24xx Commit 09ec1d7ea67f ("ARM: S3C24XX: Remove plat-s3c24xx directory in arch/arm/") moved the files, remove the pattern. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index f1d176cd68e0..064daa840244 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1155,7 +1155,6 @@ L: linux-samsung-soc@vger.kernel.org (moderated for non-subscribers) W: http://www.fluff.org/ben/linux/ S: Maintained F: arch/arm/plat-samsung/ -F: arch/arm/plat-s3c24xx/ F: arch/arm/mach-s3c24*/ F: arch/arm/mach-s3c64xx/ F: drivers/*/*s3c2410* From 2caa67a652c95c0175b7fb645e2924dad726b1ba Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:40 -0700 Subject: [PATCH 171/303] MAINTAINERS: ghes_edac: update pattern Commit 77c5f5d2f212 ("ghes_edac: Register at EDAC core the BIOS report") typoed the file pattern. Fix it. Signed-off-by: Joe Perches Cc: Mauro Carvalho Chehab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 064daa840244..141a0710c3ff 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3034,7 +3034,7 @@ M: Mauro Carvalho Chehab L: linux-edac@vger.kernel.org W: bluesmoke.sourceforge.net S: Maintained -F: drivers/edac/ghes-edac.c +F: drivers/edac/ghes_edac.c EDAC-I82443BXGX M: Tim Small From 81a66488596f81d412c0e391a081f8b6778cdc9c Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:41 -0700 Subject: [PATCH 172/303] MAINTAINERS: update SIANO drivers Commit 786baecfe78f ("[media] dvb-usb: move it to drivers/media/usb/dvb-usb") moved the files, update the pattern. Signed-off-by: Joe Perches Cc: Mauro Carvalho Chehab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 141a0710c3ff..082272821476 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7498,7 +7498,7 @@ W: http://linuxtv.org T: git git://linuxtv.org/media_tree.git S: Odd fixes F: drivers/media/common/siano/ -F: drivers/media/dvb/siano/ +F: drivers/media/usb/siano/ F: drivers/media/usb/siano/ F: drivers/media/mmc/siano From 559cdc828b51cace87bd297f5b92a9e8da9e1ba4 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:42 -0700 Subject: [PATCH 173/303] MAINTAINERS: SI4713: fix file pattern Commit c937ca034a03 ("[media] MAINTAINERS: Add maintainer entry for si4713 FM transmitter driver") typoed the pattern, fix it. 
Signed-off-by: Joe Perches Acked-by: Eduardo Valentin Cc: Mauro Carvalho Chehab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 082272821476..4835a9bdf95d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7489,7 +7489,7 @@ L: linux-media@vger.kernel.org T: git git://linuxtv.org/media_tree.git W: http://linuxtv.org S: Odd Fixes -F: drivers/media/radio/radio-si4713.h +F: drivers/media/radio/radio-si4713.c SIANO DVB DRIVER M: Mauro Carvalho Chehab From 9d9fb74499c3ae468691d1118d3169da8a5b3857 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:43 -0700 Subject: [PATCH 174/303] MAINTAINERS: update it913x patterns Commit d7104bffcfb7 ("[media] MAINTAINERS: add drivers/media/tuners/it913x*") used the incorrect file patterns. Fix it. Signed-off-by: Joe Perches Acked-by: Antti Palosaari Cc: Mauro Carvalho Chehab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 4835a9bdf95d..198a228caaaa 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4623,7 +4623,7 @@ W: http://palosaari.fi/linux/ Q: http://patchwork.linuxtv.org/project/linux-media/list/ T: git git://linuxtv.org/anttip/media_tree.git S: Maintained -F: drivers/media/tuners/it913x* +F: drivers/media/tuners/tuner_it913x* IVTV VIDEO4LINUX DRIVER M: Andy Walls From 31a12b317804720e75bf68accc8f12fac24fcf64 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:44 -0700 Subject: [PATCH 175/303] MAINTAINERS: update ssbi patterns Commit 45fcac1aad5d ("mfd: Move ssbi driver into drivers/mfd") moved the files, update the patterns. Signed-off-by: Joe Perches Cc: Arnd Bergmann Cc: Samuel Ortiz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 198a228caaaa..5804efdea32a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1069,7 +1069,7 @@ F: drivers/mmc/host/msm_sdcc.h F: drivers/tty/serial/msm_serial.h F: drivers/tty/serial/msm_serial.c F: drivers/*/pm8???-* -F: drivers/ssbi/ +F: drivers/mfd/ssbi/ F: include/linux/mfd/pm8xxx/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/davidb/linux-msm.git S: Maintained From c6a0fe4a08c5190c8e36547526529a21c3597149 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:45 -0700 Subject: [PATCH 176/303] MAINTAINERS: update file pattern for ARC uart Commit 6659a20a76e0 ("ARC: MAINTAINERS update for ARC") typoed the file pattern. Fix it. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 5804efdea32a..a26049f63e7f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8082,7 +8082,7 @@ M: Vineet Gupta S: Supported F: arch/arc/ F: Documentation/devicetree/bindings/arc/ -F: drivers/tty/serial/arc-uart.c +F: drivers/tty/serial/arc_uart.c SYSV FILESYSTEM M: Christoph Hellwig From 4f31102bbba356b907dc4ab2ddaf243a132b9ffd Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:46 -0700 Subject: [PATCH 177/303] MAINTAINERS: update USB EHCI platform pattern Commit f3bc64d6d1f2 ("USB: EHCI: DT support for generic bus glue") removed the ehci-vt8500.c file, update the file pattern to include ehci-platform.c.
Signed-off-by: Joe Perches Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index a26049f63e7f..0a8418219c61 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1321,7 +1321,7 @@ F: drivers/mmc/host/wmt-sdmmc.c F: drivers/pwm/pwm-vt8500.c F: drivers/rtc/rtc-vt8500.c F: drivers/tty/serial/vt8500_serial.c -F: drivers/usb/host/ehci-vt8500.c +F: drivers/usb/host/ehci-platform.c F: drivers/usb/host/uhci-platform.c F: drivers/video/vt8500lcdfb.* F: drivers/video/wm8505fb* From 89f55bd74de13b45a239b2088bfcd803b0cf53c3 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:47 -0700 Subject: [PATCH 178/303] MAINTAINERS: usb: phy: update patterns Commit a0e631235a04 ("usb: phy: move all PHY drivers to drivers/usb/phy/") deleted the files, remove the file pattern. Signed-off-by: Joe Perches Acked-by: Felipe Balbi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 0a8418219c61..5221b953ee19 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8772,7 +8772,6 @@ L: linux-usb@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/balbi/usb.git S: Maintained F: drivers/usb/phy/ -F: drivers/usb/otg/ USB PRINTER DRIVER (usblp) M: Pete Zaitcev From 11c26770eb0296956a6b17595e26f3a8eab3677a Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:48 -0700 Subject: [PATCH 179/303] MAINTAINERS: update GRE DEMUX patterns Commit c50cd357887a ("net: gre: move GSO functions to gre_offload") renamed and separated the file into multiple files. Update the patterns. Signed-off-by: Joe Perches Cc: Dmitry Kozlov Cc: Daniel Borkmann Cc: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 5221b953ee19..516777f7d6af 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3683,7 +3683,8 @@ GRE DEMULTIPLEXER DRIVER M: Dmitry Kozlov L: netdev@vger.kernel.org S: Maintained -F: net/ipv4/gre.c +F: net/ipv4/gre_demux.c +F: net/ipv4/gre_offload.c F: include/net/gre.h GRETH 10/100/1G Ethernet MAC device driver From af4b8e371b0f692a89f37d7ae5b23e8804c81a17 Mon Sep 17 00:00:00 2001 From: Christian Daudt Date: Wed, 11 Sep 2013 14:23:49 -0700 Subject: [PATCH 180/303] MAINTAINERS: add mach-bcm and drivers Add ownership to maintainers file for the mach-bcm related files, including drivers that are used for the SoCs defined in mach-bcm. 
Signed-off-by: Christian Daudt Cc: Olof Johansson Cc: Arnd Bergmann Cc: Stephen Warren Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 516777f7d6af..726d22258697 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1811,6 +1811,17 @@ L: netdev@vger.kernel.org S: Supported F: drivers/net/ethernet/broadcom/bnx2x/ +BROADCOM BCM281XX/BCM11XXX ARM ARCHITECTURE +M: Christian Daudt +T: git git://git.github.com/broadcom/bcm11351 +S: Maintained +F: arch/arm/mach-bcm/ +F: arch/arm/boot/dts/bcm113* +F: arch/arm/boot/dts/bcm281* +F: arch/arm/configs/bcm_defconfig +F: drivers/mmc/host/sdhci_bcm_kona.c +F: drivers/clocksource/bcm_kona_timer.c + BROADCOM BCM2835 ARM ARCHICTURE M: Stephen Warren L: linux-rpi-kernel@lists.infradead.org (moderated for non-subscribers) From 144308139cdce95e9c6cff3cc0ef12242181665f Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:50 -0700 Subject: [PATCH 181/303] MAINTAINERS: append "/" to directory patterns It's clearer to have patterns marked as directories. Change the directory patterns without terminating slashes. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 726d22258697..f8c41ae6c9a4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1028,7 +1028,7 @@ F: arch/arm/mach-orion5x/ts78xx-* ARM/MICREL KS8695 ARCHITECTURE M: Greg Ungerer L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) -F: arch/arm/mach-ks8695 +F: arch/arm/mach-ks8695/ S: Odd Fixes ARM/MIOA701 MACHINE SUPPORT @@ -2042,10 +2042,10 @@ W: http://ceph.com/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git S: Supported F: Documentation/filesystems/ceph.txt -F: fs/ceph -F: net/ceph -F: include/linux/ceph -F: include/linux/crush +F: fs/ceph/ +F: net/ceph/ +F: include/linux/ceph/ +F: include/linux/crush/ CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM: L: linux-usb@vger.kernel.org @@ -2342,7 +2342,7 @@ CPU POWER MONITORING SUBSYSTEM M: Dominik Brodowski M: Thomas Renninger S: Maintained -F: tools/power/cpupower +F: tools/power/cpupower/ CPUSETS M: Li Zefan @@ -2780,7 +2780,7 @@ L: intel-gfx@lists.freedesktop.org L: dri-devel@lists.freedesktop.org T: git git://people.freedesktop.org/~danvet/drm-intel S: Supported -F: drivers/gpu/drm/i915 +F: drivers/gpu/drm/i915/ F: include/drm/i915* F: include/uapi/drm/i915* @@ -2792,7 +2792,7 @@ M: Kyungmin Park L: dri-devel@lists.freedesktop.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/daeinki/drm-exynos.git S: Supported -F: drivers/gpu/drm/exynos +F: drivers/gpu/drm/exynos/ F: include/drm/exynos* F: include/uapi/drm/exynos* @@ -3651,8 +3651,8 @@ M: Arnd Bergmann L: linux-arch@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/arnd/asm-generic.git S: Maintained -F: include/asm-generic -F: include/uapi/asm-generic +F: include/asm-generic/ +F: include/uapi/asm-generic/ GENERIC UIO DRIVER FOR PCI DEVICES M: "Michael S. 
Tsirkin" @@ -3773,7 +3773,7 @@ L: linux-media@vger.kernel.org T: git git://linuxtv.org/media_tree.git W: http://linuxtv.org S: Odd Fixes -F: drivers/media/usb/hdpvr +F: drivers/media/usb/hdpvr/ HWPOISON MEMORY FAILURE HANDLING M: Andi Kleen @@ -4581,7 +4581,7 @@ S: Supported W: http://www.openfabrics.org W: www.open-iscsi.org Q: http://patchwork.kernel.org/project/linux-rdma/list/ -F: drivers/infiniband/ulp/iser +F: drivers/infiniband/ulp/iser/ ISDN SUBSYSTEM M: Karsten Keil @@ -6142,7 +6142,7 @@ W: http://openrisc.net L: linux@lists.openrisc.net (moderated for non-subscribers) S: Maintained T: git git://openrisc.net/~jonas/linux -F: arch/openrisc +F: arch/openrisc/ OPENVSWITCH M: Jesse Gross @@ -6433,7 +6433,7 @@ M: Jamie Iles L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) T: git git://github.com/jamieiles/linux-2.6-ji.git S: Supported -F: arch/arm/mach-picoxcell +F: arch/arm/mach-picoxcell/ F: drivers/*/picoxcell* F: drivers/*/*/picoxcell* @@ -6706,7 +6706,7 @@ F: drivers/spi/spi-pxa2xx* F: drivers/usb/gadget/pxa2* F: include/sound/pxa2xx-lib.h F: sound/arm/pxa* -F: sound/soc/pxa +F: sound/soc/pxa/ MMP SUPPORT M: Eric Miao @@ -7159,7 +7159,7 @@ SAMSUNG AUDIO (ASoC) DRIVERS M: Sangbeom Kim L: alsa-devel@alsa-project.org (moderated for non-subscribers) S: Supported -F: sound/soc/samsung +F: sound/soc/samsung/ SAMSUNG FRAMEBUFFER DRIVER M: Jingoo Han @@ -7205,7 +7205,7 @@ SERIAL DRIVERS M: Greg Kroah-Hartman L: linux-serial@vger.kernel.org S: Maintained -F: drivers/tty/serial +F: drivers/tty/serial/ SYNOPSYS DESIGNWARE DMAC DRIVER M: Viresh Kumar @@ -7240,7 +7240,7 @@ TLG2300 VIDEO4LINUX-2 DRIVER M: Huang Shijie M: Hans Verkuil S: Odd Fixes -F: drivers/media/usb/tlg2300 +F: drivers/media/usb/tlg2300/ SC1200 WDT DRIVER M: Zwane Mwaikambo @@ -7512,7 +7512,7 @@ S: Odd fixes F: drivers/media/common/siano/ F: drivers/media/usb/siano/ F: drivers/media/usb/siano/ -F: drivers/media/mmc/siano +F: drivers/media/mmc/siano/ SH_VEU V4L2 MEM2MEM DRIVER M: Guennadi Liakhovetski @@ -7561,7 +7561,7 @@ L: davinci-linux-open-source@linux.davincidsp.com (moderated for non-subscribers T: git git://gitorious.org/linux-davinci/linux-davinci.git Q: http://patchwork.kernel.org/project/linux-davinci/list/ S: Supported -F: arch/arm/mach-davinci +F: arch/arm/mach-davinci/ F: drivers/i2c/busses/i2c-davinci.c TI DAVINCI SERIES MEDIA DRIVER @@ -7646,7 +7646,7 @@ SMIA AND SMIA++ IMAGE SENSOR DRIVER M: Sakari Ailus L: linux-media@vger.kernel.org S: Maintained -F: drivers/media/i2c/smiapp +F: drivers/media/i2c/smiapp/ F: include/media/smiapp.h F: drivers/media/i2c/smiapp-pll.c F: drivers/media/i2c/smiapp-pll.h @@ -9314,7 +9314,7 @@ M: Matthew Garrett L: platform-driver-x86@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/mjg59/platform-drivers-x86.git S: Maintained -F: drivers/platform/x86 +F: drivers/platform/x86/ X86 MCE INFRASTRUCTURE M: Tony Luck From 5ab58acc40ead5521cbafcd1cc7f35170ccceeee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Emilio=20L=C3=B3pez?= Date: Wed, 11 Sep 2013 14:23:51 -0700 Subject: [PATCH 182/303] lib/genalloc.c: correct dev_get_gen_pool documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The documentation mentions a "name" parameter, which does not exist. This commit removes such mention from the function documentation. 
Signed-off-by: Emilio López Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/genalloc.c | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/genalloc.c b/lib/genalloc.c index c522facfa3e5..26cf20be72b7 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -524,7 +524,6 @@ struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order, /** * dev_get_gen_pool - Obtain the gen_pool (if any) for a device * @dev: device to retrieve the gen_pool from - * @name: Optional name for the gen_pool, usually NULL * * Returns the gen_pool for the device if one is present, or NULL. */ From f2e1d2ac344bcee83f6496a641d37e86d55e33d4 Mon Sep 17 00:00:00 2001 From: Gu Zheng Date: Wed, 11 Sep 2013 14:23:52 -0700 Subject: [PATCH 183/303] lib/crc32: update the comments of crc32_{be,le}_generic() [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Gu Zheng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/crc32.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index 072fbd8234d5..410093dbe51c 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -131,11 +131,14 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) #endif /** - * crc32_le() - Calculate bitwise little-endian Ethernet AUTODIN II CRC32 - * @crc: seed value for computation. ~0 for Ethernet, sometimes 0 for - * other uses, or the previous crc32 value if computing incrementally. - * @p: pointer to buffer over which CRC is run + * crc32_le_generic() - Calculate bitwise little-endian Ethernet AUTODIN II + * CRC32/CRC32C + * @crc: seed value for computation. ~0 for Ethernet, sometimes 0 for other + * uses, or the previous crc32/crc32c value if computing incrementally. + * @p: pointer to buffer over which CRC32/CRC32C is run * @len: length of buffer @p + * @tab: little-endian Ethernet table + * @polynomial: CRC32/CRC32c LE polynomial */ static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p, size_t len, const u32 (*tab)[256], @@ -201,11 +204,13 @@ EXPORT_SYMBOL(crc32_le); EXPORT_SYMBOL(__crc32c_le); /** - * crc32_be() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32 + * crc32_be_generic() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32 * @crc: seed value for computation. ~0 for Ethernet, sometimes 0 for * other uses, or the previous crc32 value if computing incrementally. - * @p: pointer to buffer over which CRC is run + * @p: pointer to buffer over which CRC32 is run * @len: length of buffer @p + * @tab: big-endian Ethernet table + * @polynomial: CRC32 BE polynomial */ static inline u32 __pure crc32_be_generic(u32 crc, unsigned char const *p, size_t len, const u32 (*tab)[256], From 1431574a1c4c669a0c198e4763627837416e4443 Mon Sep 17 00:00:00 2001 From: Alexandre Courbot Date: Wed, 11 Sep 2013 14:23:53 -0700 Subject: [PATCH 184/303] lib/decompressors: fix "no limit" output buffer length When decompressing into memory, the output buffer length is set to some arbitrarily high value (0x7fffffff) to indicate the output is, virtually, unlimited in size. The problem with this is that some platforms have their physical memory at high physical addresses (0x80000000 or more), and that the output buffer address and its "unlimited" length cannot be added without overflowing. An example of this can be found in inflate_fast(): /* next_out is the output buffer address */ out = strm->next_out - OFF; /* avail_out is the output buffer size. 
end will overflow if the output * address is >= 0x80000104 */ end = out + (strm->avail_out - 257); This has huge consequences on the performance of kernel decompression, since the following exit condition of inflate_fast() will be always true: } while (in < last && out < end); Indeed, "end" has overflowed and is now always lower than "out". As a result, inflate_fast() will return after processing one single byte of input data, and will thus need to be called an unreasonably high number of times. This probably went unnoticed because kernel decompression is fast enough even with this issue. Nonetheless, adjusting the output buffer length in such a way that the above pointer arithmetic never overflows results in a kernel decompression that is about 3 times faster on affected machines. Signed-off-by: Alexandre Courbot Tested-by: Jon Medhurst Cc: Stephen Warren Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/decompress_inflate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/decompress_inflate.c b/lib/decompress_inflate.c index 19ff89e34eec..d619b28c456f 100644 --- a/lib/decompress_inflate.c +++ b/lib/decompress_inflate.c @@ -48,7 +48,7 @@ STATIC int INIT gunzip(unsigned char *buf, int len, out_len = 0x8000; /* 32 K */ out_buf = malloc(out_len); } else { - out_len = 0x7fffffff; /* no limit */ + out_len = ((size_t)~0) - (size_t)out_buf; /* no limit */ } if (!out_buf) { error("Out of memory while allocating output buffer"); From d5e616fc1c1dd673c53b682877e2d35a2862263c Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:54 -0700 Subject: [PATCH 185/303] checkpatch: add a few more --fix corrections Suggest a few more single-line corrections. Remove DOS line endings Simplify removing trailing whitespace Remove global/static initializations to 0/NULL Convert pr_warning to pr_warn Add space after brace Convert binary constants to hex Remove whitespace after line continuation Use inline not __inline or __inline__ Use __printf and __scanf Use a single ; for statement terminations Convert __FUNCTION__ to __func__ Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 122 +++++++++++++++++++++++++++++------------- 1 file changed, 84 insertions(+), 38 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 2ee9eb750560..9163651edc50 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1845,15 +1845,17 @@ sub process { #trailing whitespace if ($line =~ /^\+.*\015/) { my $herevet = "$here\n" . cat_vet($rawline) . "\n"; - ERROR("DOS_LINE_ENDINGS", - "DOS line endings\n" . $herevet); - + if (ERROR("DOS_LINE_ENDINGS", + "DOS line endings\n" . $herevet) && + $fix) { + $fixed[$linenr - 1] =~ s/[\s\015]+$//; + } } elsif ($rawline =~ /^\+.*\S\s+$/ || $rawline =~ /^\+\s+$/) { my $herevet = "$here\n" . cat_vet($rawline) . "\n"; if (ERROR("TRAILING_WHITESPACE", "trailing whitespace\n" . $herevet) && $fix) { - $fixed[$linenr - 1] =~ s/^(\+.*?)\s+$/$1/; + $fixed[$linenr - 1] =~ s/\s+$//; } $rpt_cleaners = 1; @@ -2486,16 +2488,22 @@ sub process { } # check for global initialisers. - if ($line =~ /^.$Type\s*$Ident\s*(?:\s+$Modifier)*\s*=\s*(0|NULL|false)\s*;/) { - ERROR("GLOBAL_INITIALISERS", - "do not initialise globals to 0 or NULL\n" . - $herecurr); + if ($line =~ /^\+(\s*$Type\s*$Ident\s*(?:\s+$Modifier))*\s*=\s*(0|NULL|false)\s*;/) { + if (ERROR("GLOBAL_INITIALISERS", + "do not initialise globals to 0 or NULL\n" . 
+ $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ s/($Type\s*$Ident\s*(?:\s+$Modifier))*\s*=\s*(0|NULL|false)\s*;/$1;/; + } } # check for static initialisers. - if ($line =~ /\bstatic\s.*=\s*(0|NULL|false)\s*;/) { - ERROR("INITIALISED_STATIC", - "do not initialise statics to 0 or NULL\n" . - $herecurr); + if ($line =~ /^\+.*\bstatic\s.*=\s*(0|NULL|false)\s*;/) { + if (ERROR("INITIALISED_STATIC", + "do not initialise statics to 0 or NULL\n" . + $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ s/(\bstatic\s.*?)\s*=\s*(0|NULL|false)\s*;/$1;/; + } } # check for static const char * arrays. @@ -2638,8 +2646,12 @@ sub process { } if ($line =~ /\bpr_warning\s*\(/) { - WARN("PREFER_PR_LEVEL", - "Prefer pr_warn(... to pr_warning(...\n" . $herecurr); + if (WARN("PREFER_PR_LEVEL", + "Prefer pr_warn(... to pr_warning(...\n" . $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ + s/\bpr_warning\b/pr_warn/; + } } if ($line =~ /\bdev_printk\s*\(\s*KERN_([A-Z]+)/) { @@ -3031,8 +3043,7 @@ sub process { if (ERROR("SPACING", "space required before the open brace '{'\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ - s/^(\+.*(?:do|\))){/$1 {/; + $fixed[$linenr - 1] =~ s/^(\+.*(?:do|\))){/$1 {/; } } @@ -3047,8 +3058,12 @@ sub process { # closing brace should have a space following it when it has anything # on the line if ($line =~ /}(?!(?:,|;|\)))\S/) { - ERROR("SPACING", - "space required after that close brace '}'\n" . $herecurr); + if (ERROR("SPACING", + "space required after that close brace '}'\n" . $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ + s/}((?!(?:,|;|\)))\S)/} $1/; + } } # check spacing on square brackets @@ -3271,8 +3286,13 @@ sub process { #gcc binary extension if ($var =~ /^$Binary$/) { - WARN("GCC_BINARY_CONSTANT", - "Avoid gcc v4.3+ binary constant extension: <$var>\n" . $herecurr); + if (WARN("GCC_BINARY_CONSTANT", + "Avoid gcc v4.3+ binary constant extension: <$var>\n" . $herecurr) && + $fix) { + my $hexval = sprintf("0x%x", oct($var)); + $fixed[$linenr - 1] =~ + s/\b$var\b/$hexval/; + } } #CamelCase @@ -3292,9 +3312,12 @@ sub process { } #no spaces allowed after \ in define - if ($line=~/\#\s*define.*\\\s$/) { - WARN("WHITESPACE_AFTER_LINE_CONTINUATION", - "Whitepspace after \\ makes next lines useless\n" . $herecurr); + if ($line =~ /\#\s*define.*\\\s+$/) { + if (WARN("WHITESPACE_AFTER_LINE_CONTINUATION", + "Whitespace after \\ makes next lines useless\n" . $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ s/\s+$//; + } } #warn if is #included and is available (uses RAW line) @@ -3691,8 +3714,12 @@ sub process { # Check for __inline__ and __inline, prefer inline if ($line =~ /\b(__inline__|__inline)\b/) { - WARN("INLINE", - "plain inline is preferred over $1\n" . $herecurr); + if (WARN("INLINE", + "plain inline is preferred over $1\n" . $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ s/\b(__inline__|__inline)\b/inline/; + + } } # Check for __attribute__ packed, prefer __packed @@ -3709,14 +3736,21 @@ sub process { # Check for __attribute__ format(printf, prefer __printf if ($line =~ /\b__attribute__\s*\(\s*\(\s*format\s*\(\s*printf/) { - WARN("PREFER_PRINTF", - "__printf(string-index, first-to-check) is preferred over __attribute__((format(printf, string-index, first-to-check)))\n" . $herecurr); + if (WARN("PREFER_PRINTF", + "__printf(string-index, first-to-check) is preferred over __attribute__((format(printf, string-index, first-to-check)))\n" . $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ s/\b__attribute__\s*\(\s*\(\s*format\s*\(\s*printf\s*,\s*(.*)\)\s*\)\s*\)/"__printf(" . 
trim($1) . ")"/ex; + + } } # Check for __attribute__ format(scanf, prefer __scanf if ($line =~ /\b__attribute__\s*\(\s*\(\s*format\s*\(\s*scanf\b/) { - WARN("PREFER_SCANF", - "__scanf(string-index, first-to-check) is preferred over __attribute__((format(scanf, string-index, first-to-check)))\n" . $herecurr); + if (WARN("PREFER_SCANF", + "__scanf(string-index, first-to-check) is preferred over __attribute__((format(scanf, string-index, first-to-check)))\n" . $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ s/\b__attribute__\s*\(\s*\(\s*format\s*\(\s*scanf\s*,\s*(.*)\)\s*\)\s*\)/"__scanf(" . trim($1) . ")"/ex; + } } # check for sizeof(&) @@ -3727,8 +3761,11 @@ sub process { # check for sizeof without parenthesis if ($line =~ /\bsizeof\s+((?:\*\s*|)$Lval|$Type(?:\s+$Lval|))/) { - WARN("SIZEOF_PARENTHESIS", - "sizeof $1 should be sizeof($1)\n" . $herecurr); + if (WARN("SIZEOF_PARENTHESIS", + "sizeof $1 should be sizeof($1)\n" . $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ s/\bsizeof\s+((?:\*\s*|)$Lval|$Type(?:\s+$Lval|))/"sizeof(" . trim($1) . ")"/ex; + } } # check for line continuations in quoted strings with odd counts of " @@ -3747,8 +3784,11 @@ sub process { if ($line =~ /\bseq_printf\s*\(/) { my $fmt = get_quoted_string($line, $rawline); if ($fmt !~ /[^\\]\%/) { - WARN("PREFER_SEQ_PUTS", - "Prefer seq_puts to seq_printf\n" . $herecurr); + if (WARN("PREFER_SEQ_PUTS", + "Prefer seq_puts to seq_printf\n" . $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ s/\bseq_printf\b/seq_puts/; + } } } @@ -3879,8 +3919,11 @@ sub process { # check for multiple semicolons if ($line =~ /;\s*;\s*$/) { - WARN("ONE_SEMICOLON", - "Statements terminations use 1 semicolon\n" . $herecurr); + if (WARN("ONE_SEMICOLON", + "Statements terminations use 1 semicolon\n" . $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ s/(\s*;\s*){2,}$/;/g; + } } # check for switch/default statements without a break; @@ -3898,9 +3941,12 @@ sub process { } # check for gcc specific __FUNCTION__ - if ($line =~ /__FUNCTION__/) { - WARN("USE_FUNC", - "__func__ should be used instead of gcc specific __FUNCTION__\n" . $herecurr); + if ($line =~ /\b__FUNCTION__\b/) { + if (WARN("USE_FUNC", + "__func__ should be used instead of gcc specific __FUNCTION__\n" . $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ s/\b__FUNCTION__\b/__func__/g; + } } # check for use of yield() From 7e781f67df436b67753a65436c0fef0a0ebf5043 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:55 -0700 Subject: [PATCH 186/303] checkpatch: check CamelCase by word, not by $Lval $Lval is a test for complete name (ie: foo->bar.Baz[1]) If any of this is CamelCase, then the current test uses the entire $Lval. This isn't optimal because it can emit messages with foo->bar.Baz and bar.Baz when Baz is a variable specified in an include file. So instead, break the $Lval into words and check each word for CamelCase uses. 
Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 9163651edc50..6b409b0ad457 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3302,11 +3302,15 @@ sub process { $var !~ /^(?:Clear|Set|TestClear|TestSet|)Page[A-Z]/ && #Ignore SI style variants like nS, mV and dB (ie: max_uV, regulator_min_uA_show) $var !~ /^(?:[a-z_]*?)_?[a-z][A-Z](?:_[a-z_]+)?$/) { - seed_camelcase_includes() if ($check); - if (!defined $camelcase{$var}) { - $camelcase{$var} = 1; - CHK("CAMELCASE", - "Avoid CamelCase: <$var>\n" . $herecurr); + while ($var =~ m{($Ident)}g) { + my $word = $1; + next if ($word !~ /[A-Z][a-z]|[a-z][A-Z]/); + seed_camelcase_includes() if ($check); + if (!defined $camelcase{$word}) { + $camelcase{$word} = 1; + CHK("CAMELCASE", + "Avoid CamelCase: <$word>\n" . $herecurr); + } } } } From d62a201f24cba74e2fbf9f6f7af86ff5f5e276fc Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 11 Sep 2013 14:23:56 -0700 Subject: [PATCH 187/303] checkpatch: enforce sane perl version I got a bug report from a couple of users who said checkpatch.pl was broken for them. It was erroring out on fairly random lines most commonly with messages like: Nested quantifiers in regex; marked by <--HERE in m/(\((?:[^\(\)]++ <-- HERE |(?-1))*\))/ at ./checkpatch.pl line 340. The bug reporter was running a version of perl 5.8 which was end-of-lifed in 2008: http://www.cpan.org/src/. Versions of perl this old are at _best_ quite untested. At worst, they are crusty and known to be completely broken. If folks have a system _that_ old, then we should have mercy on them and give them a half-decent error message rather than fail with nutty error messages. This patch enforces that checkpatch.pl is run with perl 5.10, which was end-of-lifed in 2009. The new --ignore-perl-version command-line switch will let folks override this if they want. Signed-off-by: Dave Hansen Cc: Joe Perches Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 6b409b0ad457..c00e5108c0d2 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -37,6 +37,8 @@ my @ignore = (); my $help = 0; my $configuration_file = ".checkpatch.conf"; my $max_line_length = 80; +my $ignore_perl_version = 0; +my $minimum_perl_version = 5.10.0; sub help { my ($exitcode) = @_; @@ -71,6 +73,8 @@ Options: ".EXPERIMENTAL-checkpatch-fixes" with potential errors corrected to the preferred checkpatch style + --ignore-perl-version override checking of perl version. expect + runtime errors. -h, --help, --version display this help and exit When FILE is - read standard input. @@ -123,6 +127,7 @@ GetOptions( 'mailback!' => \$mailback, 'summary-file!' => \$summary_file, 'fix!' => \$fix, + 'ignore-perl-version!' 
=> \$ignore_perl_version, 'debug=s' => \%debug, 'test-only=s' => \$tst_only, 'h|help' => \$help, @@ -133,6 +138,13 @@ help(0) if ($help); my $exit = 0; +if ($^V && $^V lt $minimum_perl_version) { + printf "$P: requires at least perl version %vd\n", $minimum_perl_version; + if (!$ignore_perl_version) { + exit(1); + } +} + if ($#ARGV < 0) { print "$P: no input files\n"; exit(1); From 7e51f1979237e01bcd4e04e434c5da79151f08f8 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:57 -0700 Subject: [PATCH 188/303] checkpatch: check for duplicate signatures Emit a warning when a signature is used more than once. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index c00e5108c0d2..7c79c91662c8 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1544,6 +1544,7 @@ sub process { my %suppress_export; my $suppress_statement = 0; + my %signatures = (); # Pre-scan the patch sanitizing the lines. # Pre-scan the patch looking for any __setup documentation. @@ -1793,6 +1794,17 @@ sub process { "email address '$email' might be better as '$suggested_email$comment'\n" . $herecurr); } } + +# Check for duplicate signatures + my $sig_nospace = $line; + $sig_nospace =~ s/\s//g; + $sig_nospace = lc($sig_nospace); + if (defined $signatures{$sig_nospace}) { + WARN("BAD_SIGN_OFF", + "Duplicate signature\n" . $herecurr); + } else { + $signatures{$sig_nospace} = 1; + } } # Check for wrappage within a valid hunk of the file From 70dc8a48357ce630d8a76887a9a36f0d34c8caf2 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:58 -0700 Subject: [PATCH 189/303] checkpatch: warn when using extern with function prototypes in .h files Using the extern keyword on function prototypes is superfluous visual noise so suggest removing it. Using extern can cause unnecessary line wrapping at 80 columns and unnecessarily long multi-line function prototypes. Signed-off-by: Joe Perches Suggested-by: Hannes Frederic Sowa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 7c79c91662c8..e2cb1f4621b7 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3878,6 +3878,16 @@ sub process { } } +# check for new externs in .h files. + if ($realfile =~ /\.h$/ && + $line =~ /^\+\s*(extern\s+)$Type\s*$Ident\s*\(/s) { + if (WARN("AVOID_EXTERNS", + "extern prototypes should be avoided in .h files\n" . $herecurr) && + $fix) { + $fixed[$linenr - 1] =~ s/(.*)\bextern\b\s*(.*)/$1$2/; + } + } + # check for new externs in .c files. if ($realfile =~ /\.c$/ && defined $stat && $stat =~ /^.\s*(?:extern\s+)?$Type\s+($Ident)(\s*)\(/s) From 61135e966367eda5056504ffd2f7518eaf77e25b Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:59 -0700 Subject: [PATCH 190/303] checkpatch: fix networking kernel-doc block comment defect checkpatch can generate a false positive when inserting a new kernel-doc block and function above an existing kernel-doc block. Fix it by checking that the context line is also a newly inserted line. 
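For reference, the comment style that check enforces in net/ and drivers/net/ looks like the sketch below (illustrative text only):

    /* Networking block comments start their text on the opening line
     * and begin each subsequent line with an asterisk, closing the
     * comment on a line of its own.
     */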
Signed-off-by: Joe Perches Reported-by: Darren Hart Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index e2cb1f4621b7..9185883f5885 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2086,6 +2086,7 @@ sub process { if ($realfile =~ m@^(drivers/net/|net/)@ && $prevrawline =~ /^\+[ \t]*\/\*/ && #starting /* $prevrawline !~ /\*\/[ \t]*$/ && #no trailing */ + $rawline =~ /^\+/ && #line is new $rawline !~ /^\+[ \t]*\*/) { #no leading * WARN("NETWORKING_BLOCK_COMMENT_STYLE", "networking block comments start with * on subsequent lines\n" . $hereprev); From 91bfe4843dff4426ca3a0dd1dab8454c1534022d Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:23:59 -0700 Subject: [PATCH 191/303] checkpatch: add --types option to report only specific message types Add a --types convenience option to show only specific message types. Combined with the --fix option, this can produce specific suggested formatting patches to files. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 56 +++++++++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 18 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 9185883f5885..3ba2db637384 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -31,8 +31,10 @@ my $show_types = 0; my $fix = 0; my $root; my %debug; -my %ignore_type = (); my %camelcase = (); +my %use_type = (); +my @use = (); +my %ignore_type = (); my @ignore = (); my $help = 0; my $configuration_file = ".checkpatch.conf"; @@ -56,6 +58,7 @@ Options: --terse one line per report -f, --file treat FILE as regular source file --subjective, --strict enable more subjective tests + --types TYPE(,TYPE2...) show only these comma separated message types --ignore TYPE(,TYPE2...) ignore various comma separated message types --max-line-length=n set the maximum line length, if exceeded, warn --show-types show the message "types" in the output @@ -120,6 +123,7 @@ GetOptions( 'subjective!' => \$check, 'strict!' => \$check, 'ignore=s' => \@ignore, + 'types=s' => \@use, 'show-types!' 
=> \$show_types, 'max-line-length=i' => \$max_line_length, 'root=s' => \$root, @@ -150,19 +154,38 @@ if ($#ARGV < 0) { exit(1); } -@ignore = split(/,/, join(',',@ignore)); -foreach my $word (@ignore) { - $word =~ s/\s*\n?$//g; - $word =~ s/^\s*//g; - $word =~ s/\s+/ /g; - $word =~ tr/[a-z]/[A-Z]/; +sub hash_save_array_words { + my ($hashRef, $arrayRef) = @_; - next if ($word =~ m/^\s*#/); - next if ($word =~ m/^\s*$/); + my @array = split(/,/, join(',', @$arrayRef)); + foreach my $word (@array) { + $word =~ s/\s*\n?$//g; + $word =~ s/^\s*//g; + $word =~ s/\s+/ /g; + $word =~ tr/[a-z]/[A-Z]/; - $ignore_type{$word}++; + next if ($word =~ m/^\s*#/); + next if ($word =~ m/^\s*$/); + + $hashRef->{$word}++; + } } +sub hash_show_words { + my ($hashRef, $prefix) = @_; + + if ($quiet == 0 && keys $hashRef) { + print "NOTE: $prefix message types:"; + foreach my $word (sort keys $hashRef) { + print " $word"; + } + print "\n\n"; + } +} + +hash_save_array_words(\%ignore_type, \@ignore); +hash_save_array_words(\%use_type, \@use); + my $dbg_values = 0; my $dbg_possible = 0; my $dbg_type = 0; @@ -1367,7 +1390,9 @@ sub possible { my $prefix = ''; sub show_type { - return !defined $ignore_type{$_[0]}; + return defined $use_type{$_[0]} if (scalar keys %use_type > 0); + + return !defined $ignore_type{$_[0]}; } sub report { @@ -4190,13 +4215,8 @@ sub process { } } - if ($quiet == 0 && keys %ignore_type) { - print "NOTE: Ignored message types:"; - foreach my $ignore (sort keys %ignore_type) { - print " $ignore"; - } - print "\n\n"; - } + hash_show_words(\%use_type, "Used"); + hash_show_words(\%ignore_type, "Ignored"); if ($clean == 0 && $fix && "@rawlines" ne "@fixed") { my $newfile = $filename . ".EXPERIMENTAL-checkpatch-fixes"; From f95a7e6a462ed1338bd576ccb557ff86772a0776 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:24:00 -0700 Subject: [PATCH 192/303] checkpatch: ignore #define TRACE_ macros The tracing subsystem uses slightly odd #defines to set path/directory locations for include files. These #defines can cause false positives for the complex macro tests so add exclusions for these specific #defines (TRACE_SYSTEM, TRACE_INCLUDE_FILE, TRACE_INCLUDE_PATH). Signed-off-by: Joe Perches Cc: Sarah Sharp Cc: Li Zefan Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 3ba2db637384..db7778a8f414 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3451,7 +3451,8 @@ sub process { $dstat !~ /^for\s*$Constant$/ && # for (...) $dstat !~ /^for\s*$Constant\s+(?:$Ident|-?$Constant)$/ && # for (...) bar() $dstat !~ /^do\s*{/ && # do {... - $dstat !~ /^\({/) # ({... + $dstat !~ /^\({/ && # ({... + $ctx !~ /^.\s*#\s*define\s+TRACE_(?:SYSTEM|INCLUDE_FILE|INCLUDE_PATH)\b/) { $ctx =~ s/\n*$//; my $herectx = $here . "\n"; From b34c648bb33ca143b98851fd7fe7250f1875c463 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:24:01 -0700 Subject: [PATCH 193/303] checkpatch: better --fix of SPACING errors. Previous attempt at fixing SPACING errors could make a hash of several defects. This patch should make --fix be a lot better at correcting these defects. Trim left and right sides of these defects appropriately instead of a somewhat random attempt at it. Trim left spaces from any following bit of the modified line when only a single space is required around an operator. 
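The rewrite below leans on three tiny string helpers (trim, ltrim, rtrim). As a language-neutral illustration, C equivalents might look like this; it is a sketch only, and unlike checkpatch's Perl versions, which return modified copies, these work in place:

    #include <ctype.h>
    #include <string.h>

    /* strip trailing whitespace in place */
    static char *rtrim(char *s)
    {
            size_t len = strlen(s);

            while (len && isspace((unsigned char)s[len - 1]))
                    s[--len] = '\0';
            return s;
    }

    /* skip leading whitespace; returns a pointer into s */
    static char *ltrim(char *s)
    {
            while (isspace((unsigned char)*s))
                    s++;
            return s;
    }

    /* strip both ends */
    static char *trim(char *s)
    {
            return ltrim(rtrim(s));
    }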
Signed-off-by: Joe Perches Cc: Phil Carmody Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 64 ++++++++++++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 22 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index db7778a8f414..e53df2b086b2 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1472,7 +1472,23 @@ sub check_absolute_file { sub trim { my ($string) = @_; - $string =~ s/(^\s+|\s+$)//g; + $string =~ s/^\s+|\s+$//g; + + return $string; +} + +sub ltrim { + my ($string) = @_; + + $string =~ s/^\s+//; + + return $string; +} + +sub rtrim { + my ($string) = @_; + + $string =~ s/\s+$//; return $string; } @@ -2821,6 +2837,7 @@ sub process { $off = 0; my $blank = copy_spacing($opline); + my $last_after = -1; for (my $n = 0; $n < $#elements; $n += 2) { @@ -2886,7 +2903,7 @@ sub process { $cc !~ /^\\/ && $cc !~ /^;/) { if (ERROR("SPACING", "space required after that '$op' $at\n" . $hereptr)) { - $good = trim($fix_elements[$n]) . " " . trim($fix_elements[$n + 1]) . " "; + $good = $fix_elements[$n] . trim($fix_elements[$n + 1]) . " "; $line_fixed = 1; } } @@ -2901,11 +2918,11 @@ sub process { if ($ctx =~ /Wx.|.xW/) { if (ERROR("SPACING", "spaces prohibited around that '$op' $at\n" . $hereptr)) { - $good = trim($fix_elements[$n]) . trim($fix_elements[$n + 1]); - $line_fixed = 1; + $good = rtrim($fix_elements[$n]) . trim($fix_elements[$n + 1]); if (defined $fix_elements[$n + 2]) { $fix_elements[$n + 2] =~ s/^\s+//; } + $line_fixed = 1; } } @@ -2914,8 +2931,9 @@ sub process { if ($ctx !~ /.x[WEC]/ && $cc !~ /^}/) { if (ERROR("SPACING", "space required after that '$op' $at\n" . $hereptr)) { - $good = trim($fix_elements[$n]) . trim($fix_elements[$n + 1]) . " "; + $good = $fix_elements[$n] . trim($fix_elements[$n + 1]) . " "; $line_fixed = 1; + $last_after = $n; } } @@ -2932,8 +2950,10 @@ sub process { if ($ctx !~ /[WEBC]x./ && $ca !~ /(?:\)|!|~|\*|-|\&|\||\+\+|\-\-|\{)$/) { if (ERROR("SPACING", "space required before that '$op' $at\n" . $hereptr)) { - $good = trim($fix_elements[$n]) . " " . trim($fix_elements[$n + 1]); - $line_fixed = 1; + if ($n != $last_after + 2) { + $good = $fix_elements[$n] . " " . ltrim($fix_elements[$n + 1]); + $line_fixed = 1; + } } } if ($op eq '*' && $cc =~/\s*$Modifier\b/) { @@ -2942,12 +2962,11 @@ sub process { } elsif ($ctx =~ /.xW/) { if (ERROR("SPACING", "space prohibited after that '$op' $at\n" . $hereptr)) { - $fixed_line =~ s/\s+$//; - $good = trim($fix_elements[$n]) . trim($fix_elements[$n + 1]); - $line_fixed = 1; + $good = $fix_elements[$n] . rtrim($fix_elements[$n + 1]); if (defined $fix_elements[$n + 2]) { $fix_elements[$n + 2] =~ s/^\s+//; } + $line_fixed = 1; } } @@ -2956,8 +2975,7 @@ sub process { if ($ctx !~ /[WEOBC]x[^W]/ && $ctx !~ /[^W]x[WOBEC]/) { if (ERROR("SPACING", "space required one side of that '$op' $at\n" . $hereptr)) { - $fixed_line =~ s/\s+$//; - $good = trim($fix_elements[$n]) . trim($fix_elements[$n + 1]) . " "; + $good = $fix_elements[$n] . trim($fix_elements[$n + 1]) . " "; $line_fixed = 1; } } @@ -2965,20 +2983,18 @@ sub process { ($ctx =~ /Wx./ && $cc =~ /^;/)) { if (ERROR("SPACING", "space prohibited before that '$op' $at\n" . $hereptr)) { - $fixed_line =~ s/\s+$//; - $good = trim($fix_elements[$n]) . trim($fix_elements[$n + 1]); + $good = rtrim($fix_elements[$n]) . trim($fix_elements[$n + 1]); $line_fixed = 1; } } if ($ctx =~ /ExW/) { if (ERROR("SPACING", "space prohibited after that '$op' $at\n" . 
$hereptr)) { - $fixed_line =~ s/\s+$//; - $good = trim($fix_elements[$n]) . trim($fix_elements[$n + 1]); - $line_fixed = 1; + $good = $fix_elements[$n] . trim($fix_elements[$n + 1]); if (defined $fix_elements[$n + 2]) { $fix_elements[$n + 2] =~ s/^\s+//; } + $line_fixed = 1; } } @@ -2992,8 +3008,10 @@ sub process { if ($ctx =~ /Wx[^WCE]|[^WCE]xW/) { if (ERROR("SPACING", "need consistent spacing around '$op' $at\n" . $hereptr)) { - $fixed_line =~ s/\s+$//; - $good = trim($fix_elements[$n]) . " " . trim($fix_elements[$n + 1]) . " "; + $good = rtrim($fix_elements[$n]) . " " . trim($fix_elements[$n + 1]) . " "; + if (defined $fix_elements[$n + 2]) { + $fix_elements[$n + 2] =~ s/^\s+//; + } $line_fixed = 1; } } @@ -3004,7 +3022,7 @@ sub process { if ($ctx =~ /Wx./) { if (ERROR("SPACING", "space prohibited before that '$op' $at\n" . $hereptr)) { - $good = trim($fix_elements[$n]) . trim($fix_elements[$n + 1]); + $good = rtrim($fix_elements[$n]) . trim($fix_elements[$n + 1]); $line_fixed = 1; } } @@ -3031,8 +3049,10 @@ sub process { if ($ok == 0) { if (ERROR("SPACING", "spaces required around that '$op' $at\n" . $hereptr)) { - $good = trim($fix_elements[$n]) . " " . trim($fix_elements[$n + 1]) . " "; - $good = $fix_elements[$n] . " " . trim($fix_elements[$n + 1]) . " "; + $good = rtrim($fix_elements[$n]) . " " . trim($fix_elements[$n + 1]) . " "; + if (defined $fix_elements[$n + 2]) { + $fix_elements[$n + 2] =~ s/^\s+//; + } $line_fixed = 1; } } From 1b5539b1ffbdcf7113eebea7f37141d4468d9070 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:24:03 -0700 Subject: [PATCH 194/303] checkpatch: reduce runtime/cpu time used There are some cases where checkpatch can take a long time to complete. Reduce the likelihood of this long run-time by adding a new test for lines with and without comments and eliminating checks on lines with only comments. This reduces the number of "ctx_statement_block" calls, and also the number of tests of $stat, which is now undefined for these blank lines. One test in particular, the "check for switch/default statements without a break", could take an extremely long time to parse as it tries to skip interleaving comments within the ctx_statement_block/$stat and that could be done multiple times unnecessarily. A small test case taken from cfg80211.h before this patch would take 1000's of seconds to run, now it's just a couple seconds. 
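The core trick in the patch below is cheap pre-filtering: blank out comment text once per line, then invoke the expensive statement parser only when real code remains. A simplified C sketch of that idea, under the assumption that a comment closes on the line it opens (the real script also tracks multi-line comment state):

    #include <stdbool.h>
    #include <string.h>

    /* Overwrite comment spans with spaces so only code survives. */
    static void blank_comments(char *s)
    {
            char *open;

            while ((open = strstr(s, "/*")) != NULL) {
                    char *close = strstr(open + 2, "*/");

                    if (!close) {
                            memset(open, ' ', strlen(open));
                            return;
                    }
                    memset(open, ' ', (size_t)(close - open) + 2);
                    s = close + 2;
            }
    }

    /* Cheap guard: is anything left besides whitespace? */
    static bool has_code(const char *line)
    {
            for (; *line; line++)
                    if (*line != ' ' && *line != '\t')
                            return true;
            return false;
    }

A caller would run blank_comments() on a scratch copy of each line and skip the costly parsing work whenever has_code() returns false, which is analogous to the $sline guard the patch adds.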
Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index e53df2b086b2..55277a8e1527 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1678,6 +1678,8 @@ sub process { $linenr = 0; foreach my $line (@lines) { $linenr++; + my $sline = $line; #copy of $line + $sline =~ s/$;/ /g; #with comments as spaces my $rawline = $rawlines[$linenr - 1]; @@ -2194,7 +2196,7 @@ sub process { $realline_next); #print "LINE<$line>\n"; if ($linenr >= $suppress_statement && - $realcnt && $line =~ /.\s*\S/) { + $realcnt && $sline =~ /.\s*\S/) { ($stat, $cond, $line_nr_next, $remain_next, $off_next) = ctx_statement_block($linenr, $realcnt, 0); $stat =~ s/\n./\n /g; From 58cb3cf66cc6330910316abb1dc7a7aa78917a27 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:24:04 -0700 Subject: [PATCH 195/303] checkpatch: fix perl version 5.12 and earlier incompatibility A previous patch ("checkpatch: add --types option to report only specific message types") uses a perl syntax introduced in perl version 5.14. Use the backward compatible perl syntax instead. Signed-off-by: Joe Perches Reported-by: Julia Lawall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 55277a8e1527..9ba4fc44112a 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -174,9 +174,9 @@ sub hash_save_array_words { sub hash_show_words { my ($hashRef, $prefix) = @_; - if ($quiet == 0 && keys $hashRef) { + if ($quiet == 0 && keys %$hashRef) { print "NOTE: $prefix message types:"; - foreach my $word (sort keys $hashRef) { + foreach my $word (sort keys %$hashRef) { print " $word"; } print "\n\n"; From 8716de383b82f16d920513138f1691e40ef5a9e3 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:24:05 -0700 Subject: [PATCH 196/303] checkpatch: add test for positional misuse of section specifiers like __initdata As discussed recently on the arm [1] and lm-sensors [2] lists, it is possible to use section markers on variables in a way which gcc doesn't understand (or at least not the way the developer intended): static struct __initdata samsung_pll_clock exynos4_plls[nr_plls] = { does NOT put exynos4_plls in the .initdata section. The __initdata marker can be virtually anywhere on the line, EXCEPT right after "struct". The preferred location is before the "=" sign if there is one, or before the trailing ";" otherwise. [1] http://permalink.gmane.org/gmane.linux.ports.arm.kernel/258149 [2] http://lists.lm-sensors.org/pipermail/lm-sensors/2013-August/039836.html So, update checkpatch to find these misuses and report an error when it's immediately after struct or union, and a warning when it's otherwise not immediately before the ; or =. 
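Concretely, with an illustrative struct and variable name, the difference the new check polices is:

    /* WRONG: right after "struct", gcc ties the attribute to the type
     * reference and ignores it, so my_plls does NOT land in the init
     * data section.
     */
    static struct __initdata pll_desc my_plls[4];

    /* PREFERRED: after the variable, just before the '=' or the ';'. */
    static struct pll_desc my_plls[4] __initdata;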
A similar patch was suggested by Andi Kleen https://lkml.org/lkml/2013/8/5/648 Signed-off-by: Joe Perches Suggested-by: Jean Delvare Tested-by: Guenter Roeck Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 47 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 9ba4fc44112a..47016c304c84 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -242,6 +242,8 @@ our $Sparse = qr{ __rcu }x; +our $InitAttribute = qr{__(?:mem|cpu|dev|net_|)(?:initdata|initconst|init\b)}; + # Notes to $Attribute: # We need \b after 'init' otherwise 'initconst' will cause a false positive in a check our $Attribute = qr{ @@ -262,7 +264,7 @@ our $Attribute = qr{ __deprecated| __read_mostly| __kprobes| - __(?:mem|cpu|dev|)(?:initdata|initconst|init\b)| + $InitAttribute| ____cacheline_aligned| ____cacheline_aligned_in_smp| ____cacheline_internodealigned_in_smp| @@ -292,6 +294,7 @@ our $Operators = qr{ }x; our $NonptrType; +our $NonptrTypeWithAttr; our $Type; our $Declare; @@ -354,6 +357,12 @@ our @typeList = ( qr{${Ident}_handler}, qr{${Ident}_handler_fn}, ); +our @typeListWithAttr = ( + @typeList, + qr{struct\s+$InitAttribute\s+$Ident}, + qr{union\s+$InitAttribute\s+$Ident}, +); + our @modifierList = ( qr{fastcall}, ); @@ -367,6 +376,7 @@ our $allowed_asm_includes = qr{(?x: sub build_types { my $mods = "(?x: \n" . join("|\n ", @modifierList) . "\n)"; my $all = "(?x: \n" . join("|\n ", @typeList) . "\n)"; + my $allWithAttr = "(?x: \n" . join("|\n ", @typeListWithAttr) . "\n)"; $Modifier = qr{(?:$Attribute|$Sparse|$mods)}; $NonptrType = qr{ (?:$Modifier\s+|const\s+)* @@ -377,6 +387,15 @@ sub build_types { ) (?:\s+$Modifier|\s+const)* }x; + $NonptrTypeWithAttr = qr{ + (?:$Modifier\s+|const\s+)* + (?: + (?:typeof|__typeof__)\s*\([^\)]*\)| + (?:$typeTypedefs\b)| + (?:${allWithAttr}\b) + ) + (?:\s+$Modifier|\s+const)* + }x; $Type = qr{ $NonptrType (?:(?:\s|\*|\[\])+\s*const|(?:\s|\*|\[\])+|(?:\s*\[\s*\])+)? @@ -3706,6 +3725,32 @@ sub process { } } +sub string_find_replace { + my ($string, $find, $replace) = @_; + + $string =~ s/$find/$replace/g; + + return $string; +} + +# check for bad placement of section $InitAttribute (e.g.: __initdata) + if ($line =~ /(\b$InitAttribute\b)/) { + my $attr = $1; + if ($line =~ /^\+\s*static\s+(?:const\s+)?(?:$attr\s+)?($NonptrTypeWithAttr)\s+(?:$attr\s+)?($Ident(?:\[[^]]*\])?)\s*[=;]/) { + my $ptr = $1; + my $var = $2; + if ((($ptr =~ /\b(union|struct)\s+$attr\b/ && + ERROR("MISPLACED_INIT", + "$attr should be placed after $var\n" . $herecurr)) || + ($ptr !~ /\b(union|struct)\s+$attr\b/ && + WARN("MISPLACED_INIT", + "$attr should be placed after $var\n" . $herecurr))) && + $fix) { + $fixed[$linenr - 1] =~ s/(\bstatic\s+(?:const\s+)?)(?:$attr\s+)?($NonptrTypeWithAttr)\s+(?:$attr\s+)?($Ident(?:\[[^]]*\])?)\s*([=;])\s*/"$1" . trim(string_find_replace($2, "\\s*$attr\\s*", " ")) . " " . trim(string_find_replace($3, "\\s*$attr\\s*", "")) . " $attr" . ("$4" eq ";" ? ";" : " = ")/e; + } + } + } + # prefer usleep_range over udelay if ($line =~ /\budelay\s*\(\s*(\d+)\s*\)/) { # ignore udelay's < 10, however From 91cf5ab60ff82ecf4550a596867787c1e360dd3f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 11 Sep 2013 14:24:06 -0700 Subject: [PATCH 197/303] epoll: add a reschedule point in ep_free() ep_free() might iterate on a huge set of epitems and hold cpu too long. Add two cond_resched() in order to yield cpu to other tasks. 
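In general form the fix looks like the sketch below; my_table, my_item and the helpers are hypothetical. The essential constraint is that cond_resched() may sleep, so it is legal under a mutex but never under a spinlock or with interrupts disabled.

    static void teardown_all(struct my_table *t)
    {
            struct my_item *it;

            mutex_lock(&t->lock);
            while ((it = first_item(t)) != NULL) {
                    remove_item(t, it);
                    cond_resched();    /* yield to other runnable tasks */
            }
            mutex_unlock(&t->lock);
    }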
This is safe as we only hold mutexes in this function. Signed-off-by: Eric Dumazet Cc: Al Viro Cc: Theodore Ts'o Acked-by: Eric Wong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 293f86741ddb..473e09da7d02 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -740,6 +740,7 @@ static void ep_free(struct eventpoll *ep) epi = rb_entry(rbp, struct epitem, rbn); ep_unregister_pollwait(ep, epi); + cond_resched(); } /* @@ -754,6 +755,7 @@ static void ep_free(struct eventpoll *ep) while ((rbp = rb_first(&ep->rbr)) != NULL) { epi = rb_entry(rbp, struct epitem, rbn); ep_remove(ep, epi); + cond_resched(); } mutex_unlock(&ep->mtx); From 3d267f24d4c7bcc829ce9daa92e41c3f390c95dd Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Wed, 11 Sep 2013 14:24:07 -0700 Subject: [PATCH 198/303] firmware/dmi_scan: drop obsolete comment This comment predates the introduction of early_ioremap. Since then the missing calls to dmi_iounmap have been added by Ingo and Yinghai in commits 0d64484f7ea1 ("x86: fix DMI ioremap leak") and 3212bff370c2 ("x86: left over fix for leak of early_ioremp in dmi_scan") . That was over 5 years ago so it is about time to drop this now misleading comment. Signed-off-by: Jean Delvare Cc: Ingo Molnar Cc: Yinghai Lu Cc: Joe Perches Cc: Ben Hutchings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/firmware/dmi_scan.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c index 232fa8fce26a..9e50cb997a42 100644 --- a/drivers/firmware/dmi_scan.c +++ b/drivers/firmware/dmi_scan.c @@ -504,11 +504,6 @@ void __init dmi_scan_machine(void) } } else { - /* - * no iounmap() for that ioremap(); it would be a no-op, but - * it's so early in setup that sucker gets confused into doing - * what it shouldn't if we actually call it. - */ p = dmi_ioremap(0xF0000, 0x10000); if (p == NULL) goto error; From 02d9c47f1bf2304d6482e1e69e00c06791d86908 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Wed, 11 Sep 2013 14:24:08 -0700 Subject: [PATCH 199/303] firmware/dmi_scan: fix most checkpatch errors and warnings Fix all errors and trivial warnings reported by checkpatch for file drivers/firmware/dmi_scan.c. 
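Much of the churn in the diff that follows is the printk-to-pr_* conversion. The two spellings log at the same severity; pr_err() is shorthand that additionally honors a file-local pr_fmt() prefix:

    /* before */
    printk(KERN_ERR "dmi_string: cannot allocate %Zu bytes.\n", len);

    /* after */
    pr_err("dmi_string: cannot allocate %Zu bytes.\n", len);

    /* pr_err() is defined, approximately, as */
    #define pr_err(fmt, ...) printk(KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)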
Signed-off-by: Jean Delvare Cc: Joe Perches Cc: Ben Hutchings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/firmware/dmi_scan.c | 47 +++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c index 9e50cb997a42..5a5ca664f3e7 100644 --- a/drivers/firmware/dmi_scan.c +++ b/drivers/firmware/dmi_scan.c @@ -63,7 +63,7 @@ static char * __init dmi_string(const struct dmi_header *dm, u8 s) if (str != NULL) strcpy(str, bp); else - printk(KERN_ERR "dmi_string: cannot allocate %Zu bytes.\n", len); + pr_err("dmi_string: cannot allocate %Zu bytes.\n", len); return str; } @@ -140,9 +140,10 @@ int dmi_available; /* * Save a DMI string */ -static void __init dmi_save_ident(const struct dmi_header *dm, int slot, int string) +static void __init dmi_save_ident(const struct dmi_header *dm, int slot, + int string) { - const char *d = (const char*) dm; + const char *d = (const char *) dm; char *p; if (dmi_ident[slot]) @@ -155,9 +156,10 @@ static void __init dmi_save_ident(const struct dmi_header *dm, int slot, int str dmi_ident[slot] = p; } -static void __init dmi_save_uuid(const struct dmi_header *dm, int slot, int index) +static void __init dmi_save_uuid(const struct dmi_header *dm, int slot, + int index) { - const u8 *d = (u8*) dm + index; + const u8 *d = (u8 *) dm + index; char *s; int is_ff = 1, is_00 = 1, i; @@ -188,12 +190,13 @@ static void __init dmi_save_uuid(const struct dmi_header *dm, int slot, int inde else sprintf(s, "%pUB", d); - dmi_ident[slot] = s; + dmi_ident[slot] = s; } -static void __init dmi_save_type(const struct dmi_header *dm, int slot, int index) +static void __init dmi_save_type(const struct dmi_header *dm, int slot, + int index) { - const u8 *d = (u8*) dm + index; + const u8 *d = (u8 *) dm + index; char *s; if (dmi_ident[slot]) @@ -217,7 +220,7 @@ static void __init dmi_save_one_device(int type, const char *name) dev = dmi_alloc(sizeof(*dev) + strlen(name) + 1); if (!dev) { - printk(KERN_ERR "dmi_save_one_device: out of memory.\n"); + pr_err("dmi_save_one_device: out of memory.\n"); return; } @@ -256,8 +259,7 @@ static void __init dmi_save_oem_strings_devices(const struct dmi_header *dm) dev = dmi_alloc(sizeof(*dev)); if (!dev) { - printk(KERN_ERR - "dmi_save_oem_strings_devices: out of memory.\n"); + pr_err("dmi_save_oem_strings_devices: out of memory.\n"); break; } @@ -272,11 +274,11 @@ static void __init dmi_save_oem_strings_devices(const struct dmi_header *dm) static void __init dmi_save_ipmi_device(const struct dmi_header *dm) { struct dmi_device *dev; - void * data; + void *data; data = dmi_alloc(dm->length); if (data == NULL) { - printk(KERN_ERR "dmi_save_ipmi_device: out of memory.\n"); + pr_err("dmi_save_ipmi_device: out of memory.\n"); return; } @@ -284,7 +286,7 @@ static void __init dmi_save_ipmi_device(const struct dmi_header *dm) dev = dmi_alloc(sizeof(*dev)); if (!dev) { - printk(KERN_ERR "dmi_save_ipmi_device: out of memory.\n"); + pr_err("dmi_save_ipmi_device: out of memory.\n"); return; } @@ -302,7 +304,7 @@ static void __init dmi_save_dev_onboard(int instance, int segment, int bus, onboard_dev = dmi_alloc(sizeof(*onboard_dev) + strlen(name) + 1); if (!onboard_dev) { - printk(KERN_ERR "dmi_save_dev_onboard: out of memory.\n"); + pr_err("dmi_save_dev_onboard: out of memory.\n"); return; } onboard_dev->instance = instance; @@ -320,7 +322,7 @@ static void __init dmi_save_dev_onboard(int instance, int segment, int bus, static void __init 
dmi_save_extended_devices(const struct dmi_header *dm) { - const u8 *d = (u8*) dm + 5; + const u8 *d = (u8 *) dm + 5; /* Skip disabled device */ if ((*d & 0x80) == 0) @@ -338,7 +340,7 @@ static void __init dmi_save_extended_devices(const struct dmi_header *dm) */ static void __init dmi_decode(const struct dmi_header *dm, void *dummy) { - switch(dm->type) { + switch (dm->type) { case 0: /* BIOS Information */ dmi_save_ident(dm, DMI_BIOS_VENDOR, 4); dmi_save_ident(dm, DMI_BIOS_VERSION, 5); @@ -502,8 +504,7 @@ void __init dmi_scan_machine(void) dmi_available = 1; goto out; } - } - else { + } else { p = dmi_ioremap(0xF0000, 0x10000); if (p == NULL) goto error; @@ -528,7 +529,7 @@ void __init dmi_scan_machine(void) dmi_iounmap(p, 0x10000); } error: - printk(KERN_INFO "DMI not present or invalid.\n"); + pr_info("DMI not present or invalid.\n"); out: dmi_initialized = 1; } @@ -664,7 +665,7 @@ int dmi_name_in_serial(const char *str) /** * dmi_name_in_vendors - Check if string is in the DMI system or board vendor name - * @str: Case sensitive Name + * @str: Case sensitive Name */ int dmi_name_in_vendors(const char *str) { @@ -691,13 +692,13 @@ EXPORT_SYMBOL(dmi_name_in_vendors); * A new search is initiated by passing %NULL as the @from argument. * If @from is not %NULL, searches continue from next device. */ -const struct dmi_device * dmi_find_device(int type, const char *name, +const struct dmi_device *dmi_find_device(int type, const char *name, const struct dmi_device *from) { const struct list_head *head = from ? &from->list : &dmi_devices; struct list_head *d; - for(d = head->next; d != &dmi_devices; d = d->next) { + for (d = head->next; d != &dmi_devices; d = d->next) { const struct dmi_device *dev = list_entry(d, struct dmi_device, list); From ffbbb96dd7570b9aafd426cd77a7ee03d224cabf Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Wed, 11 Sep 2013 14:24:09 -0700 Subject: [PATCH 200/303] firmware/dmi_scan: constify strings Add const to all DMI string pointers where this is possible. This fixes a checkpatch warning. Signed-off-by: Jean Delvare Cc: Joe Perches Cc: Ben Hutchings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/firmware/dmi_scan.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c index 5a5ca664f3e7..9a094bb44e3d 100644 --- a/drivers/firmware/dmi_scan.c +++ b/drivers/firmware/dmi_scan.c @@ -14,7 +14,7 @@ * of and an antecedent to, SMBIOS, which stands for System * Management BIOS. 
See further: http://www.dmtf.org/standards */ -static char dmi_empty_string[] = " "; +static const char dmi_empty_string[] = " "; static u16 __initdata dmi_ver; /* @@ -49,7 +49,7 @@ static const char * __init dmi_string_nosave(const struct dmi_header *dm, u8 s) return ""; } -static char * __init dmi_string(const struct dmi_header *dm, u8 s) +static const char * __init dmi_string(const struct dmi_header *dm, u8 s) { const char *bp = dmi_string_nosave(dm, s); char *str; @@ -133,7 +133,7 @@ static int __init dmi_checksum(const u8 *buf, u8 len) return sum == 0; } -static char *dmi_ident[DMI_STRING_MAX]; +static const char *dmi_ident[DMI_STRING_MAX]; static LIST_HEAD(dmi_devices); int dmi_available; @@ -144,7 +144,7 @@ static void __init dmi_save_ident(const struct dmi_header *dm, int slot, int string) { const char *d = (const char *) dm; - char *p; + const char *p; if (dmi_ident[slot]) return; @@ -252,7 +252,7 @@ static void __init dmi_save_oem_strings_devices(const struct dmi_header *dm) struct dmi_device *dev; for (i = 1; i <= count; i++) { - char *devname = dmi_string(dm, i); + const char *devname = dmi_string(dm, i); if (devname == dmi_empty_string) continue; From ae79744975cb0b3b9c469fe1a05db37d2943c863 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Wed, 11 Sep 2013 14:24:10 -0700 Subject: [PATCH 201/303] firmware/dmi_scan: drop OOM messages As reported by Joe Perches: OOM messages generally aren't useful. dmi_alloc is either a trivial front-end to kzalloc, and kzalloc already does a dump_stack() when OOM, or for x86, dmi_alloc uses extend_brk which BUGs when unsuccessful. So we can remove all 6 such log messages in the dmi_scan driver, to shrink the binary size (by 528 bytes on x86_64.) Signed-off-by: Jean Delvare Reported-by: Joe Perches Cc: Ben Hutchings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/firmware/dmi_scan.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c index 9a094bb44e3d..fa0affb699b4 100644 --- a/drivers/firmware/dmi_scan.c +++ b/drivers/firmware/dmi_scan.c @@ -62,8 +62,6 @@ static const char * __init dmi_string(const struct dmi_header *dm, u8 s) str = dmi_alloc(len); if (str != NULL) strcpy(str, bp); - else - pr_err("dmi_string: cannot allocate %Zu bytes.\n", len); return str; } @@ -219,10 +217,8 @@ static void __init dmi_save_one_device(int type, const char *name) return; dev = dmi_alloc(sizeof(*dev) + strlen(name) + 1); - if (!dev) { - pr_err("dmi_save_one_device: out of memory.\n"); + if (!dev) return; - } dev->type = type; strcpy((char *)(dev + 1), name); @@ -258,10 +254,8 @@ static void __init dmi_save_oem_strings_devices(const struct dmi_header *dm) continue; dev = dmi_alloc(sizeof(*dev)); - if (!dev) { - pr_err("dmi_save_oem_strings_devices: out of memory.\n"); + if (!dev) break; - } dev->type = DMI_DEV_TYPE_OEM_STRING; dev->name = devname; @@ -277,18 +271,14 @@ static void __init dmi_save_ipmi_device(const struct dmi_header *dm) void *data; data = dmi_alloc(dm->length); - if (data == NULL) { - pr_err("dmi_save_ipmi_device: out of memory.\n"); + if (data == NULL) return; - } memcpy(data, dm, dm->length); dev = dmi_alloc(sizeof(*dev)); - if (!dev) { - pr_err("dmi_save_ipmi_device: out of memory.\n"); + if (!dev) return; - } dev->type = DMI_DEV_TYPE_IPMI; dev->name = "IPMI controller"; @@ -303,10 +293,9 @@ static void __init dmi_save_dev_onboard(int instance, int segment, int bus, struct dmi_dev_onboard *onboard_dev; onboard_dev = 
dmi_alloc(sizeof(*onboard_dev) + strlen(name) + 1); - if (!onboard_dev) { - pr_err("dmi_save_dev_onboard: out of memory.\n"); + if (!onboard_dev) return; - } + onboard_dev->instance = instance; onboard_dev->segment = segment; onboard_dev->bus = bus; From c802d64a356b5cf349121ac4c5e005f037ce548d Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 11 Sep 2013 14:24:11 -0700 Subject: [PATCH 202/303] kprobes: unify insn caches The current kprobes insn caches allocate memory areas for insn slots with module_alloc(). The assumption is that the kernel image and module area are both within the same +/- 2GB memory area. This however is not true for s390 where the kernel image resides within the first 2GB (DMA memory area), but the module area is far away in the vmalloc area, usually somewhere close below the 4TB area. For new pc-relative instructions s390 needs insn slots that are within +/- 2GB of each area. That way we can patch displacements of pc-relative instructions within the insn slots just like x86 and powerpc. The module area already works with the normal insn slot allocator, however there is currently no way to get insn slots that are within the first 2GB on s390 (aka the DMA area). Therefore this patch set modifies the kprobes insn slot cache code in order to allow specifying a custom allocator for the insn slot cache pages. In addition, architectures can now have private insn slot caches without the need to modify common code. Patch 1 unifies and simplifies the current insn and optinsn caches implementation. This is a preparation which allows adding more insn caches in a simple way. Patch 2 adds the possibility to specify a custom allocator. Patch 3 makes s390 use the new insn slot mechanisms and adds support for pc-relative instructions with long displacements. This patch (of 3): The two insn caches (insn, and optinsn) each have their own mutex and alloc/free functions (get_[opt]insn_slot() / free_[opt]insn_slot()). Since there is the need for yet another insn cache which satisfies DMA allocations on s390, unify and simplify the current implementation: - Move the per insn cache mutex into struct kprobe_insn_cache. - Move the alloc/free functions to kprobe.h so they are simply wrappers for the generic __get_insn_slot/__free_insn_slot functions. The implementation is done with a DEFINE_INSN_CACHE_OPS() macro which provides the alloc/free functions for each cache if needed. - Move the struct kprobe_insn_cache to kprobe.h, which allows architecture specific insn slot caches to be defined outside of the core kprobes code.
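With this in place, adding a new insn cache becomes declarative; a minimal sketch of what an architecture would write (the name "myinsn" is illustrative; see patch 3 for the real s390 user):

        /* generates get_myinsn_slot() and free_myinsn_slot() wrappers */
        DEFINE_INSN_CACHE_OPS(myinsn);

        struct kprobe_insn_cache kprobe_myinsn_slots = {
                .mutex = __MUTEX_INITIALIZER(kprobe_myinsn_slots.mutex),
                .pages = LIST_HEAD_INIT(kprobe_myinsn_slots.pages),
                .insn_size = MAX_INSN_SIZE,
                .nr_garbage = 0,
        };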
Signed-off-by: Heiko Carstens Cc: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Ingo Molnar Cc: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kprobes.h | 32 +++++++++++++++--- kernel/kprobes.c | 75 ++++++++++++----------------------------- 2 files changed, 49 insertions(+), 58 deletions(-) diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index ca1d27a0d6a6..077f65321b5e 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -264,10 +264,34 @@ extern void arch_arm_kprobe(struct kprobe *p); extern void arch_disarm_kprobe(struct kprobe *p); extern int arch_init_kprobes(void); extern void show_registers(struct pt_regs *regs); -extern kprobe_opcode_t *get_insn_slot(void); -extern void free_insn_slot(kprobe_opcode_t *slot, int dirty); extern void kprobes_inc_nmissed_count(struct kprobe *p); +struct kprobe_insn_cache { + struct mutex mutex; + struct list_head pages; /* list of kprobe_insn_page */ + size_t insn_size; /* size of instruction slot */ + int nr_garbage; +}; + +extern kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c); +extern void __free_insn_slot(struct kprobe_insn_cache *c, + kprobe_opcode_t *slot, int dirty); + +#define DEFINE_INSN_CACHE_OPS(__name) \ +extern struct kprobe_insn_cache kprobe_##__name##_slots; \ + \ +static inline kprobe_opcode_t *get_##__name##_slot(void) \ +{ \ + return __get_insn_slot(&kprobe_##__name##_slots); \ +} \ + \ +static inline void free_##__name##_slot(kprobe_opcode_t *slot, int dirty)\ +{ \ + __free_insn_slot(&kprobe_##__name##_slots, slot, dirty); \ +} \ + +DEFINE_INSN_CACHE_OPS(insn); + #ifdef CONFIG_OPTPROBES /* * Internal structure for direct jump optimized probe @@ -287,13 +311,13 @@ extern void arch_optimize_kprobes(struct list_head *oplist); extern void arch_unoptimize_kprobes(struct list_head *oplist, struct list_head *done_list); extern void arch_unoptimize_kprobe(struct optimized_kprobe *op); -extern kprobe_opcode_t *get_optinsn_slot(void); -extern void free_optinsn_slot(kprobe_opcode_t *slot, int dirty); extern int arch_within_optimized_kprobe(struct optimized_kprobe *op, unsigned long addr); extern void opt_pre_handler(struct kprobe *p, struct pt_regs *regs); +DEFINE_INSN_CACHE_OPS(optinsn); + #ifdef CONFIG_SYSCTL extern int sysctl_kprobes_optimization; extern int proc_kprobes_optimization_handler(struct ctl_table *table, diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 6e33498d665c..9e4912dc5559 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -121,12 +121,6 @@ struct kprobe_insn_page { (offsetof(struct kprobe_insn_page, slot_used) + \ (sizeof(char) * (slots))) -struct kprobe_insn_cache { - struct list_head pages; /* list of kprobe_insn_page */ - size_t insn_size; /* size of instruction slot */ - int nr_garbage; -}; - static int slots_per_page(struct kprobe_insn_cache *c) { return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t)); @@ -138,8 +132,8 @@ enum kprobe_slot_state { SLOT_USED = 2, }; -static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */ -static struct kprobe_insn_cache kprobe_insn_slots = { +struct kprobe_insn_cache kprobe_insn_slots = { + .mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex), .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages), .insn_size = MAX_INSN_SIZE, .nr_garbage = 0, @@ -150,10 +144,12 @@ static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c); * __get_insn_slot() - Find a slot on an executable page for an instruction. 
* We allocate an executable page if there's no room on existing ones. */ -static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) +kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) { struct kprobe_insn_page *kip; + kprobe_opcode_t *slot = NULL; + mutex_lock(&c->mutex); retry: list_for_each_entry(kip, &c->pages, list) { if (kip->nused < slots_per_page(c)) { @@ -162,7 +158,8 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) if (kip->slot_used[i] == SLOT_CLEAN) { kip->slot_used[i] = SLOT_USED; kip->nused++; - return kip->insns + (i * c->insn_size); + slot = kip->insns + (i * c->insn_size); + goto out; } } /* kip->nused is broken. Fix it. */ @@ -178,7 +175,7 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) /* All out of space. Need to allocate a new page. */ kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL); if (!kip) - return NULL; + goto out; /* * Use module_alloc so this page is within +/- 2GB of where the @@ -188,7 +185,7 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) kip->insns = module_alloc(PAGE_SIZE); if (!kip->insns) { kfree(kip); - return NULL; + goto out; } INIT_LIST_HEAD(&kip->list); memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c)); @@ -196,19 +193,10 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) kip->nused = 1; kip->ngarbage = 0; list_add(&kip->list, &c->pages); - return kip->insns; -} - - -kprobe_opcode_t __kprobes *get_insn_slot(void) -{ - kprobe_opcode_t *ret = NULL; - - mutex_lock(&kprobe_insn_mutex); - ret = __get_insn_slot(&kprobe_insn_slots); - mutex_unlock(&kprobe_insn_mutex); - - return ret; + slot = kip->insns; +out: + mutex_unlock(&c->mutex); + return slot; } /* Return 1 if all garbages are collected, otherwise 0. */ @@ -255,11 +243,12 @@ static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c) return 0; } -static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, - kprobe_opcode_t *slot, int dirty) +void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, + kprobe_opcode_t *slot, int dirty) { struct kprobe_insn_page *kip; + mutex_lock(&c->mutex); list_for_each_entry(kip, &c->pages, list) { long idx = ((long)slot - (long)kip->insns) / (c->insn_size * sizeof(kprobe_opcode_t)); @@ -272,45 +261,23 @@ static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, collect_garbage_slots(c); } else collect_one_slot(kip, idx); - return; + goto out; } } /* Could not free this slot. 
*/ WARN_ON(1); +out: + mutex_unlock(&c->mutex); } -void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) -{ - mutex_lock(&kprobe_insn_mutex); - __free_insn_slot(&kprobe_insn_slots, slot, dirty); - mutex_unlock(&kprobe_insn_mutex); -} #ifdef CONFIG_OPTPROBES /* For optimized_kprobe buffer */ -static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */ -static struct kprobe_insn_cache kprobe_optinsn_slots = { +struct kprobe_insn_cache kprobe_optinsn_slots = { + .mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex), .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages), /* .insn_size is initialized later */ .nr_garbage = 0, }; -/* Get a slot for optimized_kprobe buffer */ -kprobe_opcode_t __kprobes *get_optinsn_slot(void) -{ - kprobe_opcode_t *ret = NULL; - - mutex_lock(&kprobe_optinsn_mutex); - ret = __get_insn_slot(&kprobe_optinsn_slots); - mutex_unlock(&kprobe_optinsn_mutex); - - return ret; -} - -void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty) -{ - mutex_lock(&kprobe_optinsn_mutex); - __free_insn_slot(&kprobe_optinsn_slots, slot, dirty); - mutex_unlock(&kprobe_optinsn_mutex); -} #endif #endif From af96397de8600232effbff43dc8b4ca20ddc02b1 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 11 Sep 2013 14:24:13 -0700 Subject: [PATCH 203/303] kprobes: allow to specify custom allocator for insn caches The current two insn slot caches both use module_alloc/module_free to allocate and free insn slot cache pages. For s390 this is not sufficient since it needs to allocate insn slots that are either within the vmalloc module area or within DMA memory. Therefore add a mechanism which allows specifying a custom allocator per insn slot cache. Signed-off-by: Heiko Carstens Acked-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Ingo Molnar Cc: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kprobes.h | 2 ++ kernel/kprobes.c | 20 ++++++++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 077f65321b5e..925eaf28fca9 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -268,6 +268,8 @@ extern void kprobes_inc_nmissed_count(struct kprobe *p); struct kprobe_insn_cache { struct mutex mutex; + void *(*alloc)(void); /* allocate insn page */ + void (*free)(void *); /* free insn page */ struct list_head pages; /* list of kprobe_insn_page */ size_t insn_size; /* size of instruction slot */ int nr_garbage; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9e4912dc5559..a0d367a49122 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -112,6 +112,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = { struct kprobe_insn_page { struct list_head list; kprobe_opcode_t *insns; /* Page of instruction slots */ + struct kprobe_insn_cache *cache; int nused; int ngarbage; char slot_used[]; @@ -132,8 +133,20 @@ enum kprobe_slot_state { SLOT_USED = 2, }; +static void *alloc_insn_page(void) +{ + return module_alloc(PAGE_SIZE); +} + +static void free_insn_page(void *page) +{ + module_free(NULL, page); +} + struct kprobe_insn_cache kprobe_insn_slots = { .mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex), + .alloc = alloc_insn_page, + .free = free_insn_page, .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages), .insn_size = MAX_INSN_SIZE, .nr_garbage = 0, @@ -182,7 +195,7 @@ kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) * kernel image and loaded module images reside.
This is required * so x86_64 can correctly handle the %rip-relative fixups. */ - kip->insns = module_alloc(PAGE_SIZE); + kip->insns = c->alloc(); if (!kip->insns) { kfree(kip); goto out; @@ -192,6 +205,7 @@ kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) kip->slot_used[0] = SLOT_USED; kip->nused = 1; kip->ngarbage = 0; + kip->cache = c; list_add(&kip->list, &c->pages); slot = kip->insns; out: @@ -213,7 +227,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) */ if (!list_is_singular(&kip->list)) { list_del(&kip->list); - module_free(NULL, kip->insns); + kip->cache->free(kip->insns); kfree(kip); } return 1; @@ -274,6 +288,8 @@ out: /* For optimized_kprobe buffer */ struct kprobe_insn_cache kprobe_optinsn_slots = { .mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex), + .alloc = alloc_insn_page, + .free = free_insn_page, .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages), /* .insn_size is initialized later */ .nr_garbage = 0, From 63c40436a1afc837f3ace6b5a39c547bc91c20bc Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 11 Sep 2013 14:24:14 -0700 Subject: [PATCH 204/303] s390/kprobes: add support for pc-relative long displacement instructions With the general-instruction extension facility (z10) a couple of instructions with a pc-relative long displacement were introduced. The kprobes support for these instructions however was never implemented. As a result, if anybody ever put a probe on any of these instructions the behaviour would have been random after the instruction got executed within the insn slot. So let's add the missing handling for these instructions. Since all of the new instructions have a 32-bit signed displacement, the easiest solution is to allocate an insn slot that is within the same 2GB area as the original instruction and patch the displacement field.
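The patching itself is plain pointer arithmetic in halfword units; a worked sketch (the addresses are made up for illustration; the real code is copy_instruction() in the diff below):

        /* RI2 displacement is a signed 32-bit halfword count */
        s64 disp = *(s32 *)&p->ainsn.insn[1];
        u64 addr = (u64)(unsigned long)p->addr;           /* original insn */
        u64 new_addr = (u64)(unsigned long)p->ainsn.insn; /* slot copy */
        s64 new_disp = ((addr + (disp * 2)) - new_addr) / 2;

        /* e.g. a larl at 0x1000 with disp 0x100 targets 0x1200; if the
         * slot copy lives at 0x2000 the patched disp becomes
         * (0x1200 - 0x2000) / 2 = -0x700, which still reaches 0x1200. */
        *(s32 *)&p->ainsn.insn[1] = new_disp;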
Signed-off-by: Heiko Carstens Reviewed-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Ingo Molnar Cc: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/include/asm/kprobes.h | 4 +- arch/s390/kernel/kprobes.c | 144 ++++++++++++++++++++++++++++++-- 2 files changed, 140 insertions(+), 8 deletions(-) diff --git a/arch/s390/include/asm/kprobes.h b/arch/s390/include/asm/kprobes.h index dcf6948a875c..4176dfe0fba1 100644 --- a/arch/s390/include/asm/kprobes.h +++ b/arch/s390/include/asm/kprobes.h @@ -31,6 +31,8 @@ #include #include +#define __ARCH_WANT_KPROBES_INSN_SLOT + struct pt_regs; struct kprobe; @@ -57,7 +59,7 @@ typedef u16 kprobe_opcode_t; /* Architecture specific copy of original instruction */ struct arch_specific_insn { /* copy of original instruction */ - kprobe_opcode_t insn[MAX_INSN_SIZE]; + kprobe_opcode_t *insn; }; struct prev_kprobe { diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index adbbe7f1cb0d..0ce9fb245034 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -37,6 +37,26 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); struct kretprobe_blackpoint kretprobe_blacklist[] = { }; +DEFINE_INSN_CACHE_OPS(dmainsn); + +static void *alloc_dmainsn_page(void) +{ + return (void *)__get_free_page(GFP_KERNEL | GFP_DMA); +} + +static void free_dmainsn_page(void *page) +{ + free_page((unsigned long)page); +} + +struct kprobe_insn_cache kprobe_dmainsn_slots = { + .mutex = __MUTEX_INITIALIZER(kprobe_dmainsn_slots.mutex), + .alloc = alloc_dmainsn_page, + .free = free_dmainsn_page, + .pages = LIST_HEAD_INIT(kprobe_dmainsn_slots.pages), + .insn_size = MAX_INSN_SIZE, +}; + static int __kprobes is_prohibited_opcode(kprobe_opcode_t *insn) { switch (insn[0] >> 8) { @@ -100,9 +120,8 @@ static int __kprobes get_fixup_type(kprobe_opcode_t *insn) fixup |= FIXUP_RETURN_REGISTER; break; case 0xc0: - if ((insn[0] & 0x0f) == 0x00 || /* larl */ - (insn[0] & 0x0f) == 0x05) /* brasl */ - fixup |= FIXUP_RETURN_REGISTER; + if ((insn[0] & 0x0f) == 0x05) /* brasl */ + fixup |= FIXUP_RETURN_REGISTER; break; case 0xeb: switch (insn[2] & 0xff) { @@ -134,18 +153,128 @@ static int __kprobes get_fixup_type(kprobe_opcode_t *insn) return fixup; } +static int __kprobes is_insn_relative_long(kprobe_opcode_t *insn) +{ + /* Check if we have a RIL-b or RIL-c format instruction which + * we need to modify in order to avoid instruction emulation. 
*/ + switch (insn[0] >> 8) { + case 0xc0: + if ((insn[0] & 0x0f) == 0x00) /* larl */ + return true; + break; + case 0xc4: + switch (insn[0] & 0x0f) { + case 0x02: /* llhrl */ + case 0x04: /* lghrl */ + case 0x05: /* lhrl */ + case 0x06: /* llghrl */ + case 0x07: /* sthrl */ + case 0x08: /* lgrl */ + case 0x0b: /* stgrl */ + case 0x0c: /* lgfrl */ + case 0x0d: /* lrl */ + case 0x0e: /* llgfrl */ + case 0x0f: /* strl */ + return true; + } + break; + case 0xc6: + switch (insn[0] & 0x0f) { + case 0x00: /* exrl */ + case 0x02: /* pfdrl */ + case 0x04: /* cghrl */ + case 0x05: /* chrl */ + case 0x06: /* clghrl */ + case 0x07: /* clhrl */ + case 0x08: /* cgrl */ + case 0x0a: /* clgrl */ + case 0x0c: /* cgfrl */ + case 0x0d: /* crl */ + case 0x0e: /* clgfrl */ + case 0x0f: /* clrl */ + return true; + } + break; + } + return false; +} + +static void __kprobes copy_instruction(struct kprobe *p) +{ + s64 disp, new_disp; + u64 addr, new_addr; + + memcpy(p->ainsn.insn, p->addr, ((p->opcode >> 14) + 3) & -2); + if (!is_insn_relative_long(p->ainsn.insn)) + return; + /* + * For pc-relative instructions in RIL-b or RIL-c format patch the + * RI2 displacement field. We have already made sure that the insn + * slot for the patched instruction is within the same 2GB area + * as the original instruction (either kernel image or module area). + * Therefore the new displacement will always fit. + */ + disp = *(s32 *)&p->ainsn.insn[1]; + addr = (u64)(unsigned long)p->addr; + new_addr = (u64)(unsigned long)p->ainsn.insn; + new_disp = ((addr + (disp * 2)) - new_addr) / 2; + *(s32 *)&p->ainsn.insn[1] = new_disp; +} + +static inline int is_kernel_addr(void *addr) +{ + return addr < (void *)_end; +} + +static inline int is_module_addr(void *addr) +{ +#ifdef CONFIG_64BIT + BUILD_BUG_ON(MODULES_LEN > (1UL << 31)); + if (addr < (void *)MODULES_VADDR) + return 0; + if (addr > (void *)MODULES_END) + return 0; +#endif + return 1; +} + +static int __kprobes s390_get_insn_slot(struct kprobe *p) +{ + /* + * Get an insn slot that is within the same 2GB area like the original + * instruction. That way instructions with a 32bit signed displacement + * field can be patched and executed within the insn slot. + */ + p->ainsn.insn = NULL; + if (is_kernel_addr(p->addr)) + p->ainsn.insn = get_dmainsn_slot(); + if (is_module_addr(p->addr)) + p->ainsn.insn = get_insn_slot(); + return p->ainsn.insn ? 
0 : -ENOMEM; +} + +static void __kprobes s390_free_insn_slot(struct kprobe *p) +{ + if (!p->ainsn.insn) + return; + if (is_kernel_addr(p->addr)) + free_dmainsn_slot(p->ainsn.insn, 0); + else + free_insn_slot(p->ainsn.insn, 0); + p->ainsn.insn = NULL; +} + int __kprobes arch_prepare_kprobe(struct kprobe *p) { if ((unsigned long) p->addr & 0x01) return -EINVAL; - /* Make sure the probe isn't going on a difficult instruction */ if (is_prohibited_opcode(p->addr)) return -EINVAL; - + if (s390_get_insn_slot(p)) + return -ENOMEM; p->opcode = *p->addr; - memcpy(p->ainsn.insn, p->addr, ((p->opcode >> 14) + 3) & -2); - + copy_instruction(p); return 0; } @@ -186,6 +315,7 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p) void __kprobes arch_remove_kprobe(struct kprobe *p) { + s390_free_insn_slot(p); } static void __kprobes enable_singlestep(struct kprobe_ctlblk *kcb, From ae259925bc8e4880e861b29d166000240dde93de Mon Sep 17 00:00:00 2001 From: Alexander Holler Date: Wed, 11 Sep 2013 14:24:15 -0700 Subject: [PATCH 205/303] drivers/rtc/rtc-hid-sensor-time.c: add module alias to let the module load automatically In order to get the module automatically loaded by hotplug mechanisms a MODULE_DEVICE_TABLE is needed. Therefore add one. This makes it also possible to use a module name other than HID-SENSOR-2000a0 which isn't very descriptive in kernel messages. Signed-off-by: Alexander Holler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-hid-sensor-time.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/rtc/rtc-hid-sensor-time.c b/drivers/rtc/rtc-hid-sensor-time.c index 7273b0139e5c..b5a2874b15ef 100644 --- a/drivers/rtc/rtc-hid-sensor-time.c +++ b/drivers/rtc/rtc-hid-sensor-time.c @@ -23,10 +23,6 @@ #include #include -/* Format: HID-SENSOR-usage_id_in_hex */ -/* Usage ID from spec for Time: 0x2000A0 */ -#define DRIVER_NAME "HID-SENSOR-2000a0" /* must be lowercase */ - enum hid_time_channel { CHANNEL_SCAN_INDEX_YEAR, CHANNEL_SCAN_INDEX_MONTH, @@ -300,9 +296,19 @@ static int hid_time_remove(struct platform_device *pdev) return 0; } +static struct platform_device_id hid_time_ids[] = { + { + /* Format: HID-SENSOR-usage_id_in_hex_lowercase */ + .name = "HID-SENSOR-2000a0", + }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(platform, hid_time_ids); + static struct platform_driver hid_time_platform_driver = { + .id_table = hid_time_ids, .driver = { - .name = DRIVER_NAME, + .name = KBUILD_MODNAME, .owner = THIS_MODULE, }, .probe = hid_time_probe, From 4540bae9cd8c03ec4a26eacad468d2f2f2e85cbb Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Wed, 11 Sep 2013 14:24:16 -0700 Subject: [PATCH 206/303] drivers/rtc/rtc-pcf2127.c: remove empty function The 'remove' function is empty and does not do anything. Delete it. 
Signed-off-by: Sachin Kamat Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-pcf2127.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/rtc/rtc-pcf2127.c b/drivers/rtc/rtc-pcf2127.c index 205b9f7da1b8..1ee514a3972c 100644 --- a/drivers/rtc/rtc-pcf2127.c +++ b/drivers/rtc/rtc-pcf2127.c @@ -203,11 +203,6 @@ static int pcf2127_probe(struct i2c_client *client, return 0; } -static int pcf2127_remove(struct i2c_client *client) -{ - return 0; -} - static const struct i2c_device_id pcf2127_id[] = { { "pcf2127", 0 }, { } @@ -229,7 +224,6 @@ static struct i2c_driver pcf2127_driver = { .of_match_table = of_match_ptr(pcf2127_of_match), }, .probe = pcf2127_probe, - .remove = pcf2127_remove, .id_table = pcf2127_id, }; From 453b4c6db59f7f6411a0b5eb58389a1fa129cc9a Mon Sep 17 00:00:00 2001 From: Jonas Jensen Date: Wed, 11 Sep 2013 14:24:17 -0700 Subject: [PATCH 207/303] rtc: add MOXA ART RTC driver Add RTC driver for MOXA ART SoCs. Signed-off-by: Jonas Jensen Reviewed-by: Mark Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../bindings/rtc/moxa,moxart-rtc.txt | 17 + drivers/rtc/Kconfig | 9 + drivers/rtc/Makefile | 1 + drivers/rtc/rtc-moxart.c | 330 ++++++++++++++++++ 4 files changed, 357 insertions(+) create mode 100644 Documentation/devicetree/bindings/rtc/moxa,moxart-rtc.txt create mode 100644 drivers/rtc/rtc-moxart.c diff --git a/Documentation/devicetree/bindings/rtc/moxa,moxart-rtc.txt b/Documentation/devicetree/bindings/rtc/moxa,moxart-rtc.txt new file mode 100644 index 000000000000..c9d3ac1477fe --- /dev/null +++ b/Documentation/devicetree/bindings/rtc/moxa,moxart-rtc.txt @@ -0,0 +1,17 @@ +MOXA ART real-time clock + +Required properties: + +- compatible : Should be "moxa,moxart-rtc" +- gpio-rtc-sclk : RTC sclk gpio, with zero flags +- gpio-rtc-data : RTC data gpio, with zero flags +- gpio-rtc-reset : RTC reset gpio, with zero flags + +Example: + + rtc: rtc { + compatible = "moxa,moxart-rtc"; + gpio-rtc-sclk = <&gpio 5 0>; + gpio-rtc-data = <&gpio 6 0>; + gpio-rtc-reset = <&gpio 7 0>; + }; diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index 9e3498bf302b..9654aa3c05cb 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -1249,6 +1249,15 @@ config RTC_DRV_SIRFSOC Say "yes" here to support the real time clock on SiRF SOC chips. This driver can also be built as a module called rtc-sirfsoc. +config RTC_DRV_MOXART + tristate "MOXA ART RTC" + help + If you say yes here you get support for the MOXA ART + RTC module. + + This driver can also be built as a module. If so, the module + will be called rtc-moxart + comment "HID Sensor RTC drivers" config RTC_DRV_HID_SENSOR_TIME diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile index d3b4488f48f2..2dff3d2009b5 100644 --- a/drivers/rtc/Makefile +++ b/drivers/rtc/Makefile @@ -130,3 +130,4 @@ obj-$(CONFIG_RTC_DRV_WM831X) += rtc-wm831x.o obj-$(CONFIG_RTC_DRV_WM8350) += rtc-wm8350.o obj-$(CONFIG_RTC_DRV_X1205) += rtc-x1205.o obj-$(CONFIG_RTC_DRV_SIRFSOC) += rtc-sirfsoc.o +obj-$(CONFIG_RTC_DRV_MOXART) += rtc-moxart.o diff --git a/drivers/rtc/rtc-moxart.c b/drivers/rtc/rtc-moxart.c new file mode 100644 index 000000000000..c29dee0946e6 --- /dev/null +++ b/drivers/rtc/rtc-moxart.c @@ -0,0 +1,330 @@ +/* + * MOXA ART RTC driver. + * + * Copyright (C) 2013 Jonas Jensen + * + * Jonas Jensen + * + * Based on code from + * Moxa Technology Co., Ltd. + * + * This file is licensed under the terms of the GNU General Public + * License version 2. 
This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define GPIO_RTC_RESERVED 0x0C +#define GPIO_RTC_DATA_SET 0x10 +#define GPIO_RTC_DATA_CLEAR 0x14 +#define GPIO_RTC_PIN_PULL_ENABLE 0x18 +#define GPIO_RTC_PIN_PULL_TYPE 0x1C +#define GPIO_RTC_INT_ENABLE 0x20 +#define GPIO_RTC_INT_RAW_STATE 0x24 +#define GPIO_RTC_INT_MASKED_STATE 0x28 +#define GPIO_RTC_INT_MASK 0x2C +#define GPIO_RTC_INT_CLEAR 0x30 +#define GPIO_RTC_INT_TRIGGER 0x34 +#define GPIO_RTC_INT_BOTH 0x38 +#define GPIO_RTC_INT_RISE_NEG 0x3C +#define GPIO_RTC_BOUNCE_ENABLE 0x40 +#define GPIO_RTC_BOUNCE_PRE_SCALE 0x44 +#define GPIO_RTC_PROTECT_W 0x8E +#define GPIO_RTC_PROTECT_R 0x8F +#define GPIO_RTC_YEAR_W 0x8C +#define GPIO_RTC_YEAR_R 0x8D +#define GPIO_RTC_DAY_W 0x8A +#define GPIO_RTC_DAY_R 0x8B +#define GPIO_RTC_MONTH_W 0x88 +#define GPIO_RTC_MONTH_R 0x89 +#define GPIO_RTC_DATE_W 0x86 +#define GPIO_RTC_DATE_R 0x87 +#define GPIO_RTC_HOURS_W 0x84 +#define GPIO_RTC_HOURS_R 0x85 +#define GPIO_RTC_MINUTES_W 0x82 +#define GPIO_RTC_MINUTES_R 0x83 +#define GPIO_RTC_SECONDS_W 0x80 +#define GPIO_RTC_SECONDS_R 0x81 +#define GPIO_RTC_DELAY_TIME 8 + +struct moxart_rtc { + struct rtc_device *rtc; + spinlock_t rtc_lock; + int gpio_data, gpio_sclk, gpio_reset; +}; + +static int day_of_year[12] = { 0, 31, 59, 90, 120, 151, 181, + 212, 243, 273, 304, 334 }; + +static void moxart_rtc_write_byte(struct device *dev, u8 data) +{ + struct moxart_rtc *moxart_rtc = dev_get_drvdata(dev); + int i; + + for (i = 0; i < 8; i++, data >>= 1) { + gpio_set_value(moxart_rtc->gpio_sclk, 0); + gpio_set_value(moxart_rtc->gpio_data, ((data & 1) == 1)); + udelay(GPIO_RTC_DELAY_TIME); + gpio_set_value(moxart_rtc->gpio_sclk, 1); + udelay(GPIO_RTC_DELAY_TIME); + } +} + +static u8 moxart_rtc_read_byte(struct device *dev) +{ + struct moxart_rtc *moxart_rtc = dev_get_drvdata(dev); + int i; + u8 data = 0; + + for (i = 0; i < 8; i++) { + gpio_set_value(moxart_rtc->gpio_sclk, 0); + udelay(GPIO_RTC_DELAY_TIME); + gpio_set_value(moxart_rtc->gpio_sclk, 1); + udelay(GPIO_RTC_DELAY_TIME); + if (gpio_get_value(moxart_rtc->gpio_data)) + data |= (1 << i); + udelay(GPIO_RTC_DELAY_TIME); + } + return data; +} + +static u8 moxart_rtc_read_register(struct device *dev, u8 cmd) +{ + struct moxart_rtc *moxart_rtc = dev_get_drvdata(dev); + u8 data; + unsigned long flags; + + local_irq_save(flags); + + gpio_direction_output(moxart_rtc->gpio_data, 0); + gpio_set_value(moxart_rtc->gpio_reset, 1); + udelay(GPIO_RTC_DELAY_TIME); + moxart_rtc_write_byte(dev, cmd); + gpio_direction_input(moxart_rtc->gpio_data); + udelay(GPIO_RTC_DELAY_TIME); + data = moxart_rtc_read_byte(dev); + gpio_set_value(moxart_rtc->gpio_sclk, 0); + gpio_set_value(moxart_rtc->gpio_reset, 0); + udelay(GPIO_RTC_DELAY_TIME); + + local_irq_restore(flags); + + return data; +} + +static void moxart_rtc_write_register(struct device *dev, u8 cmd, u8 data) +{ + struct moxart_rtc *moxart_rtc = dev_get_drvdata(dev); + unsigned long flags; + + local_irq_save(flags); + + gpio_direction_output(moxart_rtc->gpio_data, 0); + gpio_set_value(moxart_rtc->gpio_reset, 1); + udelay(GPIO_RTC_DELAY_TIME); + moxart_rtc_write_byte(dev, cmd); + moxart_rtc_write_byte(dev, data); + gpio_set_value(moxart_rtc->gpio_sclk, 0); + gpio_set_value(moxart_rtc->gpio_reset, 0); + udelay(GPIO_RTC_DELAY_TIME); + + local_irq_restore(flags); +} + +static int moxart_rtc_set_time(struct device *dev, struct rtc_time *tm) +{ + struct 
moxart_rtc *moxart_rtc = dev_get_drvdata(dev); + + spin_lock_irq(&moxart_rtc->rtc_lock); + + moxart_rtc_write_register(dev, GPIO_RTC_PROTECT_W, 0); + moxart_rtc_write_register(dev, GPIO_RTC_YEAR_W, + (((tm->tm_year - 100) / 10) << 4) | + ((tm->tm_year - 100) % 10)); + + moxart_rtc_write_register(dev, GPIO_RTC_MONTH_W, + (((tm->tm_mon + 1) / 10) << 4) | + ((tm->tm_mon + 1) % 10)); + + moxart_rtc_write_register(dev, GPIO_RTC_DATE_W, + ((tm->tm_mday / 10) << 4) | + (tm->tm_mday % 10)); + + moxart_rtc_write_register(dev, GPIO_RTC_HOURS_W, + ((tm->tm_hour / 10) << 4) | + (tm->tm_hour % 10)); + + moxart_rtc_write_register(dev, GPIO_RTC_MINUTES_W, + ((tm->tm_min / 10) << 4) | + (tm->tm_min % 10)); + + moxart_rtc_write_register(dev, GPIO_RTC_SECONDS_W, + ((tm->tm_sec / 10) << 4) | + (tm->tm_sec % 10)); + + moxart_rtc_write_register(dev, GPIO_RTC_PROTECT_W, 0x80); + + spin_unlock_irq(&moxart_rtc->rtc_lock); + + dev_dbg(dev, "%s: success tm_year=%d tm_mon=%d\n" + "tm_mday=%d tm_hour=%d tm_min=%d tm_sec=%d\n", + __func__, tm->tm_year, tm->tm_mon, tm->tm_mday, + tm->tm_hour, tm->tm_min, tm->tm_sec); + + return 0; +} + +static int moxart_rtc_read_time(struct device *dev, struct rtc_time *tm) +{ + struct moxart_rtc *moxart_rtc = dev_get_drvdata(dev); + unsigned char v; + + spin_lock_irq(&moxart_rtc->rtc_lock); + + v = moxart_rtc_read_register(dev, GPIO_RTC_SECONDS_R); + tm->tm_sec = (((v & 0x70) >> 4) * 10) + (v & 0x0F); + + v = moxart_rtc_read_register(dev, GPIO_RTC_MINUTES_R); + tm->tm_min = (((v & 0x70) >> 4) * 10) + (v & 0x0F); + + v = moxart_rtc_read_register(dev, GPIO_RTC_HOURS_R); + if (v & 0x80) { /* 12-hour mode */ + tm->tm_hour = (((v & 0x10) >> 4) * 10) + (v & 0x0F); + if (v & 0x20) { /* PM mode */ + tm->tm_hour += 12; + if (tm->tm_hour >= 24) + tm->tm_hour = 0; + } + } else { /* 24-hour mode */ + tm->tm_hour = (((v & 0x30) >> 4) * 10) + (v & 0x0F); + } + + v = moxart_rtc_read_register(dev, GPIO_RTC_DATE_R); + tm->tm_mday = (((v & 0x30) >> 4) * 10) + (v & 0x0F); + + v = moxart_rtc_read_register(dev, GPIO_RTC_MONTH_R); + tm->tm_mon = (((v & 0x10) >> 4) * 10) + (v & 0x0F); + tm->tm_mon--; + + v = moxart_rtc_read_register(dev, GPIO_RTC_YEAR_R); + tm->tm_year = (((v & 0xF0) >> 4) * 10) + (v & 0x0F); + tm->tm_year += 100; + if (tm->tm_year <= 69) + tm->tm_year += 100; + + v = moxart_rtc_read_register(dev, GPIO_RTC_DAY_R); + tm->tm_wday = (v & 0x0f) - 1; + tm->tm_yday = day_of_year[tm->tm_mon]; + tm->tm_yday += (tm->tm_mday - 1); + if (tm->tm_mon >= 2) { + if (!(tm->tm_year % 4) && (tm->tm_year % 100)) + tm->tm_yday++; + } + + tm->tm_isdst = 0; + + spin_unlock_irq(&moxart_rtc->rtc_lock); + + return 0; +} + +static const struct rtc_class_ops moxart_rtc_ops = { + .read_time = moxart_rtc_read_time, + .set_time = moxart_rtc_set_time, +}; + +static int moxart_rtc_probe(struct platform_device *pdev) +{ + struct moxart_rtc *moxart_rtc; + int ret = 0; + + moxart_rtc = devm_kzalloc(&pdev->dev, sizeof(*moxart_rtc), GFP_KERNEL); + if (!moxart_rtc) { + dev_err(&pdev->dev, "devm_kzalloc failed\n"); + return -ENOMEM; + } + + moxart_rtc->gpio_data = of_get_named_gpio(pdev->dev.of_node, + "gpio-rtc-data", 0); + if (!gpio_is_valid(moxart_rtc->gpio_data)) { + dev_err(&pdev->dev, "invalid gpio (data): %d\n", + moxart_rtc->gpio_data); + return moxart_rtc->gpio_data; + } + + moxart_rtc->gpio_sclk = of_get_named_gpio(pdev->dev.of_node, + "gpio-rtc-sclk", 0); + if (!gpio_is_valid(moxart_rtc->gpio_sclk)) { + dev_err(&pdev->dev, "invalid gpio (sclk): %d\n", + moxart_rtc->gpio_sclk); + return moxart_rtc->gpio_sclk; + } + + 
moxart_rtc->gpio_reset = of_get_named_gpio(pdev->dev.of_node, + "gpio-rtc-reset", 0); + if (!gpio_is_valid(moxart_rtc->gpio_reset)) { + dev_err(&pdev->dev, "invalid gpio (reset): %d\n", + moxart_rtc->gpio_reset); + return moxart_rtc->gpio_reset; + } + + spin_lock_init(&moxart_rtc->rtc_lock); + platform_set_drvdata(pdev, moxart_rtc); + + ret = devm_gpio_request(&pdev->dev, moxart_rtc->gpio_data, "rtc_data"); + if (ret) { + dev_err(&pdev->dev, "can't get rtc_data gpio\n"); + return ret; + } + + ret = devm_gpio_request_one(&pdev->dev, moxart_rtc->gpio_sclk, + GPIOF_DIR_OUT, "rtc_sclk"); + if (ret) { + dev_err(&pdev->dev, "can't get rtc_sclk gpio\n"); + return ret; + } + + ret = devm_gpio_request_one(&pdev->dev, moxart_rtc->gpio_reset, + GPIOF_DIR_OUT, "rtc_reset"); + if (ret) { + dev_err(&pdev->dev, "can't get rtc_reset gpio\n"); + return ret; + } + + moxart_rtc->rtc = devm_rtc_device_register(&pdev->dev, pdev->name, + &moxart_rtc_ops, + THIS_MODULE); + if (IS_ERR(moxart_rtc->rtc)) { + dev_err(&pdev->dev, "devm_rtc_device_register failed\n"); + return PTR_ERR(moxart_rtc->rtc); + } + + return 0; +} + +static const struct of_device_id moxart_rtc_match[] = { + { .compatible = "moxa,moxart-rtc" }, + { }, +}; + +static struct platform_driver moxart_rtc_driver = { + .probe = moxart_rtc_probe, + .driver = { + .name = "moxart-rtc", + .owner = THIS_MODULE, + .of_match_table = moxart_rtc_match, + }, +}; +module_platform_driver(moxart_rtc_driver); + +MODULE_DESCRIPTION("MOXART RTC driver"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jonas Jensen "); From 8af750e3f5ca21eaa5595a96a4cf5eaa996deed4 Mon Sep 17 00:00:00 2001 From: Hebbar Gururaja Date: Wed, 11 Sep 2013 14:24:18 -0700 Subject: [PATCH 208/303] drivers/rtc/rtc-omap.c: add rtc wakeup support to alarm events On some platforms (like AM33xx), a special register (RTC_IRQWAKEEN) is available to enable Alarm Wakeup feature. This register needs to be properly handled for the rtcwake to work properly. Platforms using such IP should set "ti,am3352-rtc" in rtc device dt compatibility node. Signed-off-by: Hebbar Gururaja Acked-by: Kevin Hilman Acked-by: Sekhar Nori Cc: Grant Likely Cc: Rob Herring Cc: Rob Landley Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../devicetree/bindings/rtc/rtc-omap.txt | 6 +- drivers/rtc/rtc-omap.c | 60 ++++++++++++++++--- 2 files changed, 57 insertions(+), 9 deletions(-) diff --git a/Documentation/devicetree/bindings/rtc/rtc-omap.txt b/Documentation/devicetree/bindings/rtc/rtc-omap.txt index b47aa415c820..5a0f02d34d95 100644 --- a/Documentation/devicetree/bindings/rtc/rtc-omap.txt +++ b/Documentation/devicetree/bindings/rtc/rtc-omap.txt @@ -1,7 +1,11 @@ TI Real Time Clock Required properties: -- compatible: "ti,da830-rtc" +- compatible: + - "ti,da830-rtc" - for RTC IP used similar to that on DA8xx SoC family. + - "ti,am3352-rtc" - for RTC IP used similar to that on AM335x SoC family. + This RTC IP has special WAKE-EN Register to enable + Wakeup generation for event Alarm. 
- reg: Address range of rtc register set - interrupts: rtc timer, alarm interrupts in order - interrupt-parent: phandle for the interrupt controller diff --git a/drivers/rtc/rtc-omap.c b/drivers/rtc/rtc-omap.c index c6ffbaec32a4..c7d97ee59327 100644 --- a/drivers/rtc/rtc-omap.c +++ b/drivers/rtc/rtc-omap.c @@ -70,6 +70,8 @@ #define OMAP_RTC_KICK0_REG 0x6c #define OMAP_RTC_KICK1_REG 0x70 +#define OMAP_RTC_IRQWAKEEN 0x7c + /* OMAP_RTC_CTRL_REG bit fields: */ #define OMAP_RTC_CTRL_SPLIT (1<<7) #define OMAP_RTC_CTRL_DISABLE (1<<6) @@ -94,12 +96,21 @@ #define OMAP_RTC_INTERRUPTS_IT_ALARM (1<<3) #define OMAP_RTC_INTERRUPTS_IT_TIMER (1<<2) +/* OMAP_RTC_IRQWAKEEN bit fields: */ +#define OMAP_RTC_IRQWAKEEN_ALARM_WAKEEN (1<<1) + /* OMAP_RTC_KICKER values */ #define KICK0_VALUE 0x83e70b13 #define KICK1_VALUE 0x95a4f1e0 #define OMAP_RTC_HAS_KICKER 0x1 +/* + * Few RTC IP revisions has special WAKE-EN Register to enable Wakeup + * generation for event Alarm. + */ +#define OMAP_RTC_HAS_IRQWAKEEN 0x2 + static void __iomem *rtc_base; #define rtc_read(addr) readb(rtc_base + (addr)) @@ -299,12 +310,18 @@ static struct rtc_class_ops omap_rtc_ops = { static int omap_rtc_alarm; static int omap_rtc_timer; -#define OMAP_RTC_DATA_DA830_IDX 1 +#define OMAP_RTC_DATA_AM3352_IDX 1 +#define OMAP_RTC_DATA_DA830_IDX 2 static struct platform_device_id omap_rtc_devtype[] = { { .name = DRIVER_NAME, - }, { + }, + [OMAP_RTC_DATA_AM3352_IDX] = { + .name = "am3352-rtc", + .driver_data = OMAP_RTC_HAS_KICKER | OMAP_RTC_HAS_IRQWAKEEN, + }, + [OMAP_RTC_DATA_DA830_IDX] = { .name = "da830-rtc", .driver_data = OMAP_RTC_HAS_KICKER, }, @@ -316,6 +333,9 @@ static const struct of_device_id omap_rtc_of_match[] = { { .compatible = "ti,da830-rtc", .data = &omap_rtc_devtype[OMAP_RTC_DATA_DA830_IDX], }, + { .compatible = "ti,am3352-rtc", + .data = &omap_rtc_devtype[OMAP_RTC_DATA_AM3352_IDX], + }, {}, }; MODULE_DEVICE_TABLE(of, omap_rtc_of_match); @@ -464,16 +484,28 @@ static u8 irqstat; static int omap_rtc_suspend(struct device *dev) { + u8 irqwake_stat; + struct platform_device *pdev = to_platform_device(dev); + const struct platform_device_id *id_entry = + platform_get_device_id(pdev); + irqstat = rtc_read(OMAP_RTC_INTERRUPTS_REG); /* FIXME the RTC alarm is not currently acting as a wakeup event - * source, and in fact this enable() call is just saving a flag - * that's never used... + * source on some platforms, and in fact this enable() call is just + * saving a flag that's never used... 
*/ - if (device_may_wakeup(dev)) + if (device_may_wakeup(dev)) { enable_irq_wake(omap_rtc_alarm); - else + + if (id_entry->driver_data & OMAP_RTC_HAS_IRQWAKEEN) { + irqwake_stat = rtc_read(OMAP_RTC_IRQWAKEEN); + irqwake_stat |= OMAP_RTC_IRQWAKEEN_ALARM_WAKEEN; + rtc_write(irqwake_stat, OMAP_RTC_IRQWAKEEN); + } + } else { rtc_write(0, OMAP_RTC_INTERRUPTS_REG); + } /* Disable the clock/module */ pm_runtime_put_sync(dev); @@ -483,13 +515,25 @@ static int omap_rtc_suspend(struct device *dev) static int omap_rtc_resume(struct device *dev) { + u8 irqwake_stat; + struct platform_device *pdev = to_platform_device(dev); + const struct platform_device_id *id_entry = + platform_get_device_id(pdev); + /* Enable the clock/module so that we can access the registers */ pm_runtime_get_sync(dev); - if (device_may_wakeup(dev)) + if (device_may_wakeup(dev)) { disable_irq_wake(omap_rtc_alarm); - else + + if (id_entry->driver_data & OMAP_RTC_HAS_IRQWAKEEN) { + irqwake_stat = rtc_read(OMAP_RTC_IRQWAKEEN); + irqwake_stat &= ~OMAP_RTC_IRQWAKEEN_ALARM_WAKEEN; + rtc_write(irqwake_stat, OMAP_RTC_IRQWAKEEN); + } + } else { rtc_write(irqstat, OMAP_RTC_INTERRUPTS_REG); + } return 0; } #endif From 666a584d3a765a914642f80deef7a33fb309df5d Mon Sep 17 00:00:00 2001 From: Laxman Dewangan Date: Wed, 11 Sep 2013 14:24:19 -0700 Subject: [PATCH 209/303] drivers/rtc/rtc-palmas.c: support for backup battery charging Palmas series device like TPS65913, TPS80036 supports the backup battery for powering the RTC when no other energy source is available. The backup battery is optional, connected to the VBACKUP pin, and can be nonrechargeable or rechargeable. The rechargeable battery can be charged from the system supply using the backup battery charger. Add support for enabling charging of this backup battery. Also add the DT binding document and the new properties to have this support. Signed-off-by: Laxman Dewangan Reviewed-by: Felipe Balbi Acked-by: Kumar Gala Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../devicetree/bindings/rtc/rtc-palmas.txt | 33 +++++++++++++++++ drivers/rtc/rtc-palmas.c | 35 +++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 Documentation/devicetree/bindings/rtc/rtc-palmas.txt diff --git a/Documentation/devicetree/bindings/rtc/rtc-palmas.txt b/Documentation/devicetree/bindings/rtc/rtc-palmas.txt new file mode 100644 index 000000000000..adbccc0a51e1 --- /dev/null +++ b/Documentation/devicetree/bindings/rtc/rtc-palmas.txt @@ -0,0 +1,33 @@ +Palmas RTC controller bindings + +Required properties: +- compatible: + - "ti,palmas-rtc" for palma series of the RTC controller +- interrupt-parent: Parent interrupt device, must be handle of palmas node. +- interrupts: Interrupt number of RTC submodule on device. + +Optional properties: + +- ti,backup-battery-chargeable: The Palmas series device like TPS65913 or + TPS80036 supports the backup battery for powering the RTC when main + battery is removed or in very low power state. The backup battery + can be chargeable or non-chargeable. This flag will tells whether + battery is chargeable or not. If charging battery then driver can + enable the charging. +- ti,backup-battery-charge-high-current: Enable high current charging in + backup battery. Device supports the < 100mA and > 100mA charging. + The high current will be > 100mA. Absence of this property will + charge battery to lower current i.e. < 100mA. + +Example: + palmas: tps65913@58 { + ... 
+ palmas_rtc: rtc { + compatible = "ti,palmas-rtc"; + interrupt-parent = <&palmas>; + interrupts = <8 0>; + ti,backup-battery-chargeable; + ti,backup-battery-charge-high-current; + }; + ... + }; diff --git a/drivers/rtc/rtc-palmas.c b/drivers/rtc/rtc-palmas.c index a1fecc8d97fc..fffb7d3449d7 100644 --- a/drivers/rtc/rtc-palmas.c +++ b/drivers/rtc/rtc-palmas.c @@ -238,6 +238,15 @@ static int palmas_rtc_probe(struct platform_device *pdev) struct palmas *palmas = dev_get_drvdata(pdev->dev.parent); struct palmas_rtc *palmas_rtc = NULL; int ret; + bool enable_bb_charging = false; + bool high_bb_charging; + + if (pdev->dev.of_node) { + enable_bb_charging = of_property_read_bool(pdev->dev.of_node, + "ti,backup-battery-chargeable"); + high_bb_charging = of_property_read_bool(pdev->dev.of_node, + "ti,backup-battery-charge-high-current"); + } palmas_rtc = devm_kzalloc(&pdev->dev, sizeof(struct palmas_rtc), GFP_KERNEL); @@ -254,6 +263,32 @@ static int palmas_rtc_probe(struct platform_device *pdev) palmas_rtc->dev = &pdev->dev; platform_set_drvdata(pdev, palmas_rtc); + if (enable_bb_charging) { + unsigned reg = PALMAS_BACKUP_BATTERY_CTRL_BBS_BBC_LOW_ICHRG; + + if (high_bb_charging) + reg = 0; + + ret = palmas_update_bits(palmas, PALMAS_PMU_CONTROL_BASE, + PALMAS_BACKUP_BATTERY_CTRL, + PALMAS_BACKUP_BATTERY_CTRL_BBS_BBC_LOW_ICHRG, reg); + if (ret < 0) { + dev_err(&pdev->dev, + "BACKUP_BATTERY_CTRL update failed, %d\n", ret); + return ret; + } + + ret = palmas_update_bits(palmas, PALMAS_PMU_CONTROL_BASE, + PALMAS_BACKUP_BATTERY_CTRL, + PALMAS_BACKUP_BATTERY_CTRL_BB_CHG_EN, + PALMAS_BACKUP_BATTERY_CTRL_BB_CHG_EN); + if (ret < 0) { + dev_err(&pdev->dev, + "BACKUP_BATTERY_CTRL update failed, %d\n", ret); + return ret; + } + } + /* Start RTC */ ret = palmas_update_bits(palmas, PALMAS_RTC_BASE, PALMAS_RTC_CTRL_REG, PALMAS_RTC_CTRL_REG_STOP_RTC, From 7707bda3ee4aa4415090755c22e5a9c03fc530dc Mon Sep 17 00:00:00 2001 From: Alexander Holler Date: Wed, 11 Sep 2013 14:24:20 -0700 Subject: [PATCH 210/303] drivers/rtc/rtc-hid-sensor-time.c: improve error handling when rtc register fails Stop processing hid input when registering the RTC fails and handle a NULL returned from devm_rtc_device_register() as a failure too. Signed-off-by: Alexander Holler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-hid-sensor-time.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/rtc/rtc-hid-sensor-time.c b/drivers/rtc/rtc-hid-sensor-time.c index b5a2874b15ef..4e2a81854f51 100644 --- a/drivers/rtc/rtc-hid-sensor-time.c +++ b/drivers/rtc/rtc-hid-sensor-time.c @@ -279,9 +279,11 @@ static int hid_time_probe(struct platform_device *pdev) "hid-sensor-time", &hid_time_rtc_ops, THIS_MODULE); - if (IS_ERR(time_state->rtc)) { + if (IS_ERR_OR_NULL(time_state->rtc)) { + ret = time_state->rtc ? PTR_ERR(time_state->rtc) : -ENODEV; + time_state->rtc = NULL; + sensor_hub_remove_callback(hsdev, HID_USAGE_SENSOR_TIME); dev_err(&pdev->dev, "rtc device register failed!\n"); - return PTR_ERR(time_state->rtc); } return ret; From 1748cbf7f7c464593232cde914f5a103181a83b5 Mon Sep 17 00:00:00 2001 From: Sangjung Woo Date: Wed, 11 Sep 2013 14:24:21 -0700 Subject: [PATCH 211/303] drivers/rtc/rtc-max77686.c: Fix wrong register Fix a read of the wrong register when checking whether the RTC timer has reached the alarm time. 
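After the fix, the pending check reads the register that actually latches the alarm event; in outline (a sketch; the exact alarm bit position in STATUS2 is an assumption for illustration):

        ret = regmap_read(info->max77686->regmap, MAX77686_REG_STATUS2, &val);
        if (ret < 0)
                goto out;
        if (val & (1 << 4))     /* assumed RTCA1: alarm 1 asserted */
                alrm->pending = 1;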
Signed-off-by: Sangjung Woo Signed-off-by: Myugnjoo Ham Reviewed-by: Jonghwa Lee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-max77686.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/rtc/rtc-max77686.c b/drivers/rtc/rtc-max77686.c index 9915cb96014b..9efe118a28ba 100644 --- a/drivers/rtc/rtc-max77686.c +++ b/drivers/rtc/rtc-max77686.c @@ -240,9 +240,9 @@ static int max77686_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alrm) } alrm->pending = 0; - ret = regmap_read(info->max77686->regmap, MAX77686_REG_STATUS1, &val); + ret = regmap_read(info->max77686->regmap, MAX77686_REG_STATUS2, &val); if (ret < 0) { - dev_err(info->dev, "%s:%d fail to read status1 reg(%d)\n", + dev_err(info->dev, "%s:%d fail to read status2 reg(%d)\n", __func__, __LINE__, ret); goto out; } From 0ebbf4397664d66f3e35503f2f3778a5e6377cbf Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Wed, 11 Sep 2013 14:24:22 -0700 Subject: [PATCH 212/303] drivers/rtc/rtc-nuc900.c: use NULL instead of 0 check_rtc_access_enable() returns a pointer, so NULL should be used instead of 0 in order to fix the following sparse warning: drivers/rtc/rtc-nuc900.c:102:16: warning: Using plain integer as NULL pointer Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-nuc900.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-nuc900.c b/drivers/rtc/rtc-nuc900.c index 22861c5e0c59..248653c74b80 100644 --- a/drivers/rtc/rtc-nuc900.c +++ b/drivers/rtc/rtc-nuc900.c @@ -99,7 +99,7 @@ static int *check_rtc_access_enable(struct nuc900_rtc *nuc900_rtc) if (!timeout) return ERR_PTR(-EPERM); - return 0; + return NULL; } static int nuc900_rtc_bcd2bin(unsigned int timereg, From 28984c7d94c27b993d09d4f2a1a2c36bfd26fd23 Mon Sep 17 00:00:00 2001 From: Xianglong Du Date: Wed, 11 Sep 2013 14:24:23 -0700 Subject: [PATCH 213/303] drivers/rtc/rtc-sirfsoc.c: fix kernel warning during wakeup enable_irq_wake() might fail; if it does, we will see a kernel warning in the resume path because disable_irq_wake() is always called. WARNING: at kernel/irq/manage.c:529 irq_set_irq_wake+0xc4/0xf0() Unbalanced IRQ 52 wake disable Modules linked in: ipv6 libcomposite configfs CPU: 0 PID: 1591 Comm: ash Tainted: G W 3.10.0-00854-gdbd86d4-dirty #100 (unwind_backtrace+0x0/0xf8) from (show_stack+0x10/0x14) (show_stack+0x10/0x14) from (warn_slowpath_common+0x54/0x68) (warn_slowpath_common+0x54/0x68) from (warn_slowpath_fmt+0x30/0x40) (warn_slowpath_fmt+0x30/0x40) from (irq_set_irq_wake+0xc4/0xf0) (irq_set_irq_wake+0xc4/0xf0) from (sirfsoc_rtc_restore+0x30/0x38) (sirfsoc_rtc_restore+0x30/0x38) from (platform_pm_restore+0x2c/0x50) (platform_pm_restore+0x2c/0x50) from (dpm_run_callback.clone.6+0x30/0xb0) (dpm_run_callback.clone.6+0x30/0xb0) from (device_resume+0x88/0x134) (device_resume+0x88/0x134) from (dpm_resume+0x114/0x230) (dpm_resume+0x114/0x230) from (hibernation_snapshot+0x178/0x1d0) (hibernation_snapshot+0x178/0x1d0) from (hibernate+0x130/0x1dc) (hibernate+0x130/0x1dc) from (state_store+0xb4/0xc0) (state_store+0xb4/0xc0) from (kobj_attr_store+0x14/0x20) (kobj_attr_store+0x14/0x20) from (sysfs_write_file+0xfc/0x17c) (sysfs_write_file+0xfc/0x17c) from (vfs_write+0xc8/0x194) (vfs_write+0xc8/0x194) from (SyS_write+0x40/0x6c) (SyS_write+0x40/0x6c) from (ret_fast_syscall+0x0/0x30) To avoid an unbalanced "IRQ wake disable", ensure that disable_irq_wake() is called only when enable_irq_wake() has succeeded.
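Condensed from the diff below, the pattern is: arm the wake source only if enable_irq_wake() succeeds, remember that fact, and disarm only what was actually armed:

        /* suspend path */
        if (device_may_wakeup(&pdev->dev) && !enable_irq_wake(rtcdrv->irq))
                rtcdrv->irq_wake = 1;

        /* resume/restore paths */
        if (device_may_wakeup(&pdev->dev) && rtcdrv->irq_wake) {
                disable_irq_wake(rtcdrv->irq);
                rtcdrv->irq_wake = 0;
        }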
Signed-off-by: Xianglong Du Signed-off-by: Barry Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-sirfsoc.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/rtc/rtc-sirfsoc.c b/drivers/rtc/rtc-sirfsoc.c index aa7ed4b5f7f0..63460cf80f1b 100644 --- a/drivers/rtc/rtc-sirfsoc.c +++ b/drivers/rtc/rtc-sirfsoc.c @@ -44,6 +44,7 @@ struct sirfsoc_rtc_drv { struct rtc_device *rtc; u32 rtc_base; u32 irq; + unsigned irq_wake; /* Overflow for every 8 years extra time */ u32 overflow_rtc; #ifdef CONFIG_PM @@ -355,8 +356,8 @@ static int sirfsoc_rtc_suspend(struct device *dev) rtcdrv->saved_counter = sirfsoc_rtc_iobrg_readl(rtcdrv->rtc_base + RTC_CN); rtcdrv->saved_overflow_rtc = rtcdrv->overflow_rtc; - if (device_may_wakeup(&pdev->dev)) - enable_irq_wake(rtcdrv->irq); + if (device_may_wakeup(&pdev->dev) && !enable_irq_wake(rtcdrv->irq)) + rtcdrv->irq_wake = 1; return 0; } @@ -423,8 +424,10 @@ static int sirfsoc_rtc_resume(struct device *dev) struct platform_device *pdev = to_platform_device(dev); struct sirfsoc_rtc_drv *rtcdrv = platform_get_drvdata(pdev); sirfsoc_rtc_thaw(dev); - if (device_may_wakeup(&pdev->dev)) + if (device_may_wakeup(&pdev->dev) && rtcdrv->irq_wake) { disable_irq_wake(rtcdrv->irq); + rtcdrv->irq_wake = 0; + } return 0; } @@ -434,8 +437,10 @@ static int sirfsoc_rtc_restore(struct device *dev) struct platform_device *pdev = to_platform_device(dev); struct sirfsoc_rtc_drv *rtcdrv = platform_get_drvdata(pdev); - if (device_may_wakeup(&pdev->dev)) + if (device_may_wakeup(&pdev->dev) && rtcdrv->irq_wake) { disable_irq_wake(rtcdrv->irq); + rtcdrv->irq_wake = 0; + } return 0; } From 25e2818e385cf3e3198599307f08e044a7c1be97 Mon Sep 17 00:00:00 2001 From: Alexander Shiyan Date: Wed, 11 Sep 2013 14:24:24 -0700 Subject: [PATCH 214/303] drivers/rtc/rtc-ds1742.c: use devm_ioremap_resource() Replace devm_request_mem_region() and devm_ioremap() with devm_ioremap_resource(). 
Signed-off-by: Alexander Shiyan Cc: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-ds1742.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/drivers/rtc/rtc-ds1742.c b/drivers/rtc/rtc-ds1742.c index eccdc62ae1c0..9fba0ae2e768 100644 --- a/drivers/rtc/rtc-ds1742.c +++ b/drivers/rtc/rtc-ds1742.c @@ -56,7 +56,6 @@ struct rtc_plat_data { void __iomem *ioaddr_nvram; void __iomem *ioaddr_rtc; size_t size_nvram; - size_t size; unsigned long last_jiffies; struct bin_attribute nvram_attr; }; @@ -168,22 +167,17 @@ static int ds1742_rtc_probe(struct platform_device *pdev) void __iomem *ioaddr; int ret = 0; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENODEV; pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); if (!pdata) return -ENOMEM; - pdata->size = resource_size(res); - if (!devm_request_mem_region(&pdev->dev, res->start, pdata->size, - pdev->name)) - return -EBUSY; - ioaddr = devm_ioremap(&pdev->dev, res->start, pdata->size); - if (!ioaddr) - return -ENOMEM; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + ioaddr = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(ioaddr)) + return PTR_ERR(ioaddr); pdata->ioaddr_nvram = ioaddr; - pdata->size_nvram = pdata->size - RTC_SIZE; + pdata->size_nvram = resource_size(res) - RTC_SIZE; pdata->ioaddr_rtc = ioaddr + pdata->size_nvram; sysfs_bin_attr_init(&pdata->nvram_attr); From 2cbc21877adf6adc69ca1fbc5fd1edbd7e1e3c59 Mon Sep 17 00:00:00 2001 From: Alexander Shiyan Date: Wed, 11 Sep 2013 14:24:25 -0700 Subject: [PATCH 215/303] drivers/rtc/rtc-ds1742.c: remove unused field "rtc" from private structure Private field "rtc" is not used outside "probe", so there is no reason to keep it. Signed-off-by: Alexander Shiyan Cc: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-ds1742.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/rtc/rtc-ds1742.c b/drivers/rtc/rtc-ds1742.c index 9fba0ae2e768..139934ff14fc 100644 --- a/drivers/rtc/rtc-ds1742.c +++ b/drivers/rtc/rtc-ds1742.c @@ -52,7 +52,6 @@ #define RTC_BATT_FLAG 0x80 struct rtc_plat_data { - struct rtc_device *rtc; void __iomem *ioaddr_nvram; void __iomem *ioaddr_rtc; size_t size_nvram; @@ -206,7 +205,6 @@ static int ds1742_rtc_probe(struct platform_device *pdev) &ds1742_rtc_ops, THIS_MODULE); if (IS_ERR(rtc)) return PTR_ERR(rtc); - pdata->rtc = rtc; ret = sysfs_create_bin_file(&pdev->dev.kobj, &pdata->nvram_attr); From 1735be4b822e8e3808f461372ff3942824790172 Mon Sep 17 00:00:00 2001 From: Alexander Shiyan Date: Wed, 11 Sep 2013 14:24:26 -0700 Subject: [PATCH 216/303] drivers/rtc/rtc-ds1742.c: report to RTC core if retrieved time is invalid Let RTC core decide if the retrieved time is invalid, instead of processing errors in the driver. 
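With this change the read_time callback simply ends with (sketch):

	/* 0 if the date/time is sane, -EINVAL otherwise; the core handles it */
	return rtc_valid_tm(tm);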
Signed-off-by: Alexander Shiyan Cc: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-ds1742.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/rtc/rtc-ds1742.c b/drivers/rtc/rtc-ds1742.c index 139934ff14fc..17b73fdc3b6e 100644 --- a/drivers/rtc/rtc-ds1742.c +++ b/drivers/rtc/rtc-ds1742.c @@ -115,11 +115,7 @@ static int ds1742_rtc_read_time(struct device *dev, struct rtc_time *tm) /* year is 1900 + tm->tm_year */ tm->tm_year = bcd2bin(year) + bcd2bin(century) * 100 - 1900; - if (rtc_valid_tm(tm) < 0) { - dev_err(dev, "retrieved date/time is not valid.\n"); - rtc_time_to_tm(0, tm); - } - return 0; + return rtc_valid_tm(tm); } static const struct rtc_class_ops ds1742_rtc_ops = { From 7c1d69ee11b8986c40a53d8e2238204fc86f5b33 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Wed, 11 Sep 2013 14:24:27 -0700 Subject: [PATCH 217/303] rtc: simplify devm_request_mem_region/devm_ioremap Convert the composition of devm_request_mem_region and devm_ioremap to a single call to devm_ioremap_resource. The associated call to platform_get_resource is also simplified and moved next to the new call to devm_ioremap_resource. This was done using a combination of the semantic patches devm_ioremap_resource.cocci and devm_request_and_ioremap.cocci, found in the scripts/coccinelle/api directory. In rtc-lpc32xx.c and rtc-mv.c, the local variable size is no longer needed. In rtc-ds1511.c the size field of the local structure is not useful any more, and is deleted. Signed-off-by: Julia Lawall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-ds1511.c | 17 +++++------------ drivers/rtc/rtc-ds1553.c | 13 ++++--------- drivers/rtc/rtc-ep93xx.c | 14 +++----------- drivers/rtc/rtc-imxdi.c | 16 ++++------------ drivers/rtc/rtc-lpc32xx.c | 24 ++++-------------------- drivers/rtc/rtc-mv.c | 17 ++++------------- drivers/rtc/rtc-mxc.c | 14 ++++---------- drivers/rtc/rtc-stk17ta8.c | 15 +++++---------- drivers/rtc/rtc-tx4939.c | 14 ++++---------- 9 files changed, 37 insertions(+), 107 deletions(-) diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index 308a8fefe76f..bc7b4fcf603c 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -89,7 +89,6 @@ enum ds1511reg { struct rtc_plat_data { struct rtc_device *rtc; void __iomem *ioaddr; /* virtual base address */ - int size; /* amount of memory mapped */ int irq; unsigned int irqen; int alrm_sec; @@ -479,20 +478,14 @@ static int ds1511_rtc_probe(struct platform_device *pdev) struct rtc_plat_data *pdata; int ret = 0; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENODEV; - pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); if (!pdata) return -ENOMEM; - pdata->size = resource_size(res); - if (!devm_request_mem_region(&pdev->dev, res->start, pdata->size, - pdev->name)) - return -EBUSY; - ds1511_base = devm_ioremap(&pdev->dev, res->start, pdata->size); - if (!ds1511_base) - return -ENOMEM; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + ds1511_base = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(ds1511_base)) + return PTR_ERR(ds1511_base); pdata->ioaddr = ds1511_base; pdata->irq = platform_get_irq(pdev, 0); diff --git a/drivers/rtc/rtc-ds1553.c b/drivers/rtc/rtc-ds1553.c index 8c6c952e90b1..fd31571941f5 100644 --- a/drivers/rtc/rtc-ds1553.c +++ b/drivers/rtc/rtc-ds1553.c @@ -285,19 +285,14 @@ static int ds1553_rtc_probe(struct platform_device *pdev) void __iomem *ioaddr; int ret = 0; - res = 
platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENODEV; pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); if (!pdata) return -ENOMEM; - if (!devm_request_mem_region(&pdev->dev, res->start, RTC_REG_SIZE, - pdev->name)) - return -EBUSY; - ioaddr = devm_ioremap(&pdev->dev, res->start, RTC_REG_SIZE); - if (!ioaddr) - return -ENOMEM; + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + ioaddr = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(ioaddr)) + return PTR_ERR(ioaddr); pdata->ioaddr = ioaddr; pdata->irq = platform_get_irq(pdev, 0); diff --git a/drivers/rtc/rtc-ep93xx.c b/drivers/rtc/rtc-ep93xx.c index 549b3c3792d2..580e7b56bde8 100644 --- a/drivers/rtc/rtc-ep93xx.c +++ b/drivers/rtc/rtc-ep93xx.c @@ -138,17 +138,9 @@ static int ep93xx_rtc_probe(struct platform_device *pdev) return -ENOMEM; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENXIO; - - if (!devm_request_mem_region(&pdev->dev, res->start, - resource_size(res), pdev->name)) - return -EBUSY; - - ep93xx_rtc->mmio_base = devm_ioremap(&pdev->dev, res->start, - resource_size(res)); - if (!ep93xx_rtc->mmio_base) - return -ENXIO; + ep93xx_rtc->mmio_base = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(ep93xx_rtc->mmio_base)) + return PTR_ERR(ep93xx_rtc->mmio_base); pdev->dev.platform_data = ep93xx_rtc; platform_set_drvdata(pdev, ep93xx_rtc); diff --git a/drivers/rtc/rtc-imxdi.c b/drivers/rtc/rtc-imxdi.c index d3a8c8e255de..abd7f9091f34 100644 --- a/drivers/rtc/rtc-imxdi.c +++ b/drivers/rtc/rtc-imxdi.c @@ -375,24 +375,16 @@ static int __init dryice_rtc_probe(struct platform_device *pdev) struct imxdi_dev *imxdi; int rc; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENODEV; - imxdi = devm_kzalloc(&pdev->dev, sizeof(*imxdi), GFP_KERNEL); if (!imxdi) return -ENOMEM; imxdi->pdev = pdev; - if (!devm_request_mem_region(&pdev->dev, res->start, resource_size(res), - pdev->name)) - return -EBUSY; - - imxdi->ioaddr = devm_ioremap(&pdev->dev, res->start, - resource_size(res)); - if (imxdi->ioaddr == NULL) - return -ENOMEM; + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + imxdi->ioaddr = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(imxdi->ioaddr)) + return PTR_ERR(imxdi->ioaddr); spin_lock_init(&imxdi->irq_lock); diff --git a/drivers/rtc/rtc-lpc32xx.c b/drivers/rtc/rtc-lpc32xx.c index 8276ae94a2a9..bfdbcb82d069 100644 --- a/drivers/rtc/rtc-lpc32xx.c +++ b/drivers/rtc/rtc-lpc32xx.c @@ -201,16 +201,9 @@ static int lpc32xx_rtc_probe(struct platform_device *pdev) { struct resource *res; struct lpc32xx_rtc *rtc; - resource_size_t size; int rtcirq; u32 tmp; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) { - dev_err(&pdev->dev, "Can't get memory resource\n"); - return -ENOENT; - } - rtcirq = platform_get_irq(pdev, 0); if (rtcirq < 0 || rtcirq >= NR_IRQS) { dev_warn(&pdev->dev, "Can't get interrupt resource\n"); @@ -224,19 +217,10 @@ static int lpc32xx_rtc_probe(struct platform_device *pdev) } rtc->irq = rtcirq; - size = resource_size(res); - - if (!devm_request_mem_region(&pdev->dev, res->start, size, - pdev->name)) { - dev_err(&pdev->dev, "RTC registers are not free\n"); - return -EBUSY; - } - - rtc->rtc_base = devm_ioremap(&pdev->dev, res->start, size); - if (!rtc->rtc_base) { - dev_err(&pdev->dev, "Can't map memory\n"); - return -ENOMEM; - } + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + rtc->rtc_base = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(rtc->rtc_base)) + return 
PTR_ERR(rtc->rtc_base); spin_lock_init(&rtc->lock); diff --git a/drivers/rtc/rtc-mv.c b/drivers/rtc/rtc-mv.c index baab802f2153..d536c5962c99 100644 --- a/drivers/rtc/rtc-mv.c +++ b/drivers/rtc/rtc-mv.c @@ -221,26 +221,17 @@ static int __init mv_rtc_probe(struct platform_device *pdev) { struct resource *res; struct rtc_plat_data *pdata; - resource_size_t size; u32 rtc_time; int ret = 0; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENODEV; - pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); if (!pdata) return -ENOMEM; - size = resource_size(res); - if (!devm_request_mem_region(&pdev->dev, res->start, size, - pdev->name)) - return -EBUSY; - - pdata->ioaddr = devm_ioremap(&pdev->dev, res->start, size); - if (!pdata->ioaddr) - return -ENOMEM; + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + pdata->ioaddr = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(pdata->ioaddr)) + return PTR_ERR(pdata->ioaddr); pdata->clk = devm_clk_get(&pdev->dev, NULL); /* Not all SoCs require a clock.*/ diff --git a/drivers/rtc/rtc-mxc.c b/drivers/rtc/rtc-mxc.c index ab87bacb8f88..50c572645546 100644 --- a/drivers/rtc/rtc-mxc.c +++ b/drivers/rtc/rtc-mxc.c @@ -377,22 +377,16 @@ static int mxc_rtc_probe(struct platform_device *pdev) unsigned long rate; int ret; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENODEV; - pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); if (!pdata) return -ENOMEM; pdata->devtype = pdev->id_entry->driver_data; - if (!devm_request_mem_region(&pdev->dev, res->start, - resource_size(res), pdev->name)) - return -EBUSY; - - pdata->ioaddr = devm_ioremap(&pdev->dev, res->start, - resource_size(res)); + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + pdata->ioaddr = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(pdata->ioaddr)) + return PTR_ERR(pdata->ioaddr); pdata->clk = devm_clk_get(&pdev->dev, NULL); if (IS_ERR(pdata->clk)) { diff --git a/drivers/rtc/rtc-stk17ta8.c b/drivers/rtc/rtc-stk17ta8.c index af5e97e3f272..a176ba614683 100644 --- a/drivers/rtc/rtc-stk17ta8.c +++ b/drivers/rtc/rtc-stk17ta8.c @@ -294,19 +294,14 @@ static int stk17ta8_rtc_probe(struct platform_device *pdev) void __iomem *ioaddr; int ret = 0; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENODEV; - pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); if (!pdata) return -ENOMEM; - if (!devm_request_mem_region(&pdev->dev, res->start, RTC_REG_SIZE, - pdev->name)) - return -EBUSY; - ioaddr = devm_ioremap(&pdev->dev, res->start, RTC_REG_SIZE); - if (!ioaddr) - return -ENOMEM; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + ioaddr = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(ioaddr)) + return PTR_ERR(ioaddr); pdata->ioaddr = ioaddr; pdata->irq = platform_get_irq(pdev, 0); diff --git a/drivers/rtc/rtc-tx4939.c b/drivers/rtc/rtc-tx4939.c index f9a0677e4e3b..4f87234e0dee 100644 --- a/drivers/rtc/rtc-tx4939.c +++ b/drivers/rtc/rtc-tx4939.c @@ -244,9 +244,6 @@ static int __init tx4939_rtc_probe(struct platform_device *pdev) struct resource *res; int irq, ret; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) - return -ENODEV; irq = platform_get_irq(pdev, 0); if (irq < 0) return -ENODEV; @@ -255,13 +252,10 @@ static int __init tx4939_rtc_probe(struct platform_device *pdev) return -ENOMEM; platform_set_drvdata(pdev, pdata); - if (!devm_request_mem_region(&pdev->dev, res->start, - resource_size(res), pdev->name)) - return -EBUSY; - pdata->rtcreg 
= devm_ioremap(&pdev->dev, res->start, - resource_size(res)); - if (!pdata->rtcreg) - return -EBUSY; + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + pdata->rtcreg = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(pdata->rtcreg)) + return PTR_ERR(pdata->rtcreg); spin_lock_init(&pdata->lock); tx4939_rtc_cmd(pdata->rtcreg, TX4939_RTCCTL_COMMAND_NOP); From 2c92057e45c2d60f859ca3606cfd402c48785d82 Mon Sep 17 00:00:00 2001 From: Vyacheslav Dubeyko Date: Wed, 11 Sep 2013 14:24:28 -0700 Subject: [PATCH 218/303] hfsplus: add necessary declarations for POSIX ACLs support This patchset implements POSIX ACLs support in hfsplus driver. Mac OS X beginning with version 10.4 ("Tiger") support NFSv4 ACLs, which are part of the NFSv4 standard. HFS+ stores ACLs in the form of specially named extended attributes (com.apple.system.Security). But this patchset doesn't use "com.apple.system.Security" extended attributes. It implements support of POSIX ACLs in the form of extended attributes with names "system.posix_acl_access" and "system.posix_acl_default". These xattrs are treated only under Linux. POSIX ACLs doesn't mean something under Mac OS X. Thereby, this patch set provides opportunity to use POSIX ACLs under Linux on HFS+ filesystem. This patch: Add CONFIG_HFSPLUS_FS_POSIX_ACL kernel configuration option, DBG_ACL_MOD debugging flag and acl.h file with declaration of essential functions for support POSIX ACLs in hfsplus driver. Signed-off-by: Vyacheslav Dubeyko Cc: Al Viro Cc: Christoph Hellwig Cc: Hin-Tak Leung Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfsplus/Kconfig | 18 ++++++++++++++++++ fs/hfsplus/acl.h | 30 ++++++++++++++++++++++++++++++ fs/hfsplus/hfsplus_fs.h | 1 + 3 files changed, 49 insertions(+) create mode 100644 fs/hfsplus/acl.h diff --git a/fs/hfsplus/Kconfig b/fs/hfsplus/Kconfig index a63371815aab..24bc20fd42f7 100644 --- a/fs/hfsplus/Kconfig +++ b/fs/hfsplus/Kconfig @@ -11,3 +11,21 @@ config HFSPLUS_FS MacOS 8. It includes all Mac specific filesystem data such as data forks and creator codes, but it also has several UNIX style features such as file ownership and permissions. + +config HFSPLUS_FS_POSIX_ACL + bool "HFS+ POSIX Access Control Lists" + depends on HFSPLUS_FS + select FS_POSIX_ACL + help + POSIX Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the POSIX ACLs for + Linux website . + + It needs to understand that POSIX ACLs are treated only under + Linux. POSIX ACLs doesn't mean something under Mac OS X. + Mac OS X beginning with version 10.4 ("Tiger") support NFSv4 ACLs, + which are part of the NFSv4 standard. + + If you don't know what Access Control Lists are, say N diff --git a/fs/hfsplus/acl.h b/fs/hfsplus/acl.h new file mode 100644 index 000000000000..07c0d4947527 --- /dev/null +++ b/fs/hfsplus/acl.h @@ -0,0 +1,30 @@ +/* + * linux/fs/hfsplus/acl.h + * + * Vyacheslav Dubeyko + * + * Handler for Posix Access Control Lists (ACLs) support. 
+ */ + +#include + +#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL + +/* posix_acl.c */ +struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type); +extern int hfsplus_posix_acl_chmod(struct inode *); +extern int hfsplus_init_posix_acl(struct inode *, struct inode *); + +#else /* CONFIG_HFSPLUS_FS_POSIX_ACL */ +#define hfsplus_get_posix_acl NULL + +static inline int hfsplus_posix_acl_chmod(struct inode *inode) +{ + return 0; +} + +static inline int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir) +{ + return 0; +} +#endif /* CONFIG_HFSPLUS_FS_POSIX_ACL */ diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index ede79317cfb8..2b9cd01696e2 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -30,6 +30,7 @@ #define DBG_EXTENT 0x00000020 #define DBG_BITMAP 0x00000040 #define DBG_ATTR_MOD 0x00000080 +#define DBG_ACL_MOD 0x00000100 #if 0 #define DBG_MASK (DBG_EXTENT|DBG_INODE|DBG_BNODE_MOD) From eef80d4ad1399067f26538a7dd56ff3df71e9278 Mon Sep 17 00:00:00 2001 From: Vyacheslav Dubeyko Date: Wed, 11 Sep 2013 14:24:29 -0700 Subject: [PATCH 219/303] hfsplus: implement POSIX ACLs support Implement POSIX ACLs support in hfsplus driver. Signed-off-by: Vyacheslav Dubeyko Cc: Al Viro Cc: Christoph Hellwig Cc: Hin-Tak Leung Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfsplus/posix_acl.c | 274 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 274 insertions(+) create mode 100644 fs/hfsplus/posix_acl.c diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c new file mode 100644 index 000000000000..b609cc14c72e --- /dev/null +++ b/fs/hfsplus/posix_acl.c @@ -0,0 +1,274 @@ +/* + * linux/fs/hfsplus/posix_acl.c + * + * Vyacheslav Dubeyko + * + * Handler for Posix Access Control Lists (ACLs) support. + */ + +#include "hfsplus_fs.h" +#include "xattr.h" +#include "acl.h" + +struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type) +{ + struct posix_acl *acl; + char *xattr_name; + char *value = NULL; + ssize_t size; + + acl = get_cached_acl(inode, type); + if (acl != ACL_NOT_CACHED) + return acl; + + switch (type) { + case ACL_TYPE_ACCESS: + xattr_name = POSIX_ACL_XATTR_ACCESS; + break; + case ACL_TYPE_DEFAULT: + xattr_name = POSIX_ACL_XATTR_DEFAULT; + break; + default: + return ERR_PTR(-EINVAL); + } + + size = __hfsplus_getxattr(inode, xattr_name, NULL, 0); + + if (size > 0) { + value = (char *)hfsplus_alloc_attr_entry(); + if (unlikely(!value)) + return ERR_PTR(-ENOMEM); + size = __hfsplus_getxattr(inode, xattr_name, value, size); + } + + if (size > 0) + acl = posix_acl_from_xattr(&init_user_ns, value, size); + else if (size == -ENODATA) + acl = NULL; + else + acl = ERR_PTR(size); + + hfsplus_destroy_attr_entry((hfsplus_attr_entry *)value); + + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); + + return acl; +} + +static int hfsplus_set_posix_acl(struct inode *inode, + int type, + struct posix_acl *acl) +{ + int err; + char *xattr_name; + size_t size = 0; + char *value = NULL; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + switch (type) { + case ACL_TYPE_ACCESS: + xattr_name = POSIX_ACL_XATTR_ACCESS; + if (acl) { + err = posix_acl_equiv_mode(acl, &inode->i_mode); + if (err < 0) + return err; + } + err = 0; + break; + + case ACL_TYPE_DEFAULT: + xattr_name = POSIX_ACL_XATTR_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? 
-EACCES : 0; + break; + + default: + return -EINVAL; + } + + if (acl) { + size = posix_acl_xattr_size(acl->a_count); + if (unlikely(size > HFSPLUS_MAX_INLINE_DATA_SIZE)) + return -ENOMEM; + value = (char *)hfsplus_alloc_attr_entry(); + if (unlikely(!value)) + return -ENOMEM; + err = posix_acl_to_xattr(&init_user_ns, acl, value, size); + if (unlikely(err < 0)) + goto end_set_acl; + } + + err = __hfsplus_setxattr(inode, xattr_name, value, size, 0); + +end_set_acl: + hfsplus_destroy_attr_entry((hfsplus_attr_entry *)value); + + if (!err) + set_cached_acl(inode, type, acl); + + return err; +} + +int hfsplus_init_posix_acl(struct inode *inode, struct inode *dir) +{ + int err = 0; + struct posix_acl *acl = NULL; + + hfs_dbg(ACL_MOD, + "[%s]: ino %lu, dir->ino %lu\n", + __func__, inode->i_ino, dir->i_ino); + + if (S_ISLNK(inode->i_mode)) + return 0; + + acl = hfsplus_get_posix_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); + + if (acl) { + if (S_ISDIR(inode->i_mode)) { + err = hfsplus_set_posix_acl(inode, + ACL_TYPE_DEFAULT, + acl); + if (unlikely(err)) + goto init_acl_cleanup; + } + + err = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); + if (unlikely(err < 0)) + return err; + + if (err > 0) + err = hfsplus_set_posix_acl(inode, + ACL_TYPE_ACCESS, + acl); + } else + inode->i_mode &= ~current_umask(); + +init_acl_cleanup: + posix_acl_release(acl); + return err; +} + +int hfsplus_posix_acl_chmod(struct inode *inode) +{ + int err; + struct posix_acl *acl; + + hfs_dbg(ACL_MOD, "[%s]: ino %lu\n", __func__, inode->i_ino); + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + acl = hfsplus_get_posix_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl) || !acl) + return PTR_ERR(acl); + + err = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); + if (unlikely(err)) + return err; + + err = hfsplus_set_posix_acl(inode, ACL_TYPE_ACCESS, acl); + posix_acl_release(acl); + return err; +} + +static int hfsplus_xattr_get_posix_acl(struct dentry *dentry, + const char *name, + void *buffer, + size_t size, + int type) +{ + int err = 0; + struct posix_acl *acl; + + hfs_dbg(ACL_MOD, + "[%s]: ino %lu, buffer %p, size %zu, type %#x\n", + __func__, dentry->d_inode->i_ino, buffer, size, type); + + if (strcmp(name, "") != 0) + return -EINVAL; + + acl = hfsplus_get_posix_acl(dentry->d_inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + + err = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); + posix_acl_release(acl); + + return err; +} + +static int hfsplus_xattr_set_posix_acl(struct dentry *dentry, + const char *name, + const void *value, + size_t size, + int flags, + int type) +{ + int err = 0; + struct inode *inode = dentry->d_inode; + struct posix_acl *acl = NULL; + + hfs_dbg(ACL_MOD, + "[%s]: ino %lu, value %p, size %zu, flags %#x, type %#x\n", + __func__, inode->i_ino, value, size, flags, type); + + if (strcmp(name, "") != 0) + return -EINVAL; + + if (!inode_owner_or_capable(inode)) + return -EPERM; + + if (value) { + acl = posix_acl_from_xattr(&init_user_ns, value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + else if (acl) { + err = posix_acl_valid(acl); + if (err) + goto end_xattr_set_acl; + } + } + + err = hfsplus_set_posix_acl(inode, type, acl); + +end_xattr_set_acl: + posix_acl_release(acl); + return err; +} + +static size_t hfsplus_xattr_list_posix_acl(struct dentry *dentry, + char *list, + size_t list_size, + const char *name, + size_t name_len, + int type) +{ + /* + * This method is not used. 
+ * It is used hfsplus_listxattr() instead of generic_listxattr(). + */ + return -EOPNOTSUPP; +} + +const struct xattr_handler hfsplus_xattr_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .flags = ACL_TYPE_ACCESS, + .list = hfsplus_xattr_list_posix_acl, + .get = hfsplus_xattr_get_posix_acl, + .set = hfsplus_xattr_set_posix_acl, +}; + +const struct xattr_handler hfsplus_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .flags = ACL_TYPE_DEFAULT, + .list = hfsplus_xattr_list_posix_acl, + .get = hfsplus_xattr_get_posix_acl, + .set = hfsplus_xattr_set_posix_acl, +}; From b4c1107cc962613ea3572e5abba861a35d494b98 Mon Sep 17 00:00:00 2001 From: Vyacheslav Dubeyko Date: Wed, 11 Sep 2013 14:24:30 -0700 Subject: [PATCH 220/303] hfsplus: integrate POSIX ACLs support into driver Integrate implemented POSIX ACLs support into hfsplus driver. Signed-off-by: Vyacheslav Dubeyko Cc: Al Viro Cc: Christoph Hellwig Cc: Hin-Tak Leung Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfsplus/Makefile | 2 ++ fs/hfsplus/dir.c | 4 +++ fs/hfsplus/inode.c | 11 +++++++ fs/hfsplus/xattr.c | 62 +++++++++++++++++++++++++++++++++---- fs/hfsplus/xattr.h | 33 ++++++++------------ fs/hfsplus/xattr_security.c | 13 ++++++++ 6 files changed, 99 insertions(+), 26 deletions(-) diff --git a/fs/hfsplus/Makefile b/fs/hfsplus/Makefile index 09d278bb7b91..683fca2e5e65 100644 --- a/fs/hfsplus/Makefile +++ b/fs/hfsplus/Makefile @@ -7,3 +7,5 @@ obj-$(CONFIG_HFSPLUS_FS) += hfsplus.o hfsplus-objs := super.o options.o inode.o ioctl.o extents.o catalog.o dir.o btree.o \ bnode.o brec.o bfind.o tables.o unicode.o wrapper.o bitmap.o part_tbl.o \ attributes.o xattr.o xattr_user.o xattr_security.o xattr_trusted.o + +hfsplus-$(CONFIG_HFSPLUS_FS_POSIX_ACL) += posix_acl.o diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index d8ce4bd17fc5..4a4fea002673 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -16,6 +16,7 @@ #include "hfsplus_fs.h" #include "hfsplus_raw.h" #include "xattr.h" +#include "acl.h" static inline void hfsplus_instantiate(struct dentry *dentry, struct inode *inode, u32 cnid) @@ -529,6 +530,9 @@ const struct inode_operations hfsplus_dir_inode_operations = { .getxattr = generic_getxattr, .listxattr = hfsplus_listxattr, .removexattr = hfsplus_removexattr, +#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL + .get_acl = hfsplus_get_posix_acl, +#endif }; const struct file_operations hfsplus_dir_operations = { diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index f833d35630ab..4d2edaea891c 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -19,6 +19,7 @@ #include "hfsplus_fs.h" #include "hfsplus_raw.h" #include "xattr.h" +#include "acl.h" static int hfsplus_readpage(struct file *file, struct page *page) { @@ -316,6 +317,13 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr) setattr_copy(inode, attr); mark_inode_dirty(inode); + + if (attr->ia_valid & ATTR_MODE) { + error = hfsplus_posix_acl_chmod(inode); + if (unlikely(error)) + return error; + } + return 0; } @@ -383,6 +391,9 @@ static const struct inode_operations hfsplus_file_inode_operations = { .getxattr = generic_getxattr, .listxattr = hfsplus_listxattr, .removexattr = hfsplus_removexattr, +#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL + .get_acl = hfsplus_get_posix_acl, +#endif }; static const struct file_operations hfsplus_file_operations = { diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index f66346155df5..bd8471fb9a6a 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -8,11 +8,16 @@ #include 
"hfsplus_fs.h" #include "xattr.h" +#include "acl.h" const struct xattr_handler *hfsplus_xattr_handlers[] = { &hfsplus_xattr_osx_handler, &hfsplus_xattr_user_handler, &hfsplus_xattr_trusted_handler, +#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL + &hfsplus_xattr_acl_access_handler, + &hfsplus_xattr_acl_default_handler, +#endif &hfsplus_xattr_security_handler, NULL }; @@ -46,11 +51,58 @@ static inline int is_known_namespace(const char *name) return true; } +static int can_set_system_xattr(struct inode *inode, const char *name, + const void *value, size_t size) +{ +#ifdef CONFIG_HFSPLUS_FS_POSIX_ACL + struct posix_acl *acl; + int err; + + if (!inode_owner_or_capable(inode)) + return -EPERM; + + /* + * POSIX_ACL_XATTR_ACCESS is tied to i_mode + */ + if (strcmp(name, POSIX_ACL_XATTR_ACCESS) == 0) { + acl = posix_acl_from_xattr(&init_user_ns, value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + err = posix_acl_equiv_mode(acl, &inode->i_mode); + posix_acl_release(acl); + if (err < 0) + return err; + mark_inode_dirty(inode); + } + /* + * We're changing the ACL. Get rid of the cached one + */ + forget_cached_acl(inode, ACL_TYPE_ACCESS); + + return 0; + } else if (strcmp(name, POSIX_ACL_XATTR_DEFAULT) == 0) { + acl = posix_acl_from_xattr(&init_user_ns, value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + posix_acl_release(acl); + + /* + * We're changing the default ACL. Get rid of the cached one + */ + forget_cached_acl(inode, ACL_TYPE_DEFAULT); + + return 0; + } +#endif /* CONFIG_HFSPLUS_FS_POSIX_ACL */ + return -EOPNOTSUPP; +} + static int can_set_xattr(struct inode *inode, const char *name, const void *value, size_t value_len) { if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return -EOPNOTSUPP; /* TODO: implement ACL support */ + return can_set_system_xattr(inode, name, value, value_len); if (!strncmp(name, XATTR_MAC_OSX_PREFIX, XATTR_MAC_OSX_PREFIX_LEN)) { /* @@ -253,11 +305,10 @@ static int copy_name(char *buffer, const char *xattr_name, int name_len) return len; } -static ssize_t hfsplus_getxattr_finder_info(struct dentry *dentry, +static ssize_t hfsplus_getxattr_finder_info(struct inode *inode, void *value, size_t size) { ssize_t res = 0; - struct inode *inode = dentry->d_inode; struct hfs_find_data fd; u16 entry_type; u16 folder_rec_len = sizeof(struct DInfo) + sizeof(struct DXInfo); @@ -304,10 +355,9 @@ end_getxattr_finder_info: return res; } -ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, +ssize_t __hfsplus_getxattr(struct inode *inode, const char *name, void *value, size_t size) { - struct inode *inode = dentry->d_inode; struct hfs_find_data fd; hfsplus_attr_entry *entry; __be32 xattr_record_type; @@ -333,7 +383,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, } if (!strcmp_xattr_finder_info(name)) - return hfsplus_getxattr_finder_info(dentry, value, size); + return hfsplus_getxattr_finder_info(inode, value, size); if (!HFSPLUS_SB(inode->i_sb)->attr_tree) return -EOPNOTSUPP; diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h index 847b695b984d..841b5698c0fc 100644 --- a/fs/hfsplus/xattr.h +++ b/fs/hfsplus/xattr.h @@ -14,8 +14,8 @@ extern const struct xattr_handler hfsplus_xattr_osx_handler; extern const struct xattr_handler hfsplus_xattr_user_handler; extern const struct xattr_handler hfsplus_xattr_trusted_handler; -/*extern const struct xattr_handler hfsplus_xattr_acl_access_handler;*/ -/*extern const struct xattr_handler hfsplus_xattr_acl_default_handler;*/ +extern const struct xattr_handler 
hfsplus_xattr_acl_access_handler; +extern const struct xattr_handler hfsplus_xattr_acl_default_handler; extern const struct xattr_handler hfsplus_xattr_security_handler; extern const struct xattr_handler *hfsplus_xattr_handlers[]; @@ -29,9 +29,17 @@ static inline int hfsplus_setxattr(struct dentry *dentry, const char *name, return __hfsplus_setxattr(dentry->d_inode, name, value, size, flags); } -ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, +ssize_t __hfsplus_getxattr(struct inode *inode, const char *name, void *value, size_t size); +static inline ssize_t hfsplus_getxattr(struct dentry *dentry, + const char *name, + void *value, + size_t size) +{ + return __hfsplus_getxattr(dentry->d_inode, name, value, size); +} + ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size); int hfsplus_removexattr(struct dentry *dentry, const char *name); @@ -39,22 +47,7 @@ int hfsplus_removexattr(struct dentry *dentry, const char *name); int hfsplus_init_security(struct inode *inode, struct inode *dir, const struct qstr *qstr); -static inline int hfsplus_init_acl(struct inode *inode, struct inode *dir) -{ - /*TODO: implement*/ - return 0; -} - -static inline int hfsplus_init_inode_security(struct inode *inode, - struct inode *dir, - const struct qstr *qstr) -{ - int err; - - err = hfsplus_init_acl(inode, dir); - if (!err) - err = hfsplus_init_security(inode, dir, qstr); - return err; -} +int hfsplus_init_inode_security(struct inode *inode, struct inode *dir, + const struct qstr *qstr); #endif diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c index 83b842f113c5..00722765ea79 100644 --- a/fs/hfsplus/xattr_security.c +++ b/fs/hfsplus/xattr_security.c @@ -9,6 +9,7 @@ #include #include "hfsplus_fs.h" #include "xattr.h" +#include "acl.h" static int hfsplus_security_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size, int type) @@ -96,6 +97,18 @@ int hfsplus_init_security(struct inode *inode, struct inode *dir, &hfsplus_initxattrs, NULL); } +int hfsplus_init_inode_security(struct inode *inode, + struct inode *dir, + const struct qstr *qstr) +{ + int err; + + err = hfsplus_init_posix_acl(inode, dir); + if (!err) + err = hfsplus_init_security(inode, dir, qstr); + return err; +} + const struct xattr_handler hfsplus_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = hfsplus_security_listxattr, From 73af963f9f3036dffed55c3a2898598186db1045 Mon Sep 17 00:00:00 2001 From: Mark Grondona Date: Wed, 11 Sep 2013 14:24:31 -0700 Subject: [PATCH 221/303] __ptrace_may_access() should not deny sub-threads __ptrace_may_access() checks get_dumpable/ptrace_has_cap/etc if task != current, this can can lead to surprising results. For example, a sub-thread can't readlink("/proc/self/exe") if the executable is not readable. setup_new_exec()->would_dump() notices that inode_permission(MAY_READ) fails and then it does set_dumpable(suid_dumpable). After that get_dumpable() fails. (It is not clear why proc_pid_readlink() checks get_dumpable(), perhaps we could add PTRACE_MODE_NODUMPABLE) Change __ptrace_may_access() to use same_thread_group() instead of "task == current". Any security check is pointless when the tasks share the same ->mm. 
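An illustrative reproducer for the readlink() case above (assumes the executable itself is not readable and suid_dumpable = 0; the assert in the sub-thread fails without this change):

	#include <assert.h>
	#include <pthread.h>
	#include <unistd.h>

	static void *tfunc(void *arg)
	{
		char buf[64];
		/* fails from a sub-thread before this change */
		assert(readlink("/proc/self/exe", buf, sizeof(buf)) >= 0);
		return NULL;
	}

	int main(void)
	{
		pthread_t t;
		pthread_create(&t, NULL, tfunc, NULL);
		pthread_join(t, NULL);
		return 0;
	}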
Signed-off-by: Mark Grondona Signed-off-by: Ben Woodard Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index a146ee327f6a..dd562e9aa2c8 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -236,7 +236,7 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) */ int dumpable = 0; /* Don't let security modules deny introspection */ - if (task == current) + if (same_thread_group(task, current)) return 0; rcu_read_lock(); tcred = __task_cred(task); From 65aafb1e7484b7434a0c1d4c593191ebe5776a2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Graber?= Date: Wed, 11 Sep 2013 14:24:32 -0700 Subject: [PATCH 222/303] coredump: add new %P variable in core_pattern MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new %P variable to be used in core_pattern. This variable contains the global PID (PID in the init namespace) as %p contains the PID in the current namespace which isn't always what we want. The main use for this is to make it easier to handle crashes that happened within a container. With that new variables it's possible to have the crashes dumped into the container or forwarded to the host with the right PID (from the host's point of view). Signed-off-by: Stéphane Graber Reported-by: Hans Feldt Cc: Alexander Viro Cc: Eric W. Biederman Cc: Andy Whitcroft Acked-by: Serge E. Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/kernel.txt | 1 + fs/coredump.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index ab7d16efa96b..9d4c1d18ad44 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -182,6 +182,7 @@ core_pattern is used to specify a core dumpfile pattern name. % '%' is dropped %% output one '%' %p pid + %P global pid (init PID namespace) %u uid %g gid %d dump mode, matches PR_SET_DUMPABLE and diff --git a/fs/coredump.c b/fs/coredump.c index 72f816d6cad9..9bdeca12ae0e 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -190,6 +190,11 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm) err = cn_printf(cn, "%d", task_tgid_vnr(current)); break; + /* global pid */ + case 'P': + err = cn_printf(cn, "%d", + task_tgid_nr(current)); + break; /* uid */ case 'u': err = cn_printf(cn, "%d", cred->uid); From be49b30a98fe7e20f898fcfe7b6c082700fb96e8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 11 Sep 2013 14:24:34 -0700 Subject: [PATCH 223/303] fs/file_table.c:fput(): make comment more truthful Cc: "Eric W. Biederman" Cc: Andrey Vagin Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/file_table.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/file_table.c b/fs/file_table.c index 322cd37626cb..abdd15ad13c9 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -311,8 +311,7 @@ void fput(struct file *file) return; /* * After this task has run exit_task_work(), - * task_work_add() will fail. free_ipc_ns()-> - * shm_destroy() can do this. Fall through to delayed + * task_work_add() will fail. Fall through to delayed * fput to avoid leaking *file. 
 */
 }

From 4649602265495a3bb776d777c91dba569f4afb5d Mon Sep 17 00:00:00 2001
From: Minto Joseph
Date: Wed, 11 Sep 2013 14:24:35 -0700
Subject: [PATCH 224/303] Documentation/filesystems/proc.txt: fix mistake in the description of Committed_AS

Fix a mistake in the description of Committed_AS in the kernel documentation.

Signed-off-by: Minto Joseph
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 Documentation/filesystems/proc.txt | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index fcc22c982a25..823c95faebd2 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -854,16 +854,15 @@
 Committed_AS: The amount of memory presently allocated on the system.
              The committed memory is a sum of all of the memory which
              has been allocated by processes, even if it has not been
              "used" by them as of yet. A process which malloc()'s 1G
-             of memory, but only touches 300M of it will only show up
-             as using 300M of memory even if it has the address space
-             allocated for the entire 1G. This 1G is memory which has
-             been "committed" to by the VM and can be used at any time
-             by the allocating application. With strict overcommit
-             enabled on the system (mode 2 in 'vm.overcommit_memory'),
-             allocations which would exceed the CommitLimit (detailed
-             above) will not be permitted. This is useful if one needs
-             to guarantee that processes will not fail due to lack of
-             memory once that memory has been successfully allocated.
+             of memory, but only touches 300M of it will show up as
+             using 1G. This 1G is memory which has been "committed" to
+             by the VM and can be used at any time by the allocating
+             application. With strict overcommit enabled on the system
+             (mode 2 in 'vm.overcommit_memory'),allocations which would
+             exceed the CommitLimit (detailed above) will not be permitted.
+             This is useful if one needs to guarantee that processes will
+             not fail due to lack of memory once that memory has been
+             successfully allocated.
 VmallocTotal: total size of vmalloc memory area
 VmallocUsed: amount of vmalloc area which is used
 VmallocChunk: largest contiguous block of vmalloc area which is free

From a3c039929d01f793c47922017b6c0ae438e11598 Mon Sep 17 00:00:00 2001
From: Chen Gang
Date: Wed, 11 Sep 2013 14:24:35 -0700
Subject: [PATCH 225/303] fs/proc/task_mmu.c: check the return value of mpol_to_str()

mpol_to_str() may fail and leave the buffer unfilled (e.g. it returns -EINVAL), so we need to check its return value; otherwise the buffer may not be NUL-terminated and the subsequent seq_printf() will print garbage. The failure return must come after mpol_cond_put() to match get_vma_policy().
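Condensed, the ordering the fix establishes in show_numa_map() (fragment; m, vma, pol and buffer as in that function):

	n = mpol_to_str(buffer, sizeof(buffer), pol);
	mpol_cond_put(pol);	/* drop the policy reference unconditionally */
	if (n < 0)
		return n;	/* buffer contents are undefined on failure */

	seq_printf(m, "%08lx %s", vma->vm_start, buffer);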
Signed-off-by: Chen Gang Cc: Cyrill Gorcunov Cc: Mel Gorman Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 09228639b83d..7366e9d63cee 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1402,8 +1402,10 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) walk.mm = mm; pol = get_vma_policy(task, vma, vma->vm_start); - mpol_to_str(buffer, sizeof(buffer), pol); + n = mpol_to_str(buffer, sizeof(buffer), pol); mpol_cond_put(pol); + if (n < 0) + return n; seq_printf(m, "%08lx %s", vma->vm_start, buffer); From 96d0df79f2644fc823f26c06491e182d87a90c2a Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:24:37 -0700 Subject: [PATCH 226/303] proc: make proc_fd_permission() thread-friendly proc_fd_permission() says "process can still access /proc/self/fd after it has executed a setuid()", but the "task_pid() = proc_pid() check only helps if the task is group leader, /proc/self points to /proc/. Change this check to use task_tgid() so that the whole thread group can access its /proc/self/fd or /proc//fd. Notes: - CLONE_THREAD does not require CLONE_FILES so task->files can differ, but I don't think this can lead to any security problem. And this matches same_thread_group() in __ptrace_may_access(). - /proc/self should probably point to /proc/, but it is too late to change the rules. Perhaps it makes sense to add /proc/thread though. Test-case: void *tfunc(void *arg) { assert(opendir("/proc/self/fd")); return NULL; } int main(void) { pthread_t t; pthread_create(&t, NULL, tfunc, NULL); pthread_join(t, NULL); return 0; } fails if, say, this executable is not readable and suid_dumpable = 0. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/fd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 0ff80f9b930f..985ea881b5bc 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -286,7 +286,7 @@ int proc_fd_permission(struct inode *inode, int mask) int rv = generic_permission(inode, mask); if (rv == 0) return 0; - if (task_pid(current) == proc_pid(inode)) + if (task_tgid(current) == proc_pid(inode)) rv = 0; return rv; } From 5d1baf3b63bfc8c709dc44df85ff1475c7ef489d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:24:38 -0700 Subject: [PATCH 227/303] exec: introduce exec_binprm() for "depth == 0" code task_pid_nr_ns() and trace/ptrace code in the middle of the recursive search_binary_handler() looks confusing and imho annoying. We only need this code if "depth == 0", lets add a simple helper which calls search_binary_handler() and does trace_sched_process_exec() + ptrace_event(). The patch also moves the setting of task->did_exec, we need to do this only once. Note: we can kill either task->did_exec or PF_FORKNOEXEC. 
Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Evgeniy Polyakov Cc: Zach Levis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 2d1e52a58fe9..4d95b4709ea0 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1373,7 +1373,6 @@ int search_binary_handler(struct linux_binprm *bprm) unsigned int depth = bprm->recursion_depth; int try,retval; struct linux_binfmt *fmt; - pid_t old_pid, old_vpid; /* This allows 4 levels of binfmt rewrites before failing hard. */ if (depth > 5) @@ -1387,12 +1386,6 @@ int search_binary_handler(struct linux_binprm *bprm) if (retval) return retval; - /* Need to fetch pid before load_binary changes it */ - old_pid = current->pid; - rcu_read_lock(); - old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent)); - rcu_read_unlock(); - retval = -ENOENT; for (try=0; try<2; try++) { read_lock(&binfmt_lock); @@ -1407,16 +1400,11 @@ int search_binary_handler(struct linux_binprm *bprm) retval = fn(bprm); bprm->recursion_depth = depth; if (retval >= 0) { - if (depth == 0) { - trace_sched_process_exec(current, old_pid, bprm); - ptrace_event(PTRACE_EVENT_EXEC, old_vpid); - } put_binfmt(fmt); allow_write_access(bprm->file); if (bprm->file) fput(bprm->file); bprm->file = NULL; - current->did_exec = 1; proc_exec_connector(current); return retval; } @@ -1450,9 +1438,29 @@ int search_binary_handler(struct linux_binprm *bprm) } return retval; } - EXPORT_SYMBOL(search_binary_handler); +static int exec_binprm(struct linux_binprm *bprm) +{ + pid_t old_pid, old_vpid; + int ret; + + /* Need to fetch pid before load_binary changes it */ + old_pid = current->pid; + rcu_read_lock(); + old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent)); + rcu_read_unlock(); + + ret = search_binary_handler(bprm); + if (ret >= 0) { + trace_sched_process_exec(current, old_pid, bprm); + ptrace_event(PTRACE_EVENT_EXEC, old_vpid); + current->did_exec = 1; + } + + return ret; +} + /* * sys_execve() executes a new program. */ @@ -1541,7 +1549,7 @@ static int do_execve_common(const char *filename, if (retval < 0) goto out; - retval = search_binary_handler(bprm); + retval = exec_binprm(bprm); if (retval < 0) goto out; From 131b2f9f1214f338f0bf7c0d9760019f2b1d0c20 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:24:39 -0700 Subject: [PATCH 228/303] exec: kill "int depth" in search_binary_handler() Nobody except search_binary_handler() should touch ->recursion_depth, "int depth" buys nothing but complicates the code, kill it. Probably we should also kill "fn" and the !NULL check, ->load_binary should be always defined. And it can not go away after read_unlock() or this code is buggy anyway. Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Evgeniy Polyakov Cc: Zach Levis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 9 ++++----- include/linux/binfmts.h | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 4d95b4709ea0..b6e35ec818a2 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1370,12 +1370,11 @@ EXPORT_SYMBOL(remove_arg_zero); */ int search_binary_handler(struct linux_binprm *bprm) { - unsigned int depth = bprm->recursion_depth; - int try,retval; + int try, retval; struct linux_binfmt *fmt; /* This allows 4 levels of binfmt rewrites before failing hard. 
*/ - if (depth > 5) + if (bprm->recursion_depth > 5) return -ELOOP; retval = security_bprm_check(bprm); @@ -1396,9 +1395,9 @@ int search_binary_handler(struct linux_binprm *bprm) if (!try_module_get(fmt->module)) continue; read_unlock(&binfmt_lock); - bprm->recursion_depth = depth + 1; + bprm->recursion_depth++; retval = fn(bprm); - bprm->recursion_depth = depth; + bprm->recursion_depth--; if (retval >= 0) { put_binfmt(fmt); allow_write_access(bprm->file); diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 70cf138690e9..e8112ae50531 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -31,7 +31,7 @@ struct linux_binprm { #ifdef __alpha__ unsigned int taso:1; #endif - unsigned int recursion_depth; + unsigned int recursion_depth; /* only for search_binary_handler() */ struct file * file; struct cred *cred; /* new credentials */ int unsafe; /* how unsafe this exec is (mask of LSM_UNSAFE_*) */ From 9beb266f2d7e5362c5bb9f999255aa1af5318aef Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:24:40 -0700 Subject: [PATCH 229/303] exec: proc_exec_connector() should be called only once A separate one-liner with the minor fix. PROC_EVENT_EXEC reports the "exec" event, but this message is sent at least twice if search_binary_handler() is called by ->load_binary() recursively, say, load_script(). Move it to exec_binprm(), this is "depth == 0" code too. Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Evgeniy Polyakov Cc: Zach Levis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/exec.c b/fs/exec.c index b6e35ec818a2..d51f7172832b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1404,7 +1404,6 @@ int search_binary_handler(struct linux_binprm *bprm) if (bprm->file) fput(bprm->file); bprm->file = NULL; - proc_exec_connector(current); return retval; } read_lock(&binfmt_lock); @@ -1455,6 +1454,7 @@ static int exec_binprm(struct linux_binprm *bprm) trace_sched_process_exec(current, old_pid, bprm); ptrace_event(PTRACE_EVENT_EXEC, old_vpid); current->did_exec = 1; + proc_exec_connector(current); } return ret; From 52f14282bb0c3d3e5ba2a9eaacb12ff37a033e7e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:24:41 -0700 Subject: [PATCH 230/303] exec: move allow_write_access/fput to exec_binprm() When search_binary_handler() succeeds it does allow_write_access() and fput(), then it clears bprm->file to ensure the caller will not do the same. We can simply move this code to exec_binprm() which is called only once. In fact we could move this to free_bprm() and remove the same code in do_execve_common's error path. 
Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Evgeniy Polyakov Cc: Zach Levis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index d51f7172832b..a4cfd1d725e0 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1400,10 +1400,6 @@ int search_binary_handler(struct linux_binprm *bprm) bprm->recursion_depth--; if (retval >= 0) { put_binfmt(fmt); - allow_write_access(bprm->file); - if (bprm->file) - fput(bprm->file); - bprm->file = NULL; return retval; } read_lock(&binfmt_lock); @@ -1455,6 +1451,12 @@ static int exec_binprm(struct linux_binprm *bprm) ptrace_event(PTRACE_EVENT_EXEC, old_vpid); current->did_exec = 1; proc_exec_connector(current); + + if (bprm->file) { + allow_write_access(bprm->file); + fput(bprm->file); + bprm->file = NULL; /* to catch use-after-free */ + } } return ret; From 92eaa565add62d56b90987f58ea9feafc5a7c183 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:24:42 -0700 Subject: [PATCH 231/303] exec: kill ->load_binary != NULL check in search_binary_handler() search_binary_handler() checks ->load_binary != NULL for no reason, this method should be always defined. Turn this check into WARN_ON() and move it into __register_binfmt(). Also, kill the function pointer. The current code looks confusing, as if ->load_binary can go away after read_unlock(&binfmt_lock). But we rely on module_get(fmt->module), this fmt can't be changed or unregistered, otherwise this code is buggy anyway. Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Evgeniy Polyakov Cc: Zach Levis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index a4cfd1d725e0..7b92fbfa63aa 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -74,6 +74,8 @@ static DEFINE_RWLOCK(binfmt_lock); void __register_binfmt(struct linux_binfmt * fmt, int insert) { BUG_ON(!fmt); + if (WARN_ON(!fmt->load_binary)) + return; write_lock(&binfmt_lock); insert ? list_add(&fmt->lh, &formats) : list_add_tail(&fmt->lh, &formats); @@ -1389,14 +1391,11 @@ int search_binary_handler(struct linux_binprm *bprm) for (try=0; try<2; try++) { read_lock(&binfmt_lock); list_for_each_entry(fmt, &formats, lh) { - int (*fn)(struct linux_binprm *) = fmt->load_binary; - if (!fn) - continue; if (!try_module_get(fmt->module)) continue; read_unlock(&binfmt_lock); bprm->recursion_depth++; - retval = fn(bprm); + retval = fmt->load_binary(bprm); bprm->recursion_depth--; if (retval >= 0) { put_binfmt(fmt); From cb7b6b1cbc20a970c7124efae1c2478155604b54 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:24:44 -0700 Subject: [PATCH 232/303] exec: cleanup the CONFIG_MODULES logic search_binary_handler() uses "for (try=0; try<2; try++)" to avoid "goto" but the code looks too complicated and horrible imho. We still need to check "try == 0" before request_module() and add the additional "break" for !CONFIG_MODULES case. Kill this loop and use a simple "bool need_retry" + "goto retry". The code looks much simpler and we do not even need ifdef's, gcc can optimize out the "if (need_retry)" block if !IS_ENABLED(). 
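Reduced to its skeleton, the resulting control flow looks like this (illustrative; try_binfmt_handlers() and request_binfmt_module() are hypothetical stand-ins for the real loop and the request_module() call in the diff below):

	bool need_retry = IS_ENABLED(CONFIG_MODULES);
	int retval;
retry:
	retval = try_binfmt_handlers(bprm);	/* hypothetical */
	if (need_retry && retval == -ENOEXEC) {
		request_binfmt_module(bprm);	/* hypothetical */
		need_retry = false;
		goto retry;
	}
	return retval;

With CONFIG_MODULES=n, IS_ENABLED(CONFIG_MODULES) is a compile-time 0, need_retry is always false, and gcc drops the whole retry block without any #ifdef.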
Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Evgeniy Polyakov Cc: Zach Levis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 72 +++++++++++++++++++++++++------------------------------ 1 file changed, 33 insertions(+), 39 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 7b92fbfa63aa..ba357e6aea98 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1367,13 +1367,15 @@ out: } EXPORT_SYMBOL(remove_arg_zero); +#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) /* * cycle the list of binary formats handler, until one recognizes the image */ int search_binary_handler(struct linux_binprm *bprm) { - int try, retval; + bool need_retry = IS_ENABLED(CONFIG_MODULES); struct linux_binfmt *fmt; + int retval; /* This allows 4 levels of binfmt rewrites before failing hard. */ if (bprm->recursion_depth > 5) @@ -1388,47 +1390,39 @@ int search_binary_handler(struct linux_binprm *bprm) return retval; retval = -ENOENT; - for (try=0; try<2; try++) { - read_lock(&binfmt_lock); - list_for_each_entry(fmt, &formats, lh) { - if (!try_module_get(fmt->module)) - continue; - read_unlock(&binfmt_lock); - bprm->recursion_depth++; - retval = fmt->load_binary(bprm); - bprm->recursion_depth--; - if (retval >= 0) { - put_binfmt(fmt); - return retval; - } - read_lock(&binfmt_lock); - put_binfmt(fmt); - if (retval != -ENOEXEC || bprm->mm == NULL) - break; - if (!bprm->file) { - read_unlock(&binfmt_lock); - return retval; - } - } + retry: + read_lock(&binfmt_lock); + list_for_each_entry(fmt, &formats, lh) { + if (!try_module_get(fmt->module)) + continue; read_unlock(&binfmt_lock); -#ifdef CONFIG_MODULES - if (retval != -ENOEXEC || bprm->mm == NULL) { - break; - } else { -#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) - if (printable(bprm->buf[0]) && - printable(bprm->buf[1]) && - printable(bprm->buf[2]) && - printable(bprm->buf[3])) - break; /* -ENOEXEC */ - if (try) - break; /* -ENOEXEC */ - request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2])); + bprm->recursion_depth++; + retval = fmt->load_binary(bprm); + bprm->recursion_depth--; + if (retval >= 0) { + put_binfmt(fmt); + return retval; + } + read_lock(&binfmt_lock); + put_binfmt(fmt); + if (retval != -ENOEXEC || bprm->mm == NULL) + break; + if (!bprm->file) { + read_unlock(&binfmt_lock); + return retval; } -#else - break; -#endif } + read_unlock(&binfmt_lock); + + if (need_retry && retval == -ENOEXEC && bprm->mm) { + if (printable(bprm->buf[0]) && printable(bprm->buf[1]) && + printable(bprm->buf[2]) && printable(bprm->buf[3])) + return retval; + request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)); + need_retry = false; + goto retry; + } + return retval; } EXPORT_SYMBOL(search_binary_handler); From 4e0621a07ea58a0dc15859be3b743bdeb194a51b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:24:45 -0700 Subject: [PATCH 233/303] exec: don't retry if request_module() fails A separate one-liner for better documentation. It doesn't make sense to retry if request_module() fails to exec /sbin/modprobe, add the additional "request_module() < 0" check. However, this logic still doesn't look exactly right: 1. It would be better to check "request_module() != 0", the user space modprobe process should report the correct exit code. But I didn't dare to add the user-visible change. 2. The whole ENOEXEC logic looks suboptimal. Suppose that we try to exec a "#!path-to-unsupported-binary" script. 
In this case request_module() + "retry" will be done twice: first by the "depth == 1" code, and then again by the "depth == 0" caller, which doesn't make sense. 3. And note that in the case above bprm->buf was already changed by load_script()->prepare_binprm(), so this looks even more ugly. Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Evgeniy Polyakov Cc: Zach Levis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/exec.c b/fs/exec.c index ba357e6aea98..635b586de336 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1418,7 +1418,8 @@ int search_binary_handler(struct linux_binprm *bprm) if (printable(bprm->buf[0]) && printable(bprm->buf[1]) && printable(bprm->buf[2]) && printable(bprm->buf[3])) return retval; - request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)); + if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0) + return retval; need_retry = false; goto retry; } From 6b3c538f5b2cfc53cb6803ec5001bbcf8f18a98e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:24:46 -0700 Subject: [PATCH 234/303] exec: cleanup the error handling in search_binary_handler() The error handling and ret-from-loop look confusing and inconsistent. - "retval >= 0" simply returns - "!bprm->file" returns too, but with read_unlock() because binfmt_lock was already re-acquired - "retval != -ENOEXEC || bprm->mm == NULL" does "break" and relies on the same check after the main loop Consolidate these checks into a single if/return statement. need_retry still checks "retval == -ENOEXEC", but this and the -ENOENT initialization before the main loop are not strictly needed; they only matter for the pathological and impossible list_empty(&formats) case. It is not clear why we check "bprm->mm == NULL"; probably this should be removed. Signed-off-by: Oleg Nesterov Acked-by: Kees Cook Cc: Al Viro Cc: Evgeniy Polyakov Cc: Zach Levis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 635b586de336..8875dd10ae7a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1399,22 +1399,17 @@ int search_binary_handler(struct linux_binprm *bprm) bprm->recursion_depth++; retval = fmt->load_binary(bprm); bprm->recursion_depth--; - if (retval >= 0) { + if (retval >= 0 || retval != -ENOEXEC || + bprm->mm == NULL || bprm->file == NULL) { put_binfmt(fmt); return retval; } read_lock(&binfmt_lock); put_binfmt(fmt); - if (retval != -ENOEXEC || bprm->mm == NULL) - break; - if (!bprm->file) { - read_unlock(&binfmt_lock); - return retval; - } } read_unlock(&binfmt_lock); - if (need_retry && retval == -ENOEXEC && bprm->mm) { + if (need_retry && retval == -ENOEXEC) { if (printable(bprm->buf[0]) && printable(bprm->buf[1]) && printable(bprm->buf[2]) && printable(bprm->buf[3])) return retval; From 80c74f6a40284c5c5d49f3b3289172bbce0b30b8 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Wed, 11 Sep 2013 14:24:47 -0700 Subject: [PATCH 235/303] kexec: remove unnecessary return Code can never run past the if/else (both branches return), so remove the unnecessary return.
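The pattern is easy to see in isolation (a simplified sketch with stub parsers, not the kexec code itself): when both branches of an if/else return, a trailing return can never execute:

	static int parse_mem(const char *s)    { return 1; }	/* stub */
	static int parse_simple(const char *s) { return 2; }	/* stub */

	static int parse(const char *s, int has_colon)
	{
		if (has_colon)
			return parse_mem(s);
		else
			return parse_simple(s);
		return 0;	/* unreachable - the kind of return this patch deletes */
	}

Dropping the else keyword and returning from the second call directly leaves a single, obvious fall-through exit.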
Signed-off-by: Xishi Qiu Suggested-by: Zhang Yanfei Reviewed-by: Simon Horman Reviewed-by: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/kernel/kexec.c b/kernel/kexec.c index 59f7b55ba745..2a74f307c5ec 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1474,11 +1474,8 @@ static int __init __parse_crashkernel(char *cmdline, if (first_colon && (!first_space || first_colon < first_space)) return parse_crashkernel_mem(ck_cmdline, system_ram, crash_size, crash_base); - else - return parse_crashkernel_simple(ck_cmdline, crash_size, - crash_base); - return 0; + return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); } /* From be8a8d069e508d4408125e2b1471f549e7813d25 Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Wed, 11 Sep 2013 14:24:49 -0700 Subject: [PATCH 236/303] vmcore: introduce ELF header in new memory feature For s390 we want to use /proc/vmcore for our SCSI stand-alone dump (zfcpdump). We have support where the first HSA_SIZE bytes are saved into a hypervisor-owned memory area (HSA) before the kdump kernel is booted. When the kdump kernel starts, it is restricted to use only HSA_SIZE bytes. The advantages of this mechanism are: * No crashkernel memory has to be defined in the old kernel. * Early boot problems (before kexec_load has been done) can be dumped. * Non-Linux systems can be dumped. We modify the s390 copy_oldmem_page() function to read from the HSA memory if memory below HSA_SIZE bytes is requested. Since we cannot use the kexec tool to load the kernel in this scenario, we have to build the ELF header in the 2nd (kdump/new) kernel. So with the following patch set we would like to introduce a new mechanism that allows the ELF header for /proc/vmcore to be created in 2nd kernel memory. The following steps are done during zfcpdump execution: 1. Production system crashes 2. User boots a SCSI disk that has been prepared with the zfcpdump tool 3. Hypervisor saves CPU state of boot CPU and HSA_SIZE bytes of memory into HSA 4. Boot loader loads kernel into low memory area 5. Kernel boots and uses only HSA_SIZE bytes of memory 6. Kernel saves registers of non-boot CPUs 7. Kernel does memory detection for dump memory map 8. Kernel creates ELF header for /proc/vmcore 9. /proc/vmcore uses this header for initialization 10. The zfcpdump user space reads /proc/vmcore to write dump to SCSI disk - copy_oldmem_page() copies from HSA for memory below HSA_SIZE - copy_oldmem_page() copies from real memory for memory above HSA_SIZE Currently for s390 we create the ELF core header in the 2nd kernel with a small trick. We relocate the addresses in the ELF header in a way that, to the /proc/vmcore code, the header seems to be in the 1st kernel (old) memory, and read_from_oldmem() returns the correct data. This allows the /proc/vmcore code to use the ELF header in the 2nd kernel. This patch: Exchange the old mechanism with the new and much cleaner function call override feature that now officially allows creating the ELF core header in the 2nd kernel.
To use the new feature, the following functions have to be defined by the architecture backend code to read from new memory: * elfcorehdr_alloc: Allocate ELF header * elfcorehdr_free: Free the memory of the ELF header * elfcorehdr_read: Read from ELF header * elfcorehdr_read_notes: Read from ELF notes Signed-off-by: Michael Holzheu Acked-by: Vivek Goyal Cc: HATAYAMA Daisuke Cc: Jan Willeke Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/vmcore.c | 61 +++++++++++++++++++++++++++++++------- include/linux/crash_dump.h | 6 ++++ 2 files changed, 57 insertions(+), 10 deletions(-) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index a1a16eb97c7b..02cb3ff108bc 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -123,6 +123,36 @@ static ssize_t read_from_oldmem(char *buf, size_t count, return read; } +/* + * Architectures may override this function to allocate ELF header in 2nd kernel + */ +int __weak elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size) +{ + return 0; +} + +/* + * Architectures may override this function to free header + */ +void __weak elfcorehdr_free(unsigned long long addr) +{} + +/* + * Architectures may override this function to read from ELF header + */ +ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) +{ + return read_from_oldmem(buf, count, ppos, 0); +} + +/* + * Architectures may override this function to read from notes sections + */ +ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos) +{ + return read_from_oldmem(buf, count, ppos, 0); +} + /* Read from the ELF header and then the crash dump. On error, negative value is * returned otherwise number of bytes read are returned. */ @@ -357,7 +387,7 @@ static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr) notes_section = kmalloc(max_sz, GFP_KERNEL); if (!notes_section) return -ENOMEM; - rc = read_from_oldmem(notes_section, max_sz, &offset, 0); + rc = elfcorehdr_read_notes(notes_section, max_sz, &offset); if (rc < 0) { kfree(notes_section); return rc; } @@ -444,7 +474,8 @@ static int __init copy_notes_elf64(const Elf64_Ehdr *ehdr_ptr, char *notes_buf) if (phdr_ptr->p_type != PT_NOTE) continue; offset = phdr_ptr->p_offset; - rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, &offset, 0); + rc = elfcorehdr_read_notes(notes_buf, phdr_ptr->p_memsz, + &offset); if (rc < 0) return rc; notes_buf += phdr_ptr->p_memsz; @@ -536,7 +567,7 @@ static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr) notes_section = kmalloc(max_sz, GFP_KERNEL); if (!notes_section) return -ENOMEM; - rc = read_from_oldmem(notes_section, max_sz, &offset, 0); + rc = elfcorehdr_read_notes(notes_section, max_sz, &offset); if (rc < 0) { kfree(notes_section); return rc; } @@ -623,7 +654,8 @@ static int __init copy_notes_elf32(const Elf32_Ehdr *ehdr_ptr, char *notes_buf) if (phdr_ptr->p_type != PT_NOTE) continue; offset = phdr_ptr->p_offset; - rc = read_from_oldmem(notes_buf, phdr_ptr->p_memsz, &offset, 0); + rc = elfcorehdr_read_notes(notes_buf, phdr_ptr->p_memsz, + &offset); if (rc < 0) return rc; notes_buf += phdr_ptr->p_memsz; @@ -810,7 +842,7 @@ static int __init parse_crash_elf64_headers(void) addr = elfcorehdr_addr; /* Read Elf header */ - rc = read_from_oldmem((char*)&ehdr, sizeof(Elf64_Ehdr), &addr, 0); + rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf64_Ehdr), &addr); if (rc < 0) return rc; @@ -837,7 +869,7 @@ static int __init parse_crash_elf64_headers(void) if (!elfcorebuf) return -ENOMEM; addr = elfcorehdr_addr; - rc =
read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, &addr, 0); + rc = elfcorehdr_read(elfcorebuf, elfcorebuf_sz_orig, &addr); if (rc < 0) goto fail; @@ -866,7 +898,7 @@ static int __init parse_crash_elf32_headers(void) addr = elfcorehdr_addr; /* Read Elf header */ - rc = read_from_oldmem((char*)&ehdr, sizeof(Elf32_Ehdr), &addr, 0); + rc = elfcorehdr_read((char *)&ehdr, sizeof(Elf32_Ehdr), &addr); if (rc < 0) return rc; @@ -892,7 +924,7 @@ static int __init parse_crash_elf32_headers(void) if (!elfcorebuf) return -ENOMEM; addr = elfcorehdr_addr; - rc = read_from_oldmem(elfcorebuf, elfcorebuf_sz_orig, &addr, 0); + rc = elfcorehdr_read(elfcorebuf, elfcorebuf_sz_orig, &addr); if (rc < 0) goto fail; @@ -919,7 +951,7 @@ static int __init parse_crash_elf_headers(void) int rc=0; addr = elfcorehdr_addr; - rc = read_from_oldmem(e_ident, EI_NIDENT, &addr, 0); + rc = elfcorehdr_read(e_ident, EI_NIDENT, &addr); if (rc < 0) return rc; if (memcmp(e_ident, ELFMAG, SELFMAG) != 0) { @@ -952,7 +984,14 @@ static int __init vmcore_init(void) { int rc = 0; - /* If elfcorehdr= has been passed in cmdline, then capture the dump.*/ + /* Allow architectures to allocate ELF header in 2nd kernel */ + rc = elfcorehdr_alloc(&elfcorehdr_addr, &elfcorehdr_size); + if (rc) + return rc; + /* + * If elfcorehdr= has been passed in cmdline or created in 2nd kernel, + * then capture the dump. + */ if (!(is_vmcore_usable())) return rc; rc = parse_crash_elf_headers(); @@ -960,6 +999,8 @@ static int __init vmcore_init(void) pr_warn("Kdump: vmcore not initialized\n"); return rc; } + elfcorehdr_free(elfcorehdr_addr); + elfcorehdr_addr = ELFCORE_ADDR_ERR; proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations); if (proc_vmcore) diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 37e4f8da7cdf..6571f828e313 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -12,6 +12,12 @@ extern unsigned long long elfcorehdr_addr; extern unsigned long long elfcorehdr_size; +extern int __weak elfcorehdr_alloc(unsigned long long *addr, + unsigned long long *size); +extern void __weak elfcorehdr_free(unsigned long long addr); +extern ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos); +extern ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos); + extern ssize_t copy_oldmem_page(unsigned long, char *, size_t, unsigned long, int); From 97b0f6f9cd73ff8285835c5e295d3c4b0e2dbf78 Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Wed, 11 Sep 2013 14:24:50 -0700 Subject: [PATCH 237/303] s390/vmcore: use ELF header in new memory feature Exchange the old relocate mechanism with the new arch function call override mechanism that allows the ELF core header to be created in the 2nd kernel.
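The override mechanism is plain weak linkage: fs/proc/vmcore.c provides __weak defaults, and an architecture that defines a strong symbol of the same name wins at link time. Roughly, in two separate translation units (a hedged sketch with kernel-style types; arch_hdr_buf is a hypothetical buffer, not the actual s390 implementation):

	/* generic code: weak fallback, reads via the old-memory interface */
	ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
	{
		return read_from_oldmem(buf, count, ppos, 0);
	}

	/* arch code: strong definition replaces the fallback at link time */
	ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos)
	{
		memcpy(buf, arch_hdr_buf + *ppos, count); /* header in new memory */
		*ppos += count;
		return count;
	}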
Signed-off-by: Michael Holzheu Cc: HATAYAMA Daisuke Cc: Jan Willeke Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/kernel/crash_dump.c | 89 +++++++++++++++++++++++------------ 1 file changed, 58 insertions(+), 31 deletions(-) diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index d8f355657171..0c9a897a1fb5 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -63,6 +63,11 @@ static ssize_t copy_page_real(void *buf, void *src, size_t csize) } } +/* + * Pointer to ELF header in new kernel + */ +static void *elfcorehdr_newmem; + /* * Copy one page from "oldmem" * @@ -367,14 +372,6 @@ static int get_mem_chunk_cnt(void) return cnt; } -/* - * Relocate pointer in order to allow vmcore code access the data - */ -static inline unsigned long relocate(unsigned long addr) -{ - return OLDMEM_BASE + addr; -} - /* * Initialize ELF loads (new kernel) */ @@ -426,7 +423,7 @@ static void *notes_init(Elf64_Phdr *phdr, void *ptr, u64 notes_offset) ptr = nt_vmcoreinfo(ptr); memset(phdr, 0, sizeof(*phdr)); phdr->p_type = PT_NOTE; - phdr->p_offset = relocate(notes_offset); + phdr->p_offset = notes_offset; phdr->p_filesz = (unsigned long) PTR_SUB(ptr, ptr_start); phdr->p_memsz = phdr->p_filesz; return ptr; @@ -435,7 +432,7 @@ static void *notes_init(Elf64_Phdr *phdr, void *ptr, u64 notes_offset) /* * Create ELF core header (new kernel) */ -static void s390_elf_corehdr_create(char **elfcorebuf, size_t *elfcorebuf_sz) +int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size) { Elf64_Phdr *phdr_notes, *phdr_loads; int mem_chunk_cnt; @@ -443,6 +440,11 @@ static void s390_elf_corehdr_create(char **elfcorebuf, size_t *elfcorebuf_sz) u32 alloc_size; u64 hdr_off; + if (!OLDMEM_BASE) + return 0; + /* If elfcorehdr= has been passed via cmdline, we use that one */ + if (elfcorehdr_addr != ELFCORE_ADDR_MAX) + return 0; mem_chunk_cnt = get_mem_chunk_cnt(); alloc_size = 0x1000 + get_cpu_cnt() * 0x300 + @@ -460,27 +462,52 @@ static void s390_elf_corehdr_create(char **elfcorebuf, size_t *elfcorebuf_sz) ptr = notes_init(phdr_notes, ptr, ((unsigned long) hdr) + hdr_off); /* Init loads */ hdr_off = PTR_DIFF(ptr, hdr); - loads_init(phdr_loads, ((unsigned long) hdr) + hdr_off); - *elfcorebuf_sz = hdr_off; - *elfcorebuf = (void *) relocate((unsigned long) hdr); - BUG_ON(*elfcorebuf_sz > alloc_size); -} - -/* - * Create kdump ELF core header in new kernel, if it has not been passed via - * the "elfcorehdr" kernel parameter - */ -static int setup_kdump_elfcorehdr(void) -{ - size_t elfcorebuf_sz; - char *elfcorebuf; - - if (!OLDMEM_BASE || is_kdump_kernel()) - return -EINVAL; - s390_elf_corehdr_create(&elfcorebuf, &elfcorebuf_sz); - elfcorehdr_addr = (unsigned long long) elfcorebuf; - elfcorehdr_size = elfcorebuf_sz; + loads_init(phdr_loads, hdr_off); + *addr = (unsigned long long) hdr; + elfcorehdr_newmem = hdr; + *size = (unsigned long long) hdr_off; + BUG_ON(elfcorehdr_size > alloc_size); return 0; } -subsys_initcall(setup_kdump_elfcorehdr); +/* + * Free ELF core header (new kernel) + */ +void elfcorehdr_free(unsigned long long addr) +{ + if (!elfcorehdr_newmem) + return; + kfree((void *)(unsigned long)addr); +} + +/* + * Read from ELF header + */ +ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos) +{ + void *src = (void *)(unsigned long)*ppos; + + src = elfcorehdr_newmem ? 
src : src - OLDMEM_BASE; + memcpy(buf, src, count); + *ppos += count; + return count; +} + +/* + * Read from ELF notes data + */ +ssize_t elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos) +{ + void *src = (void *)(unsigned long)*ppos; + int rc; + + if (elfcorehdr_newmem) { + memcpy(buf, src, count); + } else { + rc = copy_from_oldmem(buf, src, count); + if (rc) + return rc; + } + *ppos += count; + return count; +} From 9cb218131de1c59dca9063b2efe876f053f316af Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Wed, 11 Sep 2013 14:24:51 -0700 Subject: [PATCH 238/303] vmcore: introduce remap_oldmem_pfn_range() For zfcpdump we can't map the HSA storage because it is only available via a read interface. Therefore, for the new vmcore mmap feature we have introduced a new mechanism to create mappings on demand. This patch introduces a new architecture function remap_oldmem_pfn_range() that should be used to create mappings with remap_pfn_range() for oldmem areas that can be directly mapped. For zfcpdump this is everything besides the HSA memory. For the areas that are not mapped by remap_oldmem_pfn_range(), a new generic vmcore fault handler, mmap_vmcore_fault(), is called. This handler works as follows: * Get already available or new page from page cache (find_or_create_page) * Check if /proc/vmcore page is filled with data (PageUptodate) * If yes: Return that page * If no: Fill page using __vmcore_read(), set PageUptodate, and return page Signed-off-by: Michael Holzheu Acked-by: Vivek Goyal Cc: HATAYAMA Daisuke Cc: Jan Willeke Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/vmcore.c | 91 ++++++++++++++++++++++++++++++++++---- include/linux/crash_dump.h | 3 ++ 2 files changed, 86 insertions(+), 8 deletions(-) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 02cb3ff108bc..d07b70a6eed5 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include "internal.h" @@ -153,11 +154,35 @@ ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos) return read_from_oldmem(buf, count, ppos, 0); } +/* + * Architectures may override this function to map oldmem + */ +int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma, + unsigned long from, unsigned long pfn, + unsigned long size, pgprot_t prot) +{ + return remap_pfn_range(vma, from, pfn, size, prot); +} + +/* + * Copy to either kernel or user space + */ +static int copy_to(void *target, void *src, size_t size, int userbuf) +{ + if (userbuf) { + if (copy_to_user((char __user *) target, src, size)) + return -EFAULT; + } else { + memcpy(target, src, size); + } + return 0; +} + /* Read from the ELF header and then the crash dump. On error, negative value is * returned otherwise number of bytes read are returned.
*/ -static ssize_t read_vmcore(struct file *file, char __user *buffer, - size_t buflen, loff_t *fpos) +static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos, + int userbuf) { ssize_t acc = 0, tmp; size_t tsz; @@ -174,7 +199,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, /* Read ELF core header */ if (*fpos < elfcorebuf_sz) { tsz = min(elfcorebuf_sz - (size_t)*fpos, buflen); - if (copy_to_user(buffer, elfcorebuf + *fpos, tsz)) + if (copy_to(buffer, elfcorebuf + *fpos, tsz, userbuf)) return -EFAULT; buflen -= tsz; *fpos += tsz; @@ -192,7 +217,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)*fpos, buflen); kaddr = elfnotes_buf + *fpos - elfcorebuf_sz; - if (copy_to_user(buffer, kaddr, tsz)) + if (copy_to(buffer, kaddr, tsz, userbuf)) return -EFAULT; buflen -= tsz; *fpos += tsz; @@ -208,7 +233,7 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, if (*fpos < m->offset + m->size) { tsz = min_t(size_t, m->offset + m->size - *fpos, buflen); start = m->paddr + *fpos - m->offset; - tmp = read_from_oldmem(buffer, tsz, &start, 1); + tmp = read_from_oldmem(buffer, tsz, &start, userbuf); if (tmp < 0) return tmp; buflen -= tsz; @@ -225,6 +250,55 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, return acc; } +static ssize_t read_vmcore(struct file *file, char __user *buffer, + size_t buflen, loff_t *fpos) +{ + return __read_vmcore((__force char *) buffer, buflen, fpos, 1); +} + +/* + * The vmcore fault handler uses the page cache and fills data using the + * standard __vmcore_read() function. + * + * On s390 the fault handler is used for memory regions that can't be mapped + * directly with remap_pfn_range(). + */ +static int mmap_vmcore_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ +#ifdef CONFIG_S390 + struct address_space *mapping = vma->vm_file->f_mapping; + pgoff_t index = vmf->pgoff; + struct page *page; + loff_t offset; + char *buf; + int rc; + + page = find_or_create_page(mapping, index, GFP_KERNEL); + if (!page) + return VM_FAULT_OOM; + if (!PageUptodate(page)) { + offset = (loff_t) index << PAGE_CACHE_SHIFT; + buf = __va((page_to_pfn(page) << PAGE_SHIFT)); + rc = __read_vmcore(buf, PAGE_SIZE, &offset, 0); + if (rc < 0) { + unlock_page(page); + page_cache_release(page); + return (rc == -ENOMEM) ? 
VM_FAULT_OOM : VM_FAULT_SIGBUS; + } + SetPageUptodate(page); + } + unlock_page(page); + vmf->page = page; + return 0; +#else + return VM_FAULT_SIGBUS; +#endif +} + +static const struct vm_operations_struct vmcore_mmap_ops = { + .fault = mmap_vmcore_fault, +}; + /** * alloc_elfnotes_buf - allocate buffer for ELF note segment in * vmalloc memory @@ -271,6 +345,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) vma->vm_flags &= ~(VM_MAYWRITE | VM_MAYEXEC); vma->vm_flags |= VM_MIXEDMAP; + vma->vm_ops = &vmcore_mmap_ops; len = 0; @@ -312,9 +387,9 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) tsz = min_t(size_t, m->offset + m->size - start, size); paddr = m->paddr + start - m->offset; - if (remap_pfn_range(vma, vma->vm_start + len, - paddr >> PAGE_SHIFT, tsz, - vma->vm_page_prot)) + if (remap_oldmem_pfn_range(vma, vma->vm_start + len, + paddr >> PAGE_SHIFT, tsz, + vma->vm_page_prot)) goto fail; size -= tsz; start += tsz; diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 6571f828e313..fe68a5a98583 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -17,6 +17,9 @@ extern int __weak elfcorehdr_alloc(unsigned long long *addr, extern void __weak elfcorehdr_free(unsigned long long addr); extern ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos); extern ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos); +extern int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma, + unsigned long from, unsigned long pfn, + unsigned long size, pgprot_t prot); extern ssize_t copy_oldmem_page(unsigned long, char *, size_t, unsigned long, int); From 23df79da8eb97757e39af7625665c1c5cecc610b Mon Sep 17 00:00:00 2001 From: Jan Willeke Date: Wed, 11 Sep 2013 14:24:52 -0700 Subject: [PATCH 239/303] s390/vmcore: implement remap_oldmem_pfn_range for s390 Introduce the s390 specific way to map pages from oldmem. The memory area below OLDMEM_SIZE is mapped with offset OLDMEM_BASE. The other old memory is mapped directly. Signed-off-by: Jan Willeke Signed-off-by: Michael Holzheu Cc: HATAYAMA Daisuke Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/kernel/crash_dump.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index 0c9a897a1fb5..3e776158b330 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -98,6 +98,32 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, return (rc == 0) ? 
csize : rc; } +/* + * Remap "oldmem" + * + * For the kdump reserved memory this functions performs a swap operation: + * [0 - OLDMEM_SIZE] is mapped to [OLDMEM_BASE - OLDMEM_BASE + OLDMEM_SIZE] + */ +int remap_oldmem_pfn_range(struct vm_area_struct *vma, unsigned long from, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + unsigned long size_old; + int rc; + + if (pfn < OLDMEM_SIZE >> PAGE_SHIFT) { + size_old = min(size, OLDMEM_SIZE - (pfn << PAGE_SHIFT)); + rc = remap_pfn_range(vma, from, + pfn + (OLDMEM_BASE >> PAGE_SHIFT), + size_old, prot); + if (rc || size == size_old) + return rc; + size -= size_old; + from += size_old; + pfn += size_old >> PAGE_SHIFT; + } + return remap_pfn_range(vma, from, pfn, size, prot); +} + /* * Copy memory from old kernel */ From 11e376a3f9ffa85bf444b65df5326612b083c501 Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Wed, 11 Sep 2013 14:24:53 -0700 Subject: [PATCH 240/303] vmcore: enable /proc/vmcore mmap for s390 The patch "s390/vmcore: Implement remap_oldmem_pfn_range for s390" now allows mmap to be used on s390 as well. So enable mmap for s390 again. Signed-off-by: Michael Holzheu Cc: HATAYAMA Daisuke Cc: Jan Willeke Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/vmcore.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index d07b70a6eed5..9100d6959886 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -327,7 +327,7 @@ static inline char *alloc_elfnotes_buf(size_t notes_sz) * regions in the 1st kernel pointed to by PT_LOAD entries) into * virtually contiguous user-space in ELF layout. */ -#if defined(CONFIG_MMU) && !defined(CONFIG_S390) +#ifdef CONFIG_MMU static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) { size_t size = vma->vm_end - vma->vm_start; From 6f79d33228fa7cf900826738a39f287cae96cd91 Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Wed, 11 Sep 2013 14:24:54 -0700 Subject: [PATCH 241/303] s390/vmcore: use vmcore for zfcpdump Modify the s390 copy_oldmem_page() and remap_oldmem_pfn_range() functions for zfcpdump to read from the HSA memory if memory below HSA_SIZE bytes is requested. Otherwise real memory is used. Signed-off-by: Michael Holzheu Cc: HATAYAMA Daisuke Cc: Jan Willeke Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/Kconfig | 3 +- arch/s390/include/asm/sclp.h | 1 + arch/s390/kernel/crash_dump.c | 122 +++++++++++++++++++++++++++++----- drivers/s390/char/zcore.c | 6 +- 4 files changed, 110 insertions(+), 22 deletions(-) diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index fb2723e8ba65..3ec272859e1e 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -526,6 +526,7 @@ config CRASH_DUMP bool "kernel crash dumps" depends on 64BIT && SMP select KEXEC + select ZFCPDUMP help Generate crash dump after being started by kexec. Crash dump kernels are loaded in the main kernel with kexec-tools @@ -536,7 +537,7 @@ config CRASH_DUMP config ZFCPDUMP def_bool n prompt "zfcpdump support" - select SMP + depends on SMP help Select this option if you want to build an zfcpdump enabled kernel. Refer to for more details on this.
diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index 06a136136047..7dc7f9c63b65 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -56,5 +56,6 @@ bool sclp_has_linemode(void); bool sclp_has_vt220(void); int sclp_pci_configure(u32 fid); int sclp_pci_deconfigure(u32 fid); +int memcpy_hsa(void *dest, unsigned long src, size_t count, int mode); #endif /* _ASM_S390_SCLP_H */ diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index 3e776158b330..c84f33d51f7b 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -16,6 +16,7 @@ #include #include #include +#include #define PTR_ADD(x, y) (((char *) (x)) + ((unsigned long) (y))) #define PTR_SUB(x, y) (((char *) (x)) - ((unsigned long) (y))) @@ -69,22 +70,41 @@ static ssize_t copy_page_real(void *buf, void *src, size_t csize) static void *elfcorehdr_newmem; /* - * Copy one page from "oldmem" + * Copy one page from zfcpdump "oldmem" + * + * For pages below ZFCPDUMP_HSA_SIZE memory from the HSA is copied. Otherwise + * real memory copy is used. + */ +static ssize_t copy_oldmem_page_zfcpdump(char *buf, size_t csize, + unsigned long src, int userbuf) +{ + int rc; + + if (src < ZFCPDUMP_HSA_SIZE) { + rc = memcpy_hsa(buf, src, csize, userbuf); + } else { + if (userbuf) + rc = copy_to_user_real((void __force __user *) buf, + (void *) src, csize); + else + rc = memcpy_real(buf, (void *) src, csize); + } + return rc ? rc : csize; +} + +/* + * Copy one page from kdump "oldmem" * * For the kdump reserved memory this functions performs a swap operation: * - [OLDMEM_BASE - OLDMEM_BASE + OLDMEM_SIZE] is mapped to [0 - OLDMEM_SIZE]. * - [0 - OLDMEM_SIZE] is mapped to [OLDMEM_BASE - OLDMEM_BASE + OLDMEM_SIZE] */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, int userbuf) +static ssize_t copy_oldmem_page_kdump(char *buf, size_t csize, + unsigned long src, int userbuf) + { - unsigned long src; int rc; - if (!csize) - return 0; - - src = (pfn << PAGE_SHIFT) + offset; if (src < OLDMEM_SIZE) src += OLDMEM_BASE; else if (src > OLDMEM_BASE && @@ -95,17 +115,35 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, (void *) src, csize); else rc = copy_page_real(buf, (void *) src, csize); - return (rc == 0) ? csize : rc; + return (rc == 0) ? rc : csize; } /* - * Remap "oldmem" + * Copy one page from "oldmem" + */ +ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, + unsigned long offset, int userbuf) +{ + unsigned long src; + + if (!csize) + return 0; + src = (pfn << PAGE_SHIFT) + offset; + if (OLDMEM_BASE) + return copy_oldmem_page_kdump(buf, csize, src, userbuf); + else + return copy_oldmem_page_zfcpdump(buf, csize, src, userbuf); +} + +/* + * Remap "oldmem" for kdump * * For the kdump reserved memory this functions performs a swap operation: * [0 - OLDMEM_SIZE] is mapped to [OLDMEM_BASE - OLDMEM_BASE + OLDMEM_SIZE] */ -int remap_oldmem_pfn_range(struct vm_area_struct *vma, unsigned long from, - unsigned long pfn, unsigned long size, pgprot_t prot) +static int remap_oldmem_pfn_range_kdump(struct vm_area_struct *vma, + unsigned long from, unsigned long pfn, + unsigned long size, pgprot_t prot) { unsigned long size_old; int rc; @@ -124,6 +162,43 @@ int remap_oldmem_pfn_range(struct vm_area_struct *vma, unsigned long from, return remap_pfn_range(vma, from, pfn, size, prot); } +/* + * Remap "oldmem" for zfcpdump + * + * We only map available memory above ZFCPDUMP_HSA_SIZE. 
Memory below + * ZFCPDUMP_HSA_SIZE is read on demand using the copy_oldmem_page() function. + */ +static int remap_oldmem_pfn_range_zfcpdump(struct vm_area_struct *vma, + unsigned long from, + unsigned long pfn, + unsigned long size, pgprot_t prot) +{ + unsigned long size_hsa; + + if (pfn < ZFCPDUMP_HSA_SIZE >> PAGE_SHIFT) { + size_hsa = min(size, ZFCPDUMP_HSA_SIZE - (pfn << PAGE_SHIFT)); + if (size == size_hsa) + return 0; + size -= size_hsa; + from += size_hsa; + pfn += size_hsa >> PAGE_SHIFT; + } + return remap_pfn_range(vma, from, pfn, size, prot); +} + +/* + * Remap "oldmem" for kdump or zfcpdump + */ +int remap_oldmem_pfn_range(struct vm_area_struct *vma, unsigned long from, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + if (OLDMEM_BASE) + return remap_oldmem_pfn_range_kdump(vma, from, pfn, size, prot); + else + return remap_oldmem_pfn_range_zfcpdump(vma, from, pfn, size, + prot); +} + /* * Copy memory from old kernel */ @@ -132,11 +207,21 @@ int copy_from_oldmem(void *dest, void *src, size_t count) unsigned long copied = 0; int rc; - if ((unsigned long) src < OLDMEM_SIZE) { - copied = min(count, OLDMEM_SIZE - (unsigned long) src); - rc = memcpy_real(dest, src + OLDMEM_BASE, copied); - if (rc) - return rc; + if (OLDMEM_BASE) { + if ((unsigned long) src < OLDMEM_SIZE) { + copied = min(count, OLDMEM_SIZE - (unsigned long) src); + rc = memcpy_real(dest, src + OLDMEM_BASE, copied); + if (rc) + return rc; + } + } else { + if ((unsigned long) src < ZFCPDUMP_HSA_SIZE) { + copied = min(count, + ZFCPDUMP_HSA_SIZE - (unsigned long) src); + rc = memcpy_hsa(dest, (unsigned long) src, copied, 0); + if (rc) + return rc; + } } return memcpy_real(dest + copied, src + copied, count - copied); } @@ -466,7 +551,8 @@ int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size) u32 alloc_size; u64 hdr_off; - if (!OLDMEM_BASE) + /* If we are not in kdump or zfcpdump mode return */ + if (!OLDMEM_BASE && ipl_info.type != IPL_TYPE_FCP_DUMP) return 0; /* If elfcorehdr= has been passed via cmdline, we use that one */ if (elfcorehdr_addr != ELFCORE_ADDR_MAX) diff --git a/drivers/s390/char/zcore.c b/drivers/s390/char/zcore.c index 9e5e14686e75..794820a123d0 100644 --- a/drivers/s390/char/zcore.c +++ b/drivers/s390/char/zcore.c @@ -30,8 +30,8 @@ #define TRACE(x...) debug_sprintf_event(zcore_dbf, 1, x) -#define TO_USER 0 -#define TO_KERNEL 1 +#define TO_USER 1 +#define TO_KERNEL 0 #define CHUNK_INFO_SIZE 34 /* 2 16-byte char, each followed by blank */ enum arch_id { @@ -73,7 +73,7 @@ static struct ipl_parameter_block *ipl_block; * @count: Size of buffer, which should be copied * @mode: Either TO_KERNEL or TO_USER */ -static int memcpy_hsa(void *dest, unsigned long src, size_t count, int mode) +int memcpy_hsa(void *dest, unsigned long src, size_t count, int mode) { int offs, blk_num; static char buf[PAGE_SIZE] __attribute__((__aligned__(PAGE_SIZE))); From c2ebdc2439f50c049fd362bb225aaf78fe8e4cb8 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:24:55 -0700 Subject: [PATCH 242/303] partitions/efi: use lba-aware partition records The kernel's GPT implementation currently uses the generic 'struct partition' type for dealing with legacy MBR partition records. While this is useful for disklabels that were designed for CHS addressing, such as msdos, it doesn't adapt well to newer standards that use LBA instead, such as GUID partition tables. Furthermore, these generic partition structures do not have all the required fields to properly follow the UEFI specs.
While a CHS address can be translated to LBA, it's much simpler and cleaner to just replace the partition type. This patch adds a new 'gpt_record' type that is fully compliant with EFI and will allow, in the next patches, to add more checks to properly verify a protective MBR, which is paramount to probing a device that makes use of GPT. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Davidlohr Bueso Reviewed-by: Karel Zak Acked-by: Matt Fleming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/partitions/efi.c | 9 ++++----- block/partitions/efi.h | 16 +++++++++++++++- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/block/partitions/efi.c b/block/partitions/efi.c index c85fc895ecdb..bd8fb22b2109 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -149,12 +149,11 @@ static u64 last_lba(struct block_device *bdev) bdev_logical_block_size(bdev)) - 1ULL; } -static inline int -pmbr_part_valid(struct partition *part) +static inline int pmbr_part_valid(gpt_mbr_record *part) { - if (part->sys_ind == EFI_PMBR_OSTYPE_EFI_GPT && - le32_to_cpu(part->start_sect) == 1UL) - return 1; + if (part->os_type == EFI_PMBR_OSTYPE_EFI_GPT && + le32_to_cpu(part->start_sector) == 1UL) + return 1; return 0; } diff --git a/block/partitions/efi.h b/block/partitions/efi.h index b69ab729558f..e645ecb35bf3 100644 --- a/block/partitions/efi.h +++ b/block/partitions/efi.h @@ -101,11 +101,25 @@ typedef struct _gpt_entry { efi_char16_t partition_name[72 / sizeof (efi_char16_t)]; } __attribute__ ((packed)) gpt_entry; +typedef struct _gpt_mbr_record { + u8 boot_indicator; /* unused by EFI, set to 0x80 for bootable */ + u8 start_head; /* unused by EFI, pt start in CHS */ + u8 start_sector; /* unused by EFI, pt start in CHS */ + u8 start_track; + u8 os_type; /* EFI and legacy non-EFI OS types */ + u8 end_head; /* unused by EFI, pt end in CHS */ + u8 end_sector; /* unused by EFI, pt end in CHS */ + u8 end_track; /* unused by EFI, pt end in CHS */ + __le32 starting_lba; /* used by EFI - start addr of the on disk pt */ + __le32 size_in_lba; /* used by EFI - size of pt in LBA */ +} __packed gpt_mbr_record; + + typedef struct _legacy_mbr { u8 boot_code[440]; __le32 unique_mbr_signature; __le16 unknown; - struct partition partition_record[4]; + gpt_mbr_record partition_record[4]; __le16 signature; } __attribute__ ((packed)) legacy_mbr; From 33afd7a7df1a1f82675857a75572cdf4a8599e9f Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:24:56 -0700 Subject: [PATCH 243/303] partitions/efi: check pmbr record's starting lba Per the UEFI Specs 2.4, June 2013, the starting lba of the partition that has the EFI GPT (0xEE) must be set to 0x00000001 - this is obviously the LBA of the GPT Partition Header. 
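For reference, both conditions can be checked from userspace against a raw first sector; a hedged sketch (offsets follow the classic MBR layout; a little-endian host is assumed):

	#include <stdint.h>
	#include <string.h>

	#define PART_TABLE_OFF	446	/* first partition record in the MBR */
	#define PART_REC_SIZE	16

	/* returns 1 if 'sector' (512 bytes) looks like a protective MBR */
	int pmbr_looks_protective(const uint8_t *sector)
	{
		int i;

		if (sector[510] != 0x55 || sector[511] != 0xAA)
			return 0;			/* no MSDOS signature */
		for (i = 0; i < 4; i++) {
			const uint8_t *p = sector + PART_TABLE_OFF + i * PART_REC_SIZE;
			uint32_t starting_lba;

			memcpy(&starting_lba, p + 8, 4);	/* little-endian field */
			if (p[4] == 0xEE && starting_lba == 1)
				return 1;	/* 0xEE record, GPT header at LBA 1 */
		}
		return 0;
	}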
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Davidlohr Bueso Reviewed-by: Karel Zak Acked-by: Matt Fleming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/partitions/efi.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/block/partitions/efi.c b/block/partitions/efi.c index bd8fb22b2109..7a2b74f0d06f 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -151,10 +151,19 @@ static u64 last_lba(struct block_device *bdev) static inline int pmbr_part_valid(gpt_mbr_record *part) { - if (part->os_type == EFI_PMBR_OSTYPE_EFI_GPT && - le32_to_cpu(part->start_sector) == 1UL) - return 1; - return 0; + if (part->os_type != EFI_PMBR_OSTYPE_EFI_GPT) + goto invalid; + + /* set to 0x00000001 (i.e., the LBA of the GPT Partition Header) */ + if (le32_to_cpu(part->starting_lba) != GPT_PRIMARY_PARTITION_TABLE_LBA) + goto invalid; + + if (le32_to_cpu(part->start_sector) != 1UL) + goto invalid; + + return 1; +invalid: + return 0; } /** From 3e69ac344007bec5e3987ac86619e140fbc79b72 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:24:57 -0700 Subject: [PATCH 244/303] partitions/efi: do not require gpt partition to begin at sector 1 When detecting a valid protective MBR, the Linux kernel isn't picky about the partition (1-4) the 0xEE is at, but, unlike other operating systems, it does require it to begin at the second sector (sector 1). This check, apart from not being enforced by UEFI, can cause Linux to fail to detect otherwise *valid* partitions on the disk, and it presents problems when dealing with hybrid MBRs[1]. For compatibility reasons, if the first partition is hybridized, the 0xEE partition must be small enough to ensure that it only protects the GPT data structures - as opposed to the whole disk in a protective MBR. This problem is very well described by Rod Smith[1]: MBR-only partitioning programs (such as older versions of fdisk) can see some of the disk space as unallocated, thus defeating the purpose of the 0xEE partition's protection of GPT data structures. By dropping this check, this patch enables Linux to be more flexible when probing for GPT disklabels. [1] http://www.rodsbooks.com/gdisk/hybrid.html#reactions Signed-off-by: Davidlohr Bueso Reviewed-by: Karel Zak Acked-by: Matt Fleming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/partitions/efi.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/block/partitions/efi.c b/block/partitions/efi.c index 7a2b74f0d06f..1b499dc8fc78 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -158,9 +158,6 @@ static inline int pmbr_part_valid(gpt_mbr_record *part) if (le32_to_cpu(part->starting_lba) != GPT_PRIMARY_PARTITION_TABLE_LBA) goto invalid; - if (le32_to_cpu(part->start_sector) != 1UL) - goto invalid; - return 1; invalid: return 0; From b05ebbbbeb67a420d06567c6b9618a9e644d6104 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:24:58 -0700 Subject: [PATCH 245/303] partitions/efi: detect hybrid MBRs One of the biggest problems with GPT is compatibility with older, non-GPT systems. The problem is addressed by creating hybrid mbrs, an extension, or variant, of the traditional protective mbr. This contains, apart from the 0xEE partition, up to three additional primary partitions that point to the same space marked by up to three GPT partitions.
The result is that legacy OSs can see the three required MBR partitions and at the same time ignore the GPT-aware partitions that protect the GPT structures. While hybrid MBRs are hacks, workarounds and simply not part of the GPT standard, they do exist and we have no way around them. For instance, by default, OSX creates a hybrid scheme when using multi-OS booting. In order for Linux to properly discover protective MBRs, it must be made aware of devices that have hybrid MBRs. No functionality is changed by this patch, just a debug message informing the user of the MBR scheme that is being used. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Davidlohr Bueso Reviewed-by: Karel Zak Acked-by: Matt Fleming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/partitions/efi.c | 74 ++++++++++++++++++++++++++++++------------ block/partitions/efi.h | 3 ++ 2 files changed, 56 insertions(+), 21 deletions(-) diff --git a/block/partitions/efi.c b/block/partitions/efi.c index 1b499dc8fc78..e3cb4f19cf6d 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -158,7 +158,7 @@ static inline int pmbr_part_valid(gpt_mbr_record *part) if (le32_to_cpu(part->starting_lba) != GPT_PRIMARY_PARTITION_TABLE_LBA) goto invalid; - return 1; + return GPT_MBR_PROTECTIVE; invalid: return 0; } @@ -167,21 +167,48 @@ invalid: * is_pmbr_valid(): test Protective MBR for validity * @mbr: pointer to a legacy mbr structure * - * Description: Returns 1 if PMBR is valid, 0 otherwise. - * Validity depends on two things: + * Description: Checks for a valid protective or hybrid + * master boot record (MBR). The validity of a pMBR depends + * on all of the following properties: * 1) MSDOS signature is in the last two bytes of the MBR * 2) One partition of type 0xEE is found + * + * In addition, a hybrid MBR will have up to three additional + * primary partitions, which point to the same space that's + * marked out by up to three GPT partitions. + * + * Returns 0 upon invalid MBR, or GPT_MBR_PROTECTIVE or + * GPT_MBR_HYBRID depending on the device layout. */ -static int -is_pmbr_valid(legacy_mbr *mbr) +static int is_pmbr_valid(legacy_mbr *mbr) { - int i; + int i, ret = 0; /* invalid by default */ + if (!mbr || le16_to_cpu(mbr->signature) != MSDOS_MBR_SIGNATURE) - return 0; + goto done; + + for (i = 0; i < 4; i++) { + ret = pmbr_part_valid(&mbr->partition_record[i]); + if (ret == GPT_MBR_PROTECTIVE) { + /* + * Ok, we at least know that there's a protective MBR, + * now check if there are other partition types for + * hybrid MBR. + */ + goto check_hybrid; + } + } + + if (ret != GPT_MBR_PROTECTIVE) + goto done; +check_hybrid: for (i = 0; i < 4; i++) - if (pmbr_part_valid(&mbr->partition_record[i])) - return 1; - return 0; + if ((mbr->partition_record[i].os_type != + EFI_PMBR_OSTYPE_EFI_GPT) && + (mbr->partition_record[i].os_type != 0x00)) + ret = GPT_MBR_HYBRID; +done: + return ret; } /** @@ -548,17 +575,22 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, lastlba = last_lba(state->bdev); if (!force_gpt) { - /* This will be added to the EFI Spec. per Intel after v1.02. */ - legacymbr = kzalloc(sizeof (*legacymbr), GFP_KERNEL); - if (legacymbr) { - read_lba(state, 0, (u8 *) legacymbr, - sizeof (*legacymbr)); - good_pmbr = is_pmbr_valid(legacymbr); - kfree(legacymbr); - } - if (!good_pmbr) - goto fail; - } + /* This will be added to the EFI Spec. per Intel after v1.02. 
*/ + legacymbr = kzalloc(sizeof(*legacymbr), GFP_KERNEL); + if (!legacymbr) + goto fail; + + read_lba(state, 0, (u8 *)legacymbr, sizeof(*legacymbr)); + good_pmbr = is_pmbr_valid(legacymbr); + kfree(legacymbr); + + if (!good_pmbr) + goto fail; + + pr_debug("Device has a %s MBR\n", + good_pmbr == GPT_MBR_PROTECTIVE ? + "protective" : "hybrid"); + } good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA, &pgpt, &pptes); diff --git a/block/partitions/efi.h b/block/partitions/efi.h index e645ecb35bf3..7fef625c04de 100644 --- a/block/partitions/efi.h +++ b/block/partitions/efi.h @@ -37,6 +37,9 @@ #define EFI_PMBR_OSTYPE_EFI 0xEF #define EFI_PMBR_OSTYPE_EFI_GPT 0xEE +#define GPT_MBR_PROTECTIVE 1 +#define GPT_MBR_HYBRID 2 + #define GPT_HEADER_SIGNATURE 0x5452415020494645ULL #define GPT_HEADER_REVISION_V1 0x00010000 #define GPT_PRIMARY_PARTITION_TABLE_LBA 1 From 27a7c642174eaec627f6a3a254035bf8abd02c5e Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:25:00 -0700 Subject: [PATCH 246/303] partitions/efi: account for pmbr size in lba The partition that has the 0xEE (GPT protective) OS type must have its size-in-LBA field set to the lesser of the size of the disk minus one, or 0xFFFFFFFF for larger disks. Signed-off-by: Davidlohr Bueso Reviewed-by: Karel Zak Acked-by: Matt Fleming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/partitions/efi.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/block/partitions/efi.c b/block/partitions/efi.c index e3cb4f19cf6d..b028af688361 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -166,6 +166,7 @@ invalid: /** * is_pmbr_valid(): test Protective MBR for validity * @mbr: pointer to a legacy mbr structure + * @total_sectors: amount of sectors in the device * * Description: Checks for a valid protective or hybrid * master boot record (MBR). The validity of a pMBR depends @@ -180,9 +181,9 @@ invalid: * Returns 0 upon invalid MBR, or GPT_MBR_PROTECTIVE or * GPT_MBR_HYBRID depending on the device layout. */ -static int is_pmbr_valid(legacy_mbr *mbr) +static int is_pmbr_valid(legacy_mbr *mbr, sector_t total_sectors) { - int i, ret = 0; /* invalid by default */ + int i, part = 0, ret = 0; /* invalid by default */ if (!mbr || le16_to_cpu(mbr->signature) != MSDOS_MBR_SIGNATURE) goto done; @@ -190,6 +191,7 @@ static int is_pmbr_valid(legacy_mbr *mbr) for (i = 0; i < 4; i++) { ret = pmbr_part_valid(&mbr->partition_record[i]); if (ret == GPT_MBR_PROTECTIVE) { + part = i; /* * Ok, we at least know that there's a protective MBR, * now check if there are other partition types for @@ -207,6 +209,18 @@ check_hybrid: EFI_PMBR_OSTYPE_EFI_GPT) && (mbr->partition_record[i].os_type != 0x00)) ret = GPT_MBR_HYBRID; + + /* + * Protective MBRs take up the lesser of the whole disk + * or 2 TiB (32bit LBA), ignoring the rest of the disk. + * + * Hybrid MBRs do not necessarily comply with this.
+ */ + if (ret == GPT_MBR_PROTECTIVE) { + if (le32_to_cpu(mbr->partition_record[part].size_in_lba) != + min((uint32_t) total_sectors - 1, 0xFFFFFFFF)) + ret = 0; + } done: return ret; } @@ -568,6 +582,7 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, gpt_header *pgpt = NULL, *agpt = NULL; gpt_entry *pptes = NULL, *aptes = NULL; legacy_mbr *legacymbr; + sector_t total_sectors = i_size_read(state->bdev->bd_inode) >> 9; u64 lastlba; if (!ptes) @@ -581,7 +596,7 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, goto fail; read_lba(state, 0, (u8 *)legacymbr, sizeof(*legacymbr)); - good_pmbr = is_pmbr_valid(legacymbr); + good_pmbr = is_pmbr_valid(legacymbr, total_sectors); kfree(legacymbr); if (!good_pmbr) From aa054bc93743ecce3a27f1655d59674dabc71a54 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:25:01 -0700 Subject: [PATCH 247/303] partitions/efi: compare first and last usable LBAs When verifying GPT header integrity, make sure that first usable LBA is smaller than last usable LBA. Signed-off-by: Davidlohr Bueso Reviewed-by: Karel Zak Acked-by: Matt Fleming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/partitions/efi.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/block/partitions/efi.c b/block/partitions/efi.c index b028af688361..de9f9bfa24bc 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -410,7 +410,12 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, (unsigned long long)lastlba); goto fail; } - + if (le64_to_cpu((*gpt)->last_usable_lba) < le64_to_cpu((*gpt)->first_usable_lba)) { + pr_debug("GPT: last_usable_lba incorrect: %lld > %lld\n", + (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba), + (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba)); + goto fail; + } /* Check that sizeof_partition_entry has the correct value */ if (le32_to_cpu((*gpt)->sizeof_partition_entry) != sizeof(gpt_entry)) { pr_debug("GUID Partitition Entry Size check failed.\n"); From 08009b30a71d9a7c252c4bd677dbd496af9dd1a2 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:25:02 -0700 Subject: [PATCH 248/303] partitions/efi: delete annoying emacs style comments I love emacs, but these settings for coding style are annoying when trying to open the efi.h file. More important, we already have checkpatch for that. Signed-off-by: Davidlohr Bueso Reviewed-by: Karel Zak Acked-by: Matt Fleming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/partitions/efi.h | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/block/partitions/efi.h b/block/partitions/efi.h index 7fef625c04de..4efcafba7e64 100644 --- a/block/partitions/efi.h +++ b/block/partitions/efi.h @@ -130,22 +130,3 @@ typedef struct _legacy_mbr { extern int efi_partition(struct parsed_partitions *state); #endif - -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. 
- * -------------------------------------------------------------------------- - * Local variables: - * c-indent-level: 4 - * c-brace-imaginary-offset: 0 - * c-brace-offset: -4 - * c-argdecl-indent: 4 - * c-label-offset: -4 - * c-continued-statement-offset: 4 - * c-continued-brace-offset: 0 - * indent-tabs-mode: nil - * tab-width: 8 - * End: - */ From 70f637e90ea96187530365eb1ddff8d483ba460e Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:25:03 -0700 Subject: [PATCH 249/303] partitions/efi: some style cleanups Trivial coding style cleanups - still plenty left. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Davidlohr Bueso Reviewed-by: Karel Zak Acked-by: Matt Fleming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/partitions/efi.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/block/partitions/efi.c b/block/partitions/efi.c index de9f9bfa24bc..0df535fac0aa 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -25,6 +25,9 @@ * TODO: * * Changelog: + * Mon August 5th, 2013 Davidlohr Bueso + * - detect hybrid MBRs, tighter pMBR checking & cleanups. + * * Mon Nov 09 2004 Matt Domsch * - test for valid PMBR and valid PGPT before ever reading * AGPT, allow override with 'gpt' kernel command line option. @@ -289,8 +292,7 @@ static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state, return NULL; if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba), - (u8 *) pte, - count) < count) { + (u8 *) pte, count) < count) { kfree(pte); pte=NULL; return NULL; @@ -633,11 +635,8 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, *ptes = pptes; kfree(agpt); kfree(aptes); - if (!good_agpt) { - printk(KERN_WARNING - "Alternate GPT is invalid, " - "using primary GPT.\n"); - } + if (!good_agpt) + printk(KERN_WARNING "Alternate GPT is invalid, using primary GPT.\n"); return 1; } else if (good_agpt) { @@ -645,8 +644,7 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, *ptes = aptes; kfree(pgpt); kfree(pptes); - printk(KERN_WARNING - "Primary GPT is invalid, using alternate GPT.\n"); + printk(KERN_WARNING "Primary GPT is invalid, using alternate GPT.\n"); return 1; } @@ -708,8 +706,7 @@ int efi_partition(struct parsed_partitions *state) put_partition(state, i+1, start * ssz, size * ssz); /* If this is a RAID volume, tell md */ - if (!efi_guidcmp(ptes[i].partition_type_guid, - PARTITION_LINUX_RAID_GUID)) + if (!efi_guidcmp(ptes[i].partition_type_guid, PARTITION_LINUX_RAID_GUID)) state->parts[i + 1].flags = ADDPART_FLAG_RAID; info = &state->parts[i + 1].info; From b4bc4a18a226f46fec4ef47f2df28ea209db8b5d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 11 Sep 2013 14:25:04 -0700 Subject: [PATCH 250/303] block/partitions/efi.c: consistently use pr_foo() Cc: Davidlohr Bueso Cc: Karel Zak Cc: Matt Fleming Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/partitions/efi.c | 45 ++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/block/partitions/efi.c b/block/partitions/efi.c index 0df535fac0aa..1a5ec9a03c00 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -482,44 +482,42 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba) if (!pgpt || !agpt) return; if (le64_to_cpu(pgpt->my_lba) != le64_to_cpu(agpt->alternate_lba)) { - printk(KERN_WARNING - "GPT:Primary header LBA != Alt. 
header alternate_lba\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", + pr_warn("GPT:Primary header LBA != Alt. header alternate_lba\n"); + pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->my_lba), (unsigned long long)le64_to_cpu(agpt->alternate_lba)); error_found++; } if (le64_to_cpu(pgpt->alternate_lba) != le64_to_cpu(agpt->my_lba)) { - printk(KERN_WARNING - "GPT:Primary header alternate_lba != Alt. header my_lba\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", + pr_warn("GPT:Primary header alternate_lba != Alt. header my_lba\n"); + pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->alternate_lba), (unsigned long long)le64_to_cpu(agpt->my_lba)); error_found++; } if (le64_to_cpu(pgpt->first_usable_lba) != le64_to_cpu(agpt->first_usable_lba)) { - printk(KERN_WARNING "GPT:first_usable_lbas don't match.\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", + pr_warn("GPT:first_usable_lbas don't match.\n"); + pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->first_usable_lba), (unsigned long long)le64_to_cpu(agpt->first_usable_lba)); error_found++; } if (le64_to_cpu(pgpt->last_usable_lba) != le64_to_cpu(agpt->last_usable_lba)) { - printk(KERN_WARNING "GPT:last_usable_lbas don't match.\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", + pr_warn("GPT:last_usable_lbas don't match.\n"); + pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->last_usable_lba), (unsigned long long)le64_to_cpu(agpt->last_usable_lba)); error_found++; } if (efi_guidcmp(pgpt->disk_guid, agpt->disk_guid)) { - printk(KERN_WARNING "GPT:disk_guids don't match.\n"); + pr_warn("GPT:disk_guids don't match.\n"); error_found++; } if (le32_to_cpu(pgpt->num_partition_entries) != le32_to_cpu(agpt->num_partition_entries)) { - printk(KERN_WARNING "GPT:num_partition_entries don't match: " + pr_warn("GPT:num_partition_entries don't match: " "0x%x != 0x%x\n", le32_to_cpu(pgpt->num_partition_entries), le32_to_cpu(agpt->num_partition_entries)); @@ -527,8 +525,7 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba) } if (le32_to_cpu(pgpt->sizeof_partition_entry) != le32_to_cpu(agpt->sizeof_partition_entry)) { - printk(KERN_WARNING - "GPT:sizeof_partition_entry values don't match: " + pr_warn("GPT:sizeof_partition_entry values don't match: " "0x%x != 0x%x\n", le32_to_cpu(pgpt->sizeof_partition_entry), le32_to_cpu(agpt->sizeof_partition_entry)); @@ -536,34 +533,30 @@ compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba) } if (le32_to_cpu(pgpt->partition_entry_array_crc32) != le32_to_cpu(agpt->partition_entry_array_crc32)) { - printk(KERN_WARNING - "GPT:partition_entry_array_crc32 values don't match: " + pr_warn("GPT:partition_entry_array_crc32 values don't match: " "0x%x != 0x%x\n", le32_to_cpu(pgpt->partition_entry_array_crc32), le32_to_cpu(agpt->partition_entry_array_crc32)); error_found++; } if (le64_to_cpu(pgpt->alternate_lba) != lastlba) { - printk(KERN_WARNING - "GPT:Primary header thinks Alt. header is not at the end of the disk.\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", + pr_warn("GPT:Primary header thinks Alt. 
header is not at the end of the disk.\n"); + pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(pgpt->alternate_lba), (unsigned long long)lastlba); error_found++; } if (le64_to_cpu(agpt->my_lba) != lastlba) { - printk(KERN_WARNING - "GPT:Alternate GPT header not at the end of the disk.\n"); - printk(KERN_WARNING "GPT:%lld != %lld\n", + pr_warn("GPT:Alternate GPT header not at the end of the disk.\n"); + pr_warn("GPT:%lld != %lld\n", (unsigned long long)le64_to_cpu(agpt->my_lba), (unsigned long long)lastlba); error_found++; } if (error_found) - printk(KERN_WARNING - "GPT: Use GNU Parted to correct GPT errors.\n"); + pr_warn("GPT: Use GNU Parted to correct GPT errors.\n"); return; } @@ -636,7 +629,7 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, kfree(agpt); kfree(aptes); if (!good_agpt) - printk(KERN_WARNING "Alternate GPT is invalid, using primary GPT.\n"); + pr_warn("Alternate GPT is invalid, using primary GPT.\n"); return 1; } else if (good_agpt) { @@ -644,7 +637,7 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, *ptes = aptes; kfree(pgpt); kfree(pptes); - printk(KERN_WARNING "Primary GPT is invalid, using alternate GPT.\n"); + pr_warn("Primary GPT is invalid, using alternate GPT.\n"); return 1; } From 9dee5c51516d2c3fff22633c1272c5652e68075a Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Wed, 11 Sep 2013 14:25:10 -0700 Subject: [PATCH 251/303] rbtree: add postorder iteration functions Postorder iteration yields all of a node's children prior to yielding the node itself, and this particular implementation also avoids examining the leaf links in a node after that node has been yielded. In what I expect will be its most common usage, postorder iteration allows the deletion of every node in an rbtree without modifying the rbtree nodes (no _requirement_ that they be nulled) while avoiding referencing child nodes after they have been "deleted" (most commonly, freed). I have only updated zswap to use this functionality at this point, but numerous bits of code (most notably in the filesystem drivers) use a hand rolled postorder iteration that NULLs child links as it traverses the tree. Each of those instances could be replaced with this common implementation. 1 & 2 add rbtree postorder iteration functions. 3 adds testing of the iteration to the rbtree runtime tests 4 allows building the rbtree runtime tests as builtins 5 updates zswap. This patch: Add postorder iteration functions for rbtree. These are useful for safely freeing an entire rbtree without modifying the tree at all. 
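For example, tearing down a whole tree might look like this (a sketch; the struct and field names are made up, and a later patch in this series wraps the same pattern in a helper macro):

	struct thing {
		int val;
		struct rb_node node;
	};

	static void free_all(struct rb_root *root)
	{
		struct rb_node *rb = rb_first_postorder(root);

		while (rb) {
			/* fetch the successor before freeing the current node */
			struct rb_node *next = rb_next_postorder(rb);

			kfree(rb_entry(rb, struct thing, node));
			rb = next;
		}
		*root = RB_ROOT;
	}

Since every node is yielded only after both of its children, the iteration never follows a link inside an already-freed node.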
Signed-off-by: Cody P Schafer Reviewed-by: Seth Jennings Cc: David Woodhouse Cc: Rik van Riel Cc: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rbtree.h | 4 ++++ lib/rbtree.c | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 0022c1bb1e26..c467151e9950 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -68,6 +68,10 @@ extern struct rb_node *rb_prev(const struct rb_node *); extern struct rb_node *rb_first(const struct rb_root *); extern struct rb_node *rb_last(const struct rb_root *); +/* Postorder iteration - always visit the parent after its children */ +extern struct rb_node *rb_first_postorder(const struct rb_root *); +extern struct rb_node *rb_next_postorder(const struct rb_node *); + /* Fast replacement of a single node without remove/rebalance/add/rebalance */ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); diff --git a/lib/rbtree.c b/lib/rbtree.c index c0e31fe2fabf..65f4effd117f 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -518,3 +518,43 @@ void rb_replace_node(struct rb_node *victim, struct rb_node *new, *new = *victim; } EXPORT_SYMBOL(rb_replace_node); + +static struct rb_node *rb_left_deepest_node(const struct rb_node *node) +{ + for (;;) { + if (node->rb_left) + node = node->rb_left; + else if (node->rb_right) + node = node->rb_right; + else + return (struct rb_node *)node; + } +} + +struct rb_node *rb_next_postorder(const struct rb_node *node) +{ + const struct rb_node *parent; + if (!node) + return NULL; + parent = rb_parent(node); + + /* If we're sitting on node, we've already seen our children */ + if (parent && node == parent->rb_left && parent->rb_right) { + /* If we are the parent's left node, go to the parent's right + * node then all the way down to the left */ + return rb_left_deepest_node(parent->rb_right); + } else + /* Otherwise we are the parent's right node, and the parent + * should be next */ + return (struct rb_node *)parent; +} +EXPORT_SYMBOL(rb_next_postorder); + +struct rb_node *rb_first_postorder(const struct rb_root *root) +{ + if (!root->rb_node) + return NULL; + + return rb_left_deepest_node(root->rb_node); +} +EXPORT_SYMBOL(rb_first_postorder); From 2b529089257705499207ce7da9d0e3ae26a844ba Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Wed, 11 Sep 2013 14:25:11 -0700 Subject: [PATCH 252/303] rbtree: add rbtree_postorder_for_each_entry_safe() helper Because deletion (of the entire tree) is a relatively common use of the rbtree_postorder iteration, and because doing it safely means fiddling with temporary storage, provide a helper to simplify postorder rbtree iteration. Signed-off-by: Cody P Schafer Reviewed-by: Seth Jennings Cc: David Woodhouse Cc: Rik van Riel Cc: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rbtree.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index c467151e9950..aa870a4ddf54 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -85,4 +85,22 @@ static inline void rb_link_node(struct rb_node * node, struct rb_node * parent, *rb_link = node; } +/** + * rbtree_postorder_for_each_entry_safe - iterate over rb_root in post order of + * given type safe against removal of rb_node entry + * + * @pos: the 'type *' to use as a loop cursor. 
+ * @n: another 'type *' to use as temporary storage + * @root: 'rb_root *' of the rbtree. + * @field: the name of the rb_node field within 'type'. + */ +#define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \ + for (pos = rb_entry(rb_first_postorder(root), typeof(*pos), field),\ + n = rb_entry(rb_next_postorder(&pos->field), \ + typeof(*pos), field); \ + &pos->field; \ + pos = n, \ + n = rb_entry(rb_next_postorder(&pos->field), \ + typeof(*pos), field)) + #endif /* _LINUX_RBTREE_H */ From a791a62fdf288b2658646e2052400d456874790e Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Wed, 11 Sep 2013 14:25:17 -0700 Subject: [PATCH 253/303] rbtree_test: add test for postorder iteration Just check that we examine all nodes in the tree for the postorder iteration. Signed-off-by: Cody P Schafer Reviewed-by: Seth Jennings Cc: David Woodhouse Cc: Rik van Riel Cc: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/rbtree_test.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c index 122f02f9941b..31dd4ccd3baa 100644 --- a/lib/rbtree_test.c +++ b/lib/rbtree_test.c @@ -114,6 +114,16 @@ static int black_path_count(struct rb_node *rb) return count; } +static void check_postorder(int nr_nodes) +{ + struct rb_node *rb; + int count = 0; + for (rb = rb_first_postorder(&root); rb; rb = rb_next_postorder(rb)) + count++; + + WARN_ON_ONCE(count != nr_nodes); +} + static void check(int nr_nodes) { struct rb_node *rb; @@ -136,6 +146,8 @@ static void check(int nr_nodes) WARN_ON_ONCE(count != nr_nodes); WARN_ON_ONCE(count < (1 << black_path_count(rb_last(&root))) - 1); + + check_postorder(nr_nodes); } static void check_augmented(int nr_nodes) From 7c993e11aa59d9d1cefbd6acc8d84f2d8d46545a Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Wed, 11 Sep 2013 14:25:19 -0700 Subject: [PATCH 254/303] rbtree: allow tests to run as builtin No reason to require the rbtree test code to be a module; allow it to be builtin (this streamlines my development process). Signed-off-by: Cody P Schafer Reviewed-by: Seth Jennings Cc: David Woodhouse Cc: Rik van Riel Cc: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 652bea9054f0..c9eef36739a9 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1461,7 +1461,7 @@ config BACKTRACE_SELF_TEST config RBTREE_TEST tristate "Red-Black tree test" - depends on m && DEBUG_KERNEL + depends on DEBUG_KERNEL help A benchmark measuring the performance of the rbtree library. Also includes rbtree invariant checks.
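Aside (not part of these patches): with the rbtree_postorder_for_each_entry_safe() helper from patch 252, the open-coded teardown sketched after patch 251 collapses to a single loop (same hypothetical "struct item"):

	static void free_all(struct rb_root *root)
	{
		struct item *pos, *n;

		rbtree_postorder_for_each_entry_safe(pos, n, root, rbnode)
			kfree(pos);
		*root = RB_ROOT;
	}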
From 0bd42136f7ae4ea1375da34c32838fb35eee8c59 Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Wed, 11 Sep 2013 14:25:33 -0700 Subject: [PATCH 255/303] mm/zswap: use postorder iteration when destroying rbtree Signed-off-by: Cody P Schafer Reviewed-by: Seth Jennings Cc: David Woodhouse Cc: Rik van Riel Cc: Michel Lespinasse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zswap.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index efed4c8b7f5b..841e35f1db22 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -790,26 +790,14 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset) static void zswap_frontswap_invalidate_area(unsigned type) { struct zswap_tree *tree = zswap_trees[type]; - struct rb_node *node; - struct zswap_entry *entry; + struct zswap_entry *entry, *n; if (!tree) return; /* walk the tree and free everything */ spin_lock(&tree->lock); - /* - * TODO: Even though this code should not be executed because - * the try_to_unuse() in swapoff should have emptied the tree, - * it is very wasteful to rebalance the tree after every - * removal when we are freeing the whole tree. - * - * If post-order traversal code is ever added to the rbtree - * implementation, it should be used here. - */ - while ((node = rb_first(&tree->rbroot))) { - entry = rb_entry(node, struct zswap_entry, rbnode); - rb_erase(&entry->rbnode, &tree->rbroot); + rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) { zbud_free(tree->pool, entry->handle); zswap_entry_cache_free(entry); atomic_dec(&zswap_stored_pages); From 190519cd30884215a63ed875ac074dc97a602522 Mon Sep 17 00:00:00 2001 From: Ed Cashin Date: Wed, 11 Sep 2013 14:25:39 -0700 Subject: [PATCH 256/303] aoe: create and destroy debugfs directory for aoe This series adds the debugging information that the coraid.com-distributed aoe driver exports via sysfs, but instead of sysfs, it uses debugfs. With these patches applied, even without AoE targets on the network, KEDR reports new possible memory leaks, but these are from callers outside the aoe driver that have used aoe_devnode to get the name of the character devices through the aoe_class->devnode callback, and I believe they're responsible for freeing that memory. This patch: Create and destroy the debugfs directory. Signed-off-by: Ed Cashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/aoe/aoeblk.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 916d9ed5c8aa..cb508b754377 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -17,11 +17,13 @@ #include #include #include +#include #include #include "aoe.h" static DEFINE_MUTEX(aoeblk_mutex); static struct kmem_cache *buf_pool_cache; +static struct dentry *aoe_debugfs_dir; /* GPFS needs a larger value than the default. 
*/ static int aoe_maxsectors; @@ -351,6 +353,8 @@ err: void aoeblk_exit(void) { + debugfs_remove_recursive(aoe_debugfs_dir); + aoe_debugfs_dir = NULL; kmem_cache_destroy(buf_pool_cache); } @@ -362,7 +366,11 @@ aoeblk_init(void) 0, 0, NULL); if (buf_pool_cache == NULL) return -ENOMEM; - + aoe_debugfs_dir = debugfs_create_dir("aoe", NULL); + if (IS_ERR_OR_NULL(aoe_debugfs_dir)) { + pr_info("aoe: cannot create debugfs directory\n"); + aoe_debugfs_dir = NULL; + } return 0; } From e8866cf2b90f3a29859d2113c0fd23daf189c282 Mon Sep 17 00:00:00 2001 From: Ed Cashin Date: Wed, 11 Sep 2013 14:25:40 -0700 Subject: [PATCH 257/303] aoe: add AoE-target files to debugfs Signed-off-by: Ed Cashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/aoe/aoe.h | 2 ++ drivers/block/aoe/aoeblk.c | 35 +++++++++++++++++++++++++++++++++++ drivers/block/aoe/aoedev.c | 1 + 3 files changed, 38 insertions(+) diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h index 025c41d3cb33..b1f24c5a6bd1 100644 --- a/drivers/block/aoe/aoe.h +++ b/drivers/block/aoe/aoe.h @@ -169,6 +169,7 @@ struct aoedev { ulong ref; struct work_struct work;/* disk create work struct */ struct gendisk *gd; + struct dentry *debugfs; struct request_queue *blkq; struct hd_geometry geo; sector_t ssize; @@ -206,6 +207,7 @@ struct ktstate { int aoeblk_init(void); void aoeblk_exit(void); void aoeblk_gdalloc(void *); +void aoedisk_rm_debugfs(struct aoedev *d); void aoedisk_rm_sysfs(struct aoedev *d); int aoechr_init(void); diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index cb508b754377..d76c5cb3c708 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -132,6 +132,40 @@ static const struct attribute_group attr_group = { .attrs = aoe_attrs, }; +static const struct file_operations aoe_debugfs_fops; + +static void +aoedisk_add_debugfs(struct aoedev *d) +{ + struct dentry *entry; + char *p; + + if (aoe_debugfs_dir == NULL) + return; + p = strchr(d->gd->disk_name, '/'); + if (p == NULL) + p = d->gd->disk_name; + else + p++; + BUG_ON(*p == '\0'); + entry = debugfs_create_file(p, 0444, aoe_debugfs_dir, d, + &aoe_debugfs_fops); + if (IS_ERR_OR_NULL(entry)) { + pr_info("aoe: cannot create debugfs file for %s\n", + d->gd->disk_name); + return; + } + BUG_ON(d->debugfs); + d->debugfs = entry; +} +void +aoedisk_rm_debugfs(struct aoedev *d) +{ + BUG_ON(d->debugfs == NULL); + debugfs_remove(d->debugfs); + d->debugfs = NULL; +} + static int aoedisk_add_sysfs(struct aoedev *d) { @@ -332,6 +366,7 @@ aoeblk_gdalloc(void *vp) add_disk(gd); aoedisk_add_sysfs(d); + aoedisk_add_debugfs(d); spin_lock_irqsave(&d->lock, flags); WARN_ON(!(d->flags & DEVFL_GD_NOW)); diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c index 784c92e038d1..c9047675dfc9 100644 --- a/drivers/block/aoe/aoedev.c +++ b/drivers/block/aoe/aoedev.c @@ -278,6 +278,7 @@ freedev(struct aoedev *d) del_timer_sync(&d->timer); if (d->gd) { + aoedisk_rm_debugfs(d); aoedisk_rm_sysfs(d); del_gendisk(d->gd); put_disk(d->gd); From 1cf94797c2bbc514c2bd0892e1774a0a8ca9afdb Mon Sep 17 00:00:00 2001 From: Ed Cashin Date: Wed, 11 Sep 2013 14:25:41 -0700 Subject: [PATCH 258/303] aoe: provide file operations for debugfs files The place holder in the file contents is filled out in the following patch. 
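Aside (not part of these patches): patches 256-258 together follow the standard debugfs shape — one directory at module init, one seq_file-backed file per device, recursive removal at exit. A condensed, self-contained sketch of that pattern, with all names hypothetical:

	#include <linux/debugfs.h>
	#include <linux/fs.h>
	#include <linux/init.h>
	#include <linux/seq_file.h>

	static struct dentry *example_dir;	/* plays the role of aoe_debugfs_dir */

	static int example_show(struct seq_file *s, void *unused)
	{
		seq_printf(s, "state: %s\n", "up");	/* per-device state goes here */
		return 0;
	}

	static int example_open(struct inode *inode, struct file *file)
	{
		/* i_private carries the pointer passed to debugfs_create_file() */
		return single_open(file, example_show, inode->i_private);
	}

	static const struct file_operations example_fops = {
		.open		= example_open,
		.read		= seq_read,
		.llseek		= seq_lseek,
		.release	= single_release,
	};

	static int __init example_init(void)
	{
		example_dir = debugfs_create_dir("example", NULL);
		if (!IS_ERR_OR_NULL(example_dir))
			debugfs_create_file("dev0", 0444, example_dir,
					    NULL /* per-device data */,
					    &example_fops);
		return 0;
	}

	static void __exit example_exit(void)
	{
		debugfs_remove_recursive(example_dir);	/* tolerates NULL */
	}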
Signed-off-by: Ed Cashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/aoe/aoeblk.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index d76c5cb3c708..0511d38e412d 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -110,6 +110,24 @@ static ssize_t aoedisk_show_payload(struct device *dev, return snprintf(page, PAGE_SIZE, "%lu\n", d->maxbcnt); } +static int aoedisk_debugfs_show(struct seq_file *s, void *ignored) +{ + struct aoedev *d; + unsigned long flags; + + d = s->private; + spin_lock_irqsave(&d->lock, flags); + seq_printf(s, "%s\n", d->gd->disk_name); /* place holder */ + spin_unlock_irqrestore(&d->lock, flags); + + return 0; +} + +static int aoe_debugfs_open(struct inode *inode, struct file *file) +{ + return single_open(file, aoedisk_debugfs_show, inode->i_private); +} + static DEVICE_ATTR(state, S_IRUGO, aoedisk_show_state, NULL); static DEVICE_ATTR(mac, S_IRUGO, aoedisk_show_mac, NULL); static DEVICE_ATTR(netif, S_IRUGO, aoedisk_show_netif, NULL); @@ -132,7 +150,12 @@ static const struct attribute_group attr_group = { .attrs = aoe_attrs, }; -static const struct file_operations aoe_debugfs_fops; +static const struct file_operations aoe_debugfs_fops = { + .open = aoe_debugfs_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; static void aoedisk_add_debugfs(struct aoedev *d) From 2256c1c51e98d4eb2063a7f84f9ea783fda95f7f Mon Sep 17 00:00:00 2001 From: Ed Cashin Date: Wed, 11 Sep 2013 14:25:42 -0700 Subject: [PATCH 259/303] aoe: fill in per-AoE-target information for debugfs file This information is presented in a compact format that has evolved for easy routine scanning by expert humans, mostly developers and support technicians helping to troubleshoot or test AoE-based systems. Signed-off-by: Ed Cashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/aoe/aoeblk.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 0511d38e412d..b58cbeb43e05 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -113,11 +113,42 @@ static ssize_t aoedisk_show_payload(struct device *dev, static int aoedisk_debugfs_show(struct seq_file *s, void *ignored) { struct aoedev *d; + struct aoetgt **t, **te; + struct aoeif *ifp, *ife; unsigned long flags; + char c; d = s->private; + seq_printf(s, "rttavg: %d rttdev: %d\n", + d->rttavg >> RTTSCALE, + d->rttdev >> RTTDSCALE); + seq_printf(s, "nskbpool: %d\n", skb_queue_len(&d->skbpool)); + seq_printf(s, "kicked: %ld\n", d->kicked); + seq_printf(s, "maxbcnt: %ld\n", d->maxbcnt); + seq_printf(s, "ref: %ld\n", d->ref); + spin_lock_irqsave(&d->lock, flags); - seq_printf(s, "%s\n", d->gd->disk_name); /* place holder */ + t = d->targets; + te = t + d->ntargets; + for (; t < te && *t; t++) { + c = '\t'; + seq_printf(s, "falloc: %ld\n", (*t)->falloc); + seq_printf(s, "ffree: %p\n", + list_empty(&(*t)->ffree) ? 
NULL : (*t)->ffree.next); + seq_printf(s, "%pm:%d:%d:%d\n", (*t)->addr, (*t)->nout, + (*t)->maxout, (*t)->nframes); + seq_printf(s, "\tssthresh:%d\n", (*t)->ssthresh); + seq_printf(s, "\ttaint:%d\n", (*t)->taint); + seq_printf(s, "\tr:%d\n", (*t)->rpkts); + seq_printf(s, "\tw:%d\n", (*t)->wpkts); + ifp = (*t)->ifs; + ife = ifp + ARRAY_SIZE((*t)->ifs); + for (; ifp->nd && ifp < ife; ifp++) { + seq_printf(s, "%c%s", c, ifp->nd->name); + c = ','; + } + seq_puts(s, "\n"); + } spin_unlock_irqrestore(&d->lock, flags); return 0; From ec345120c571847dea3d3bef76dd9b7978fa794e Mon Sep 17 00:00:00 2001 From: Ed Cashin Date: Wed, 11 Sep 2013 14:25:43 -0700 Subject: [PATCH 260/303] aoe: update copyright date Signed-off-by: Ed Cashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/aoe/aoeblk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index b58cbeb43e05..d63dcf0f2266 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. */ +/* Copyright (c) 2013 Coraid, Inc. See COPYING for GPL terms. */ /* * aoeblk.c * block device routines From 896dcd9a64a86d8792302615ee8ab118dc8afd9c Mon Sep 17 00:00:00 2001 From: Ed Cashin Date: Wed, 11 Sep 2013 14:25:44 -0700 Subject: [PATCH 261/303] aoe: update internal version number to 85 Signed-off-by: Ed Cashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/aoe/aoe.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h index b1f24c5a6bd1..14a9d1912318 100644 --- a/drivers/block/aoe/aoe.h +++ b/drivers/block/aoe/aoe.h @@ -1,5 +1,5 @@ /* Copyright (c) 2013 Coraid, Inc. See COPYING for GPL terms. */ -#define VERSION "83" +#define VERSION "85" #define AOE_MAJOR 152 #define DEVICE_NAME "aoe" From a88c1f0caccaa335690d53ea03b12de31c357263 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 11 Sep 2013 14:25:44 -0700 Subject: [PATCH 262/303] aoe: remove custom implementation of kbasename() In the kernel we already have a helper for this: kbasename(). This patch replaces the custom implementation with a call to it.
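For reference, kbasename() (declared in <linux/string.h>) is roughly the following — i.e. exactly the strrchr() dance the driver had been open-coding:

	static inline const char *kbasename(const char *path)
	{
		const char *tail = strrchr(path, '/');

		/* the component after the last '/', or the whole string */
		return tail ? tail + 1 : path;
	}

So kbasename("etherd/e0.0") yields "e0.0", and kbasename("sda") yields "sda".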
Signed-off-by: Andy Shevchenko Cc: Ed Cashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/aoe/aoedev.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c index c9047675dfc9..e774c50b6842 100644 --- a/drivers/block/aoe/aoedev.c +++ b/drivers/block/aoe/aoedev.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "aoe.h" static void dummy_timer(ulong); @@ -241,16 +242,12 @@ aoedev_downdev(struct aoedev *d) static int user_req(char *s, size_t slen, struct aoedev *d) { - char *p; + const char *p; size_t lim; if (!d->gd) return 0; - p = strrchr(d->gd->disk_name, '/'); - if (!p) - p = d->gd->disk_name; - else - p += 1; + p = kbasename(d->gd->disk_name); lim = sizeof(d->gd->disk_name); lim -= p - d->gd->disk_name; if (slen < lim) From e0ec36059774ff51812b40509d28ca6c9a2a6a62 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 11 Sep 2013 14:25:45 -0700 Subject: [PATCH 263/303] aoe: suppress compiler warnings This patch fixes the following compiler warnings: drivers/block/aoe/aoecmd.c: In function `aoecmd_ata_rw': drivers/block/aoe/aoecmd.c:383:17: warning: variable `t' set but not used [-Wunused-but-set-variable] struct aoetgt *t; ^ drivers/block/aoe/aoecmd.c: In function `resend': drivers/block/aoe/aoecmd.c:488:21: warning: variable `ah' set but not used [-Wunused-but-set-variable] struct aoe_atahdr *ah; ^ Signed-off-by: Andy Shevchenko Cc: Ed Cashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/aoe/aoecmd.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 4d45dba7fb8f..d2515435e23f 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -380,7 +380,6 @@ aoecmd_ata_rw(struct aoedev *d) { struct frame *f; struct buf *buf; - struct aoetgt *t; struct sk_buff *skb; struct sk_buff_head queue; ulong bcnt, fbcnt; @@ -391,7 +390,6 @@ aoecmd_ata_rw(struct aoedev *d) f = newframe(d); if (f == NULL) return 0; - t = *d->tgt; bcnt = d->maxbcnt; if (bcnt == 0) bcnt = DEFAULTBCNT; @@ -485,7 +483,6 @@ resend(struct aoedev *d, struct frame *f) struct sk_buff *skb; struct sk_buff_head queue; struct aoe_hdr *h; - struct aoe_atahdr *ah; struct aoetgt *t; char buf[128]; u32 n; @@ -500,7 +497,6 @@ resend(struct aoedev *d, struct frame *f) return; } h = (struct aoe_hdr *) skb_mac_header(skb); - ah = (struct aoe_atahdr *) (h+1); if (!(f->flags & FFL_PROBE)) { snprintf(buf, sizeof(buf), From fea1b139735355fe17a66f63811c58698ff03ec5 Mon Sep 17 00:00:00 2001 From: Ed Cashin Date: Wed, 11 Sep 2013 14:25:46 -0700 Subject: [PATCH 264/303] aoe: do not BUG if memory pressure prevented debugfs file creation If the system has trouble allocating memory for the creation of the aoe debugfs directory or of a file inside it, the debugfs member of an aoedev can be NULL. Do not treat a NULL debugfs pointer as a BUG on aoedev shutdown, avoiding the user impact of an unnecessary panic. This is safe because debugfs_remove() is already a no-op when passed a NULL dentry.
Signed-off-by: Ed Cashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/aoe/aoeblk.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index d63dcf0f2266..dd73e1ff1759 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -215,7 +215,6 @@ aoedisk_add_debugfs(struct aoedev *d) void aoedisk_rm_debugfs(struct aoedev *d) { - BUG_ON(d->debugfs == NULL); debugfs_remove(d->debugfs); d->debugfs = NULL; } From 5173b414e42cb81e764f5e92a2f143e9a84fa3d1 Mon Sep 17 00:00:00 2001 From: Ed Cashin Date: Wed, 11 Sep 2013 14:25:47 -0700 Subject: [PATCH 265/303] aoe: remove do-nothing NAME="%k" term from example udev rules When the example udev rules in the documentation are used without modification, warnings like the one shown below appear in the system logs: /var/log/messages:Aug 22 11:09:11 kung udevd[445]: NAME="%k" \ is superfluous and breaks kernel supplied names, please remove \ it from /etc/udev/rules.d/60-aoe.rules:26 Removing the term does not cause any problems with the creation of the special character and block device nodes. Signed-off-by: Ed Cashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/aoe/udev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/aoe/udev.txt b/Documentation/aoe/udev.txt index 8686e789542e..1f06daf03f5b 100644 --- a/Documentation/aoe/udev.txt +++ b/Documentation/aoe/udev.txt @@ -23,4 +23,4 @@ SUBSYSTEM=="aoe", KERNEL=="revalidate", NAME="etherd/%k", GROUP="disk", MODE="02 SUBSYSTEM=="aoe", KERNEL=="flush", NAME="etherd/%k", GROUP="disk", MODE="0220" # aoe block devices -KERNEL=="etherd*", NAME="%k", GROUP="disk" +KERNEL=="etherd*", GROUP="disk" From 6325932666540beea18c800016368dc921068611 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 11 Sep 2013 14:25:48 -0700 Subject: [PATCH 266/303] affs: use loff_t in affs_truncate() It seems pretty unlikely that AFFS supports files over 4GB, but we may as well use loff_t, just for cleanliness' sake, instead of truncating it to 32 bits. Signed-off-by: Dan Carpenter Cc: Marco Stornelli Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/affs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/affs/file.c b/fs/affs/file.c index af3261b78102..776e3935a758 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -836,7 +836,7 @@ affs_truncate(struct inode *inode) struct address_space *mapping = inode->i_mapping; struct page *page; void *fsdata; - u32 size = inode->i_size; + loff_t size = inode->i_size; int res; res = mapping->a_ops->write_begin(NULL, mapping, size, 0, 0, &page, &fsdata); From 6723734cdff15211bb78aeea76ca847374bd93ae Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 11 Sep 2013 14:25:49 -0700 Subject: [PATCH 267/303] panic: call panic handlers before kmsg_dump Since the panic handlers may produce additional information (via printk) for the kernel log, that information should be reported as part of the panic output saved by kmsg_dump(). Without this re-ordering, nothing that adds information to a panic will show up in pstore's view when kmsg_dump runs, and is therefore not visible to crash reporting tools that examine pstore output.
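To make the ordering concrete, here is a minimal, hypothetical module (names invented for illustration) that registers on the panic notifier chain; after this patch, whatever such a handler prints is included in the dump that pstore captures:

	#include <linux/kernel.h>
	#include <linux/module.h>
	#include <linux/notifier.h>

	static int example_panic_notify(struct notifier_block *nb,
					unsigned long event, void *data)
	{
		/* now runs before kmsg_dump(KMSG_DUMP_PANIC) */
		pr_emerg("example: last-gasp diagnostic state\n");
		return NOTIFY_DONE;
	}

	static struct notifier_block example_panic_nb = {
		.notifier_call = example_panic_notify,
	};

	static int __init example_init(void)
	{
		atomic_notifier_chain_register(&panic_notifier_list,
					       &example_panic_nb);
		return 0;
	}
	module_init(example_init);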
Signed-off-by: Kees Cook Cc: Anton Vorontsov Cc: Colin Cross Acked-by: Tony Luck Cc: Stephen Boyd Cc: Vikram Mulukutla Cc: Peter Zijlstra Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/panic.c b/kernel/panic.c index 801864600514..b6c482ccc5db 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -123,10 +123,14 @@ void panic(const char *fmt, ...) */ smp_send_stop(); - kmsg_dump(KMSG_DUMP_PANIC); - + /* + * Run any panic handlers, including those that might need to + * add information to the kmsg dump output. + */ atomic_notifier_call_chain(&panic_notifier_list, 0, buf); + kmsg_dump(KMSG_DUMP_PANIC); + bust_spinlocks(0); if (!panic_blink) From 5323fb770b6254a4c218a2dfb0ef9aa007b7725a Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:25:51 -0700 Subject: [PATCH 268/303] pktcdvd: convert ZONE macro to static function get_zone() Macros should be converted to functions where feasible to verify arguments and the like. Signed-off-by: Joe Perches Cc: Jiri Kosina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/pktcdvd.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index f5d0ea11d9fd..aa18a2584249 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -83,9 +83,6 @@ #define MAX_SPEED 0xffff -#define ZONE(sector, pd) (((sector) + (pd)->offset) & \ - ~(sector_t)((pd)->settings.size - 1)) - static DEFINE_MUTEX(pktcdvd_mutex); static struct pktcdvd_device *pkt_devs[MAX_WRITERS]; static struct proc_dir_entry *pkt_proc; @@ -103,7 +100,10 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev); static int pkt_remove_dev(dev_t pkt_dev); static int pkt_seq_show(struct seq_file *m, void *p); - +static sector_t get_zone(sector_t sector, struct pktcdvd_device *pd) +{ + return (sector + pd->offset) & ~(sector_t)(pd->settings.size - 1); +} /* * create and register a pktcdvd kernel object. 
@@ -1224,7 +1224,7 @@ static int pkt_handle_queue(struct pktcdvd_device *pd) node = first_node; while (node) { bio = node->bio; - zone = ZONE(bio->bi_sector, pd); + zone = get_zone(bio->bi_sector, pd); list_for_each_entry(p, &pd->cdrw.pkt_active_list, list) { if (p->sector == zone) { bio = NULL; @@ -1264,8 +1264,8 @@ try_next_bio: while ((node = pkt_rbtree_find(pd, zone)) != NULL) { bio = node->bio; VPRINTK("pkt_handle_queue: found zone=%llx\n", - (unsigned long long)ZONE(bio->bi_sector, pd)); - if (ZONE(bio->bi_sector, pd) != zone) + (unsigned long long)get_zone(bio->bi_sector, pd)); + if (get_zone(bio->bi_sector, pd) != zone) break; pkt_rbtree_erase(pd, node); spin_lock(&pkt->lock); @@ -2394,7 +2394,7 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) blk_queue_bounce(q, &bio); - zone = ZONE(bio->bi_sector, pd); + zone = get_zone(bio->bi_sector, pd); VPRINTK("pkt_make_request: start = %6llx stop = %6llx\n", (unsigned long long)bio->bi_sector, (unsigned long long)bio_end_sector(bio)); @@ -2405,7 +2405,7 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) sector_t last_zone; int first_sectors; - last_zone = ZONE(bio_end_sector(bio) - 1, pd); + last_zone = get_zone(bio_end_sector(bio) - 1, pd); if (last_zone != zone) { BUG_ON(last_zone != zone + pd->settings.size); first_sectors = last_zone - bio->bi_sector; @@ -2500,7 +2500,7 @@ static int pkt_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd, struct bio_vec *bvec) { struct pktcdvd_device *pd = q->queuedata; - sector_t zone = ZONE(bmd->bi_sector, pd); + sector_t zone = get_zone(bmd->bi_sector, pd); int used = ((bmd->bi_sector - zone) << 9) + bmd->bi_size; int remaining = (pd->settings.size << 9) - used; int remaining2; From 99481334bcab1330fce0c590b845b4a95b101a69 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:25:52 -0700 Subject: [PATCH 269/303] pktcdvd: convert printk to pr_ Use a more current logging style and add message levels to the logging messages. Simplify pkt_dump_sense by using %*ph and adding a simple function to emit the sense string. Includes improvements from Andy Shevchenko and Dan Carpenter. Signed-off-by: Joe Perches Cc: Andy Shevchenko Cc: Dan Carpenter Cc: Jiri Kosina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/pktcdvd.c | 122 ++++++++++++++++++++-------------------- 1 file changed, 61 insertions(+), 61 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index aa18a2584249..eb71522c20ab 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -44,6 +44,8 @@ * *************************************************************************/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -424,7 +426,7 @@ static int pkt_sysfs_init(void) if (ret) { kfree(class_pktcdvd); class_pktcdvd = NULL; - printk(DRIVER_NAME": failed to create class pktcdvd\n"); + pr_err("failed to create class pktcdvd\n"); return ret; } return 0; @@ -734,36 +736,32 @@ out: return ret; } +static const char *sense_key_string(__u8 index) +{ + static const char * const info[] = { + "No sense", "Recovered error", "Not ready", + "Medium error", "Hardware error", "Illegal request", + "Unit attention", "Data protect", "Blank check", + }; + + return index < ARRAY_SIZE(info) ? info[index] : "INVALID"; +} + /* * A generic sense dump / resolve mechanism should be implemented across * all ATAPI + SCSI devices. 
*/ static void pkt_dump_sense(struct packet_command *cgc) { - static char *info[9] = { "No sense", "Recovered error", "Not ready", - "Medium error", "Hardware error", "Illegal request", - "Unit attention", "Data protect", "Blank check" }; - int i; struct request_sense *sense = cgc->sense; - printk(DRIVER_NAME":"); - for (i = 0; i < CDROM_PACKET_SIZE; i++) - printk(" %02x", cgc->cmd[i]); - printk(" - "); - - if (sense == NULL) { - printk("no sense\n"); - return; - } - - printk("sense %02x.%02x.%02x", sense->sense_key, sense->asc, sense->ascq); - - if (sense->sense_key > 8) { - printk(" (INVALID)\n"); - return; - } - - printk(" (%s)\n", info[sense->sense_key]); + if (sense) + pr_err("%*ph - sense %02x.%02x.%02x (%s)\n", + CDROM_PACKET_SIZE, cgc->cmd, + sense->sense_key, sense->asc, sense->ascq, + sense_key_string(sense->sense_key)); + else + pr_err("%*ph - no sense\n", CDROM_PACKET_SIZE, cgc->cmd); } /* @@ -943,7 +941,7 @@ static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_que set_bit(PACKET_MERGE_SEGS, &pd->flags); return 0; } else { - printk(DRIVER_NAME": cdrom max_phys_segments too small\n"); + pr_err("cdrom max_phys_segments too small\n"); return -EIO; } } @@ -1563,9 +1561,10 @@ work_to_do: static void pkt_print_settings(struct pktcdvd_device *pd) { - printk(DRIVER_NAME": %s packets, ", pd->settings.fp ? "Fixed" : "Variable"); - printk("%u blocks, ", pd->settings.size >> 2); - printk("Mode-%c disc\n", pd->settings.block_mode == 8 ? '1' : '2'); + pr_info("%s packets, %u blocks, Mode-%c disc\n", + pd->settings.fp ? "Fixed" : "Variable", + pd->settings.size >> 2, + pd->settings.block_mode == 8 ? '1' : '2'); } static int pkt_mode_sense(struct pktcdvd_device *pd, struct packet_command *cgc, int page_code, int page_control) @@ -1749,7 +1748,7 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) /* * paranoia */ - printk(DRIVER_NAME": write mode wrong %d\n", wp->data_block_type); + pr_err("write mode wrong %d\n", wp->data_block_type); return 1; } wp->packet_size = cpu_to_be32(pd->settings.size >> 2); @@ -1793,7 +1792,7 @@ static int pkt_writable_track(struct pktcdvd_device *pd, track_information *ti) if (ti->rt == 1 && ti->blank == 0) return 1; - printk(DRIVER_NAME": bad state %d-%d-%d\n", ti->rt, ti->blank, ti->packet); + pr_err("bad state %d-%d-%d\n", ti->rt, ti->blank, ti->packet); return 0; } @@ -1820,22 +1819,22 @@ static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di) * but i'm not sure, should we leave this to user apps? probably. */ if (di->disc_type == 0xff) { - printk(DRIVER_NAME": Unknown disc. 
No track?\n"); + pr_notice("unknown disc - no track?\n"); return 0; } if (di->disc_type != 0x20 && di->disc_type != 0) { - printk(DRIVER_NAME": Wrong disc type (%x)\n", di->disc_type); + pr_err("wrong disc type (%x)\n", di->disc_type); return 0; } if (di->erasable == 0) { - printk(DRIVER_NAME": Disc not erasable\n"); + pr_notice("disc not erasable\n"); return 0; } if (di->border_status == PACKET_SESSION_RESERVED) { - printk(DRIVER_NAME": Can't write to last track (reserved)\n"); + pr_err("can't write to last track (reserved)\n"); return 0; } @@ -1860,7 +1859,7 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) memset(&ti, 0, sizeof(track_information)); if ((ret = pkt_get_disc_info(pd, &di))) { - printk("failed get_disc\n"); + pr_err("failed get_disc\n"); return ret; } @@ -1871,12 +1870,12 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) track = 1; /* (di.last_track_msb << 8) | di.last_track_lsb; */ if ((ret = pkt_get_track_info(pd, track, 1, &ti))) { - printk(DRIVER_NAME": failed get_track\n"); + pr_err("failed get_track\n"); return ret; } if (!pkt_writable_track(pd, &ti)) { - printk(DRIVER_NAME": can't write to this track\n"); + pr_err("can't write to this track\n"); return -EROFS; } @@ -1886,11 +1885,11 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) */ pd->settings.size = be32_to_cpu(ti.fixed_packet_size) << 2; if (pd->settings.size == 0) { - printk(DRIVER_NAME": detected zero packet size!\n"); + pr_notice("detected zero packet size!\n"); return -ENXIO; } if (pd->settings.size > PACKET_MAX_SECTORS) { - printk(DRIVER_NAME": packet size is too big\n"); + pr_err("packet size is too big\n"); return -EROFS; } pd->settings.fp = ti.fp; @@ -1932,7 +1931,7 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) pd->settings.block_mode = PACKET_BLOCK_MODE2; break; default: - printk(DRIVER_NAME": unknown data mode\n"); + pr_err("unknown data mode\n"); return -EROFS; } return 0; @@ -1966,10 +1965,10 @@ static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd, cgc.buflen = cgc.cmd[8] = 2 + ((buf[0] << 8) | (buf[1] & 0xff)); ret = pkt_mode_select(pd, &cgc); if (ret) { - printk(DRIVER_NAME": write caching control failed\n"); + pr_err("write caching control failed\n"); pkt_dump_sense(&cgc); } else if (!ret && set) - printk(DRIVER_NAME": enabled write caching on %s\n", pd->name); + pr_notice("enabled write caching on %s\n", pd->name); return ret; } @@ -2084,11 +2083,11 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd, } if (!(buf[6] & 0x40)) { - printk(DRIVER_NAME": Disc type is not CD-RW\n"); + pr_notice("disc type is not CD-RW\n"); return 1; } if (!(buf[6] & 0x4)) { - printk(DRIVER_NAME": A1 values on media are not valid, maybe not CDRW?\n"); + pr_notice("A1 values on media are not valid, maybe not CDRW?\n"); return 1; } @@ -2108,14 +2107,14 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd, *speed = us_clv_to_speed[sp]; break; default: - printk(DRIVER_NAME": Unknown disc sub-type %d\n",st); + pr_notice("unknown disc sub-type %d\n", st); return 1; } if (*speed) { - printk(DRIVER_NAME": Max. 
media speed: %d\n",*speed); + pr_info("maximum media speed: %d\n", *speed); return 0; } else { - printk(DRIVER_NAME": Unknown speed %d for sub-type %d\n",sp,st); + pr_notice("unknown speed %d for sub-type %d\n", sp, st); return 1; } } @@ -2205,7 +2204,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) goto out; if ((ret = pkt_get_last_written(pd, &lba))) { - printk(DRIVER_NAME": pkt_get_last_written failed\n"); + pr_err("pkt_get_last_written failed\n"); goto out_putdev; } @@ -2235,11 +2234,11 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) if (write) { if (!pkt_grow_pktlist(pd, CONFIG_CDROM_PKTCDVD_BUFFERS)) { - printk(DRIVER_NAME": not enough memory for buffers\n"); + pr_err("not enough memory for buffers\n"); ret = -ENOMEM; goto out_putdev; } - printk(DRIVER_NAME": %lukB available on disc\n", lba << 1); + pr_info("%lukB available on disc\n", lba << 1); } return 0; @@ -2360,7 +2359,8 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) pd = q->queuedata; if (!pd) { - printk(DRIVER_NAME": %s incorrect request queue\n", bdevname(bio->bi_bdev, b)); + pr_err("%s incorrect request queue\n", + bdevname(bio->bi_bdev, b)); goto end_io; } @@ -2382,13 +2382,13 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) } if (!test_bit(PACKET_WRITABLE, &pd->flags)) { - printk(DRIVER_NAME": WRITE for ro device %s (%llu)\n", - pd->name, (unsigned long long)bio->bi_sector); + pr_notice("WRITE for ro device %s (%llu)\n", + pd->name, (unsigned long long)bio->bi_sector); goto end_io; } if (!bio->bi_size || (bio->bi_size % CD_FRAMESIZE)) { - printk(DRIVER_NAME": wrong bio size\n"); + pr_err("wrong bio size\n"); goto end_io; } @@ -2609,7 +2609,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) struct block_device *bdev; if (pd->pkt_dev == dev) { - printk(DRIVER_NAME": Recursive setup not allowed\n"); + pr_err("recursive setup not allowed\n"); return -EBUSY; } for (i = 0; i < MAX_WRITERS; i++) { @@ -2617,11 +2617,11 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) if (!pd2) continue; if (pd2->bdev->bd_dev == dev) { - printk(DRIVER_NAME": %s already setup\n", bdevname(pd2->bdev, b)); + pr_err("%s already setup\n", bdevname(pd2->bdev, b)); return -EBUSY; } if (pd2->pkt_dev == dev) { - printk(DRIVER_NAME": Can't chain pktcdvd devices\n"); + pr_err("can't chain pktcdvd devices\n"); return -EBUSY; } } @@ -2644,7 +2644,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) atomic_set(&pd->cdrw.pending_bios, 0); pd->cdrw.thread = kthread_run(kcdrwd, pd, "%s", pd->name); if (IS_ERR(pd->cdrw.thread)) { - printk(DRIVER_NAME": can't start kernel thread\n"); + pr_err("can't start kernel thread\n"); ret = -ENOMEM; goto out_mem; } @@ -2743,7 +2743,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) if (!pkt_devs[idx]) break; if (idx == MAX_WRITERS) { - printk(DRIVER_NAME": max %d writers supported\n", MAX_WRITERS); + pr_err("max %d writers supported\n", MAX_WRITERS); ret = -EBUSY; goto out_mutex; } @@ -2818,7 +2818,7 @@ out_mem: kfree(pd); out_mutex: mutex_unlock(&ctl_mutex); - printk(DRIVER_NAME": setup of pktcdvd device failed\n"); + pr_err("setup of pktcdvd device failed\n"); return ret; } @@ -2969,7 +2969,7 @@ static int __init pkt_init(void) ret = register_blkdev(pktdev_major, DRIVER_NAME); if (ret < 0) { - printk(DRIVER_NAME": Unable to register block device\n"); + pr_err("unable to register block device\n"); goto out2; } if (!pktdev_major) @@ -2983,7 +2983,7 @@ static int __init pkt_init(void) 
ret = misc_register(&pkt_misc); if (ret) { - printk(DRIVER_NAME": Unable to register misc device\n"); + pr_err("unable to register misc device\n"); goto out_misc; } From cd3f2cd05cf1066f3975ed3223f12c799bc553c6 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:25:53 -0700 Subject: [PATCH 270/303] pktcdvd: consolidate DPRINTK and VPRINTK macros Use the more common pkt_dbg(level, fmt, ...) form. These messages are emitted at KERN_NOTICE. Always emit function name with pkt_dbg(2, ...) uses and remove the sometimes abbreviated embedded function name. This form always verifies the format and arguments. Signed-off-by: Joe Perches Cc: Jiri Kosina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/pktcdvd.c | 107 ++++++++++++++++++++-------------------- 1 file changed, 53 insertions(+), 54 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index eb71522c20ab..aaa5da2b3d66 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -71,17 +71,13 @@ #define DRIVER_NAME "pktcdvd" -#if PACKET_DEBUG -#define DPRINTK(fmt, args...) printk(KERN_NOTICE fmt, ##args) -#else -#define DPRINTK(fmt, args...) -#endif - -#if PACKET_DEBUG > 1 -#define VPRINTK(fmt, args...) printk(KERN_NOTICE fmt, ##args) -#else -#define VPRINTK(fmt, args...) -#endif +#define pkt_dbg(level, fmt, ...) \ +do { \ + if (level == 2 && PACKET_DEBUG >= 2) \ + pr_notice("%s: " fmt, __func__, ##__VA_ARGS__); \ + else if (level == 1 && PACKET_DEBUG >= 1) \ + pr_notice(fmt, ##__VA_ARGS__); \ +} while (0) #define MAX_SPEED 0xffff @@ -519,7 +515,7 @@ static void pkt_bio_finished(struct pktcdvd_device *pd) { BUG_ON(atomic_read(&pd->cdrw.pending_bios) <= 0); if (atomic_dec_and_test(&pd->cdrw.pending_bios)) { - VPRINTK(DRIVER_NAME": queue empty\n"); + pkt_dbg(2, "queue empty\n"); atomic_set(&pd->iosched.attention, 1); wake_up(&pd->wqueue); } @@ -870,7 +866,7 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd) need_write_seek = 0; if (need_write_seek && reads_queued) { if (atomic_read(&pd->cdrw.pending_bios) > 0) { - VPRINTK(DRIVER_NAME": write, waiting\n"); + pkt_dbg(2, "write, waiting\n"); break; } pkt_flush_cache(pd); @@ -879,7 +875,7 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd) } else { if (!reads_queued && writes_queued) { if (atomic_read(&pd->cdrw.pending_bios) > 0) { - VPRINTK(DRIVER_NAME": read, waiting\n"); + pkt_dbg(2, "read, waiting\n"); break; } pd->iosched.writing = 1; @@ -985,8 +981,9 @@ static void pkt_end_io_read(struct bio *bio, int err) struct pktcdvd_device *pd = pkt->pd; BUG_ON(!pd); - VPRINTK("pkt_end_io_read: bio=%p sec0=%llx sec=%llx err=%d\n", bio, - (unsigned long long)pkt->sector, (unsigned long long)bio->bi_sector, err); + pkt_dbg(2, "bio=%p sec0=%llx sec=%llx err=%d\n", + bio, (unsigned long long)pkt->sector, + (unsigned long long)bio->bi_sector, err); if (err) atomic_inc(&pkt->io_errors); @@ -1003,7 +1000,7 @@ static void pkt_end_io_packet_write(struct bio *bio, int err) struct pktcdvd_device *pd = pkt->pd; BUG_ON(!pd); - VPRINTK("pkt_end_io_packet_write: id=%d, err=%d\n", pkt->id, err); + pkt_dbg(2, "id=%d, err=%d\n", pkt->id, err); pd->stats.pkt_ended++; @@ -1045,7 +1042,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) spin_unlock(&pkt->lock); if (pkt->cache_valid) { - VPRINTK("pkt_gather_data: zone %llx cached\n", + pkt_dbg(2, "zone %llx cached\n", (unsigned long long)pkt->sector); goto out_account; } @@ -1068,7 +1065,7 @@ static void pkt_gather_data(struct 
pktcdvd_device *pd, struct packet_data *pkt) p = (f * CD_FRAMESIZE) / PAGE_SIZE; offset = (f * CD_FRAMESIZE) % PAGE_SIZE; - VPRINTK("pkt_gather_data: Adding frame %d, page:%p offs:%d\n", + pkt_dbg(2, "Adding frame %d, page:%p offs:%d\n", f, pkt->pages[p], offset); if (!bio_add_page(bio, pkt->pages[p], CD_FRAMESIZE, offset)) BUG(); @@ -1080,7 +1077,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) } out_account: - VPRINTK("pkt_gather_data: need %d frames for zone %llx\n", + pkt_dbg(2, "need %d frames for zone %llx\n", frames_read, (unsigned long long)pkt->sector); pd->stats.pkt_started++; pd->stats.secs_rg += frames_read * (CD_FRAMESIZE >> 9); @@ -1181,7 +1178,8 @@ static inline void pkt_set_state(struct packet_data *pkt, enum packet_data_state "IDLE", "WAITING", "READ_WAIT", "WRITE_WAIT", "RECOVERY", "FINISHED" }; enum packet_data_state old_state = pkt->state; - VPRINTK("pkt %2d : s=%6llx %s -> %s\n", pkt->id, (unsigned long long)pkt->sector, + pkt_dbg(2, "pkt %2d : s=%6llx %s -> %s\n", + pkt->id, (unsigned long long)pkt->sector, state_name[old_state], state_name[state]); #endif pkt->state = state; @@ -1200,12 +1198,12 @@ static int pkt_handle_queue(struct pktcdvd_device *pd) struct rb_node *n; int wakeup; - VPRINTK("handle_queue\n"); + pkt_dbg(2, "\n"); atomic_set(&pd->scan_queue, 0); if (list_empty(&pd->cdrw.pkt_free_list)) { - VPRINTK("handle_queue: no pkt\n"); + pkt_dbg(2, "no pkt\n"); return 0; } @@ -1242,7 +1240,7 @@ try_next_bio: } spin_unlock(&pd->lock); if (!bio) { - VPRINTK("handle_queue: no bio\n"); + pkt_dbg(2, "no bio\n"); return 0; } @@ -1258,10 +1256,10 @@ try_next_bio: * to this packet. */ spin_lock(&pd->lock); - VPRINTK("pkt_handle_queue: looking for zone %llx\n", (unsigned long long)zone); + pkt_dbg(2, "looking for zone %llx\n", (unsigned long long)zone); while ((node = pkt_rbtree_find(pd, zone)) != NULL) { bio = node->bio; - VPRINTK("pkt_handle_queue: found zone=%llx\n", + pkt_dbg(2, "found zone=%llx\n", (unsigned long long)get_zone(bio->bi_sector, pd)); if (get_zone(bio->bi_sector, pd) != zone) break; @@ -1314,7 +1312,7 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) if (!bio_add_page(pkt->w_bio, bvec[f].bv_page, CD_FRAMESIZE, bvec[f].bv_offset)) BUG(); } - VPRINTK(DRIVER_NAME": vcnt=%d\n", pkt->w_bio->bi_vcnt); + pkt_dbg(2, "vcnt=%d\n", pkt->w_bio->bi_vcnt); /* * Fill-in bvec with data from orig_bios. 
@@ -1325,7 +1323,7 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) pkt_set_state(pkt, PACKET_WRITE_WAIT_STATE); spin_unlock(&pkt->lock); - VPRINTK("pkt_start_write: Writing %d frames for zone %llx\n", + pkt_dbg(2, "Writing %d frames for zone %llx\n", pkt->write_size, (unsigned long long)pkt->sector); if (test_bit(PACKET_MERGE_SEGS, &pd->flags) || (pkt->write_size < pkt->frames)) { @@ -1357,7 +1355,7 @@ static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data { int uptodate; - VPRINTK("run_state_machine: pkt %d\n", pkt->id); + pkt_dbg(2, "pkt %d\n", pkt->id); for (;;) { switch (pkt->state) { @@ -1396,7 +1394,7 @@ static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data if (pkt_start_recovery(pkt)) { pkt_start_write(pd, pkt); } else { - VPRINTK("No recovery possible\n"); + pkt_dbg(2, "No recovery possible\n"); pkt_set_state(pkt, PACKET_FINISHED_STATE); } break; @@ -1417,7 +1415,7 @@ static void pkt_handle_packets(struct pktcdvd_device *pd) { struct packet_data *pkt, *next; - VPRINTK("pkt_handle_packets\n"); + pkt_dbg(2, "\n"); /* * Run state machine for active packets @@ -1500,9 +1498,9 @@ static int kcdrwd(void *foobar) if (PACKET_DEBUG > 1) { int states[PACKET_NUM_STATES]; pkt_count_states(pd, states); - VPRINTK("kcdrwd: i:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n", - states[0], states[1], states[2], states[3], - states[4], states[5]); + pkt_dbg(2, "i:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n", + states[0], states[1], states[2], + states[3], states[4], states[5]); } min_sleep_time = MAX_SCHEDULE_TIMEOUT; @@ -1511,9 +1509,9 @@ static int kcdrwd(void *foobar) min_sleep_time = pkt->sleep_time; } - VPRINTK("kcdrwd: sleeping\n"); + pkt_dbg(2, "sleeping\n"); residue = schedule_timeout(min_sleep_time); - VPRINTK("kcdrwd: wake up\n"); + pkt_dbg(2, "wake up\n"); /* make swsusp happy with our thread */ try_to_freeze(); @@ -1810,7 +1808,8 @@ static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di) case 0x12: /* DVD-RAM */ return 1; default: - VPRINTK(DRIVER_NAME": Wrong disc profile (%x)\n", pd->mmc3_profile); + pkt_dbg(2, "Wrong disc profile (%x)\n", + pd->mmc3_profile); return 0; } @@ -2125,7 +2124,7 @@ static noinline_for_stack int pkt_perform_opc(struct pktcdvd_device *pd) struct request_sense sense; int ret; - VPRINTK(DRIVER_NAME": Performing OPC\n"); + pkt_dbg(2, "Performing OPC\n"); init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); cgc.sense = &sense; @@ -2143,12 +2142,12 @@ static int pkt_open_write(struct pktcdvd_device *pd) unsigned int write_speed, media_write_speed, read_speed; if ((ret = pkt_probe_settings(pd))) { - VPRINTK(DRIVER_NAME": %s failed probe\n", pd->name); + pkt_dbg(2, "%s failed probe\n", pd->name); return ret; } if ((ret = pkt_set_write_settings(pd))) { - DPRINTK(DRIVER_NAME": %s failed saving write settings\n", pd->name); + pkt_dbg(1, "%s failed saving write settings\n", pd->name); return -EIO; } @@ -2160,26 +2159,26 @@ static int pkt_open_write(struct pktcdvd_device *pd) case 0x13: /* DVD-RW */ case 0x1a: /* DVD+RW */ case 0x12: /* DVD-RAM */ - DPRINTK(DRIVER_NAME": write speed %ukB/s\n", write_speed); + pkt_dbg(1, "write speed %ukB/s\n", write_speed); break; default: if ((ret = pkt_media_speed(pd, &media_write_speed))) media_write_speed = 16; write_speed = min(write_speed, media_write_speed * 177); - DPRINTK(DRIVER_NAME": write speed %ux\n", write_speed / 176); + pkt_dbg(1, "write speed %ux\n", write_speed / 176); break; } read_speed = write_speed; if ((ret = pkt_set_speed(pd, 
write_speed, read_speed))) { - DPRINTK(DRIVER_NAME": %s couldn't set write speed\n", pd->name); + pkt_dbg(1, "%s couldn't set write speed\n", pd->name); return -EIO; } pd->write_speed = write_speed; pd->read_speed = read_speed; if ((ret = pkt_perform_opc(pd))) { - DPRINTK(DRIVER_NAME": %s Optimum Power Calibration failed\n", pd->name); + pkt_dbg(1, "%s Optimum Power Calibration failed\n", pd->name); } return 0; @@ -2256,7 +2255,7 @@ out: static void pkt_release_dev(struct pktcdvd_device *pd, int flush) { if (flush && pkt_flush_cache(pd)) - DPRINTK(DRIVER_NAME": %s not flushing cache\n", pd->name); + pkt_dbg(1, "%s not flushing cache\n", pd->name); pkt_lock_door(pd, 0); @@ -2278,7 +2277,7 @@ static int pkt_open(struct block_device *bdev, fmode_t mode) struct pktcdvd_device *pd = NULL; int ret; - VPRINTK(DRIVER_NAME": entering open\n"); + pkt_dbg(2, "entering\n"); mutex_lock(&pktcdvd_mutex); mutex_lock(&ctl_mutex); @@ -2314,7 +2313,7 @@ static int pkt_open(struct block_device *bdev, fmode_t mode) out_dec: pd->refcnt--; out: - VPRINTK(DRIVER_NAME": failed open (%d)\n", ret); + pkt_dbg(2, "failed (%d)\n", ret); mutex_unlock(&ctl_mutex); mutex_unlock(&pktcdvd_mutex); return ret; @@ -2395,7 +2394,7 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) blk_queue_bounce(q, &bio); zone = get_zone(bio->bi_sector, pd); - VPRINTK("pkt_make_request: start = %6llx stop = %6llx\n", + pkt_dbg(2, "start = %6llx stop = %6llx\n", (unsigned long long)bio->bi_sector, (unsigned long long)bio_end_sector(bio)); @@ -2650,7 +2649,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) } proc_create_data(pd->name, 0, pkt_proc, &pkt_proc_fops, pd); - DPRINTK(DRIVER_NAME": writer %s mapped to %s\n", pd->name, bdevname(bdev, b)); + pkt_dbg(1, "writer %s mapped to %s\n", pd->name, bdevname(bdev, b)); return 0; out_mem: @@ -2665,8 +2664,8 @@ static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, struct pktcdvd_device *pd = bdev->bd_disk->private_data; int ret; - VPRINTK("pkt_ioctl: cmd %x, dev %d:%d\n", cmd, - MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); + pkt_dbg(2, "cmd %x, dev %d:%d\n", + cmd, MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); mutex_lock(&pktcdvd_mutex); switch (cmd) { @@ -2690,7 +2689,7 @@ static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, break; default: - VPRINTK(DRIVER_NAME": Unknown ioctl for %s (%x)\n", pd->name, cmd); + pkt_dbg(2, "Unknown ioctl for %s (%x)\n", pd->name, cmd); ret = -ENOTTY; } mutex_unlock(&pktcdvd_mutex); @@ -2839,7 +2838,7 @@ static int pkt_remove_dev(dev_t pkt_dev) break; } if (idx == MAX_WRITERS) { - DPRINTK(DRIVER_NAME": dev not setup\n"); + pkt_dbg(1, "dev not setup\n"); ret = -ENXIO; goto out; } @@ -2859,7 +2858,7 @@ static int pkt_remove_dev(dev_t pkt_dev) blkdev_put(pd->bdev, FMODE_READ | FMODE_NDELAY); remove_proc_entry(pd->name, pkt_proc); - DPRINTK(DRIVER_NAME": writer %s unmapped\n", pd->name); + pkt_dbg(1, "writer %s unmapped\n", pd->name); del_gendisk(pd->disk); blk_cleanup_queue(pd->disk->queue); From 844aa7974395df1f0c7a866007e900e5f979fc7b Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:25:54 -0700 Subject: [PATCH 271/303] pktcdvd: add struct pktcdvd_device * to pkt_dbg Add pd->name to output for these debugging messages. Remove normally compiled out pkt_dbg(2, ...) function entry tracing equivalents as it's better done via the function tracer. 
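A note on why the pkt_dbg() form introduced by patches 270-271 is an improvement (generic sketch, not the driver's code): an #if that compiles a debug call away also hides format-string bugs, whereas a constant-false `if` lets the compiler type-check the arguments and then discard the dead call at optimization time:

	#define DEBUG_LEVEL 0

	/* old style: with DEBUG_LEVEL == 0 the arguments are never checked */
	#if DEBUG_LEVEL
	#define dbg_old(fmt, ...) printk(fmt, ##__VA_ARGS__)
	#else
	#define dbg_old(fmt, ...)
	#endif

	/* new style: the branch folds to false and the printk() is dropped,
	 * but only after the format and arguments have been verified */
	#define dbg_new(fmt, ...)				\
	do {							\
		if (DEBUG_LEVEL >= 1)				\
			printk(fmt, ##__VA_ARGS__);		\
	} while (0)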
Signed-off-by: Joe Perches Cc: Jiri Kosina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/pktcdvd.c | 90 +++++++++++++++++++---------------------- 1 file changed, 42 insertions(+), 48 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index aaa5da2b3d66..5050c7abb6ee 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -71,12 +71,13 @@ #define DRIVER_NAME "pktcdvd" -#define pkt_dbg(level, fmt, ...) \ -do { \ - if (level == 2 && PACKET_DEBUG >= 2) \ - pr_notice("%s: " fmt, __func__, ##__VA_ARGS__); \ - else if (level == 1 && PACKET_DEBUG >= 1) \ - pr_notice(fmt, ##__VA_ARGS__); \ +#define pkt_dbg(level, pd, fmt, ...) \ +do { \ + if (level == 2 && PACKET_DEBUG >= 2) \ + pr_notice("%s: %s():" fmt, \ + pd->name, __func__, ##__VA_ARGS__); \ + else if (level == 1 && PACKET_DEBUG >= 1) \ + pr_notice("%s: " fmt, pd->name, ##__VA_ARGS__); \ } while (0) #define MAX_SPEED 0xffff @@ -515,7 +516,7 @@ static void pkt_bio_finished(struct pktcdvd_device *pd) { BUG_ON(atomic_read(&pd->cdrw.pending_bios) <= 0); if (atomic_dec_and_test(&pd->cdrw.pending_bios)) { - pkt_dbg(2, "queue empty\n"); + pkt_dbg(2, pd, "queue empty\n"); atomic_set(&pd->iosched.attention, 1); wake_up(&pd->wqueue); } @@ -866,7 +867,7 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd) need_write_seek = 0; if (need_write_seek && reads_queued) { if (atomic_read(&pd->cdrw.pending_bios) > 0) { - pkt_dbg(2, "write, waiting\n"); + pkt_dbg(2, pd, "write, waiting\n"); break; } pkt_flush_cache(pd); @@ -875,7 +876,7 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd) } else { if (!reads_queued && writes_queued) { if (atomic_read(&pd->cdrw.pending_bios) > 0) { - pkt_dbg(2, "read, waiting\n"); + pkt_dbg(2, pd, "read, waiting\n"); break; } pd->iosched.writing = 1; @@ -981,7 +982,7 @@ static void pkt_end_io_read(struct bio *bio, int err) struct pktcdvd_device *pd = pkt->pd; BUG_ON(!pd); - pkt_dbg(2, "bio=%p sec0=%llx sec=%llx err=%d\n", + pkt_dbg(2, pd, "bio=%p sec0=%llx sec=%llx err=%d\n", bio, (unsigned long long)pkt->sector, (unsigned long long)bio->bi_sector, err); @@ -1000,7 +1001,7 @@ static void pkt_end_io_packet_write(struct bio *bio, int err) struct pktcdvd_device *pd = pkt->pd; BUG_ON(!pd); - pkt_dbg(2, "id=%d, err=%d\n", pkt->id, err); + pkt_dbg(2, pd, "id=%d, err=%d\n", pkt->id, err); pd->stats.pkt_ended++; @@ -1042,7 +1043,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) spin_unlock(&pkt->lock); if (pkt->cache_valid) { - pkt_dbg(2, "zone %llx cached\n", + pkt_dbg(2, pd, "zone %llx cached\n", (unsigned long long)pkt->sector); goto out_account; } @@ -1065,7 +1066,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) p = (f * CD_FRAMESIZE) / PAGE_SIZE; offset = (f * CD_FRAMESIZE) % PAGE_SIZE; - pkt_dbg(2, "Adding frame %d, page:%p offs:%d\n", + pkt_dbg(2, pd, "Adding frame %d, page:%p offs:%d\n", f, pkt->pages[p], offset); if (!bio_add_page(bio, pkt->pages[p], CD_FRAMESIZE, offset)) BUG(); @@ -1077,7 +1078,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) } out_account: - pkt_dbg(2, "need %d frames for zone %llx\n", + pkt_dbg(2, pd, "need %d frames for zone %llx\n", frames_read, (unsigned long long)pkt->sector); pd->stats.pkt_started++; pd->stats.secs_rg += frames_read * (CD_FRAMESIZE >> 9); @@ -1178,7 +1179,7 @@ static inline void pkt_set_state(struct packet_data *pkt, enum packet_data_state "IDLE", "WAITING", "READ_WAIT", "WRITE_WAIT", 
"RECOVERY", "FINISHED" }; enum packet_data_state old_state = pkt->state; - pkt_dbg(2, "pkt %2d : s=%6llx %s -> %s\n", + pkt_dbg(2, pd, "pkt %2d : s=%6llx %s -> %s\n", pkt->id, (unsigned long long)pkt->sector, state_name[old_state], state_name[state]); #endif @@ -1198,12 +1199,10 @@ static int pkt_handle_queue(struct pktcdvd_device *pd) struct rb_node *n; int wakeup; - pkt_dbg(2, "\n"); - atomic_set(&pd->scan_queue, 0); if (list_empty(&pd->cdrw.pkt_free_list)) { - pkt_dbg(2, "no pkt\n"); + pkt_dbg(2, pd, "no pkt\n"); return 0; } @@ -1240,7 +1239,7 @@ try_next_bio: } spin_unlock(&pd->lock); if (!bio) { - pkt_dbg(2, "no bio\n"); + pkt_dbg(2, pd, "no bio\n"); return 0; } @@ -1256,10 +1255,10 @@ try_next_bio: * to this packet. */ spin_lock(&pd->lock); - pkt_dbg(2, "looking for zone %llx\n", (unsigned long long)zone); + pkt_dbg(2, pd, "looking for zone %llx\n", (unsigned long long)zone); while ((node = pkt_rbtree_find(pd, zone)) != NULL) { bio = node->bio; - pkt_dbg(2, "found zone=%llx\n", + pkt_dbg(2, pd, "found zone=%llx\n", (unsigned long long)get_zone(bio->bi_sector, pd)); if (get_zone(bio->bi_sector, pd) != zone) break; @@ -1312,7 +1311,7 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) if (!bio_add_page(pkt->w_bio, bvec[f].bv_page, CD_FRAMESIZE, bvec[f].bv_offset)) BUG(); } - pkt_dbg(2, "vcnt=%d\n", pkt->w_bio->bi_vcnt); + pkt_dbg(2, pd, "vcnt=%d\n", pkt->w_bio->bi_vcnt); /* * Fill-in bvec with data from orig_bios. @@ -1323,7 +1322,7 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) pkt_set_state(pkt, PACKET_WRITE_WAIT_STATE); spin_unlock(&pkt->lock); - pkt_dbg(2, "Writing %d frames for zone %llx\n", + pkt_dbg(2, pd, "Writing %d frames for zone %llx\n", pkt->write_size, (unsigned long long)pkt->sector); if (test_bit(PACKET_MERGE_SEGS, &pd->flags) || (pkt->write_size < pkt->frames)) { @@ -1355,7 +1354,7 @@ static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data { int uptodate; - pkt_dbg(2, "pkt %d\n", pkt->id); + pkt_dbg(2, pd, "pkt %d\n", pkt->id); for (;;) { switch (pkt->state) { @@ -1394,7 +1393,7 @@ static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data if (pkt_start_recovery(pkt)) { pkt_start_write(pd, pkt); } else { - pkt_dbg(2, "No recovery possible\n"); + pkt_dbg(2, pd, "No recovery possible\n"); pkt_set_state(pkt, PACKET_FINISHED_STATE); } break; @@ -1415,8 +1414,6 @@ static void pkt_handle_packets(struct pktcdvd_device *pd) { struct packet_data *pkt, *next; - pkt_dbg(2, "\n"); - /* * Run state machine for active packets */ @@ -1498,7 +1495,7 @@ static int kcdrwd(void *foobar) if (PACKET_DEBUG > 1) { int states[PACKET_NUM_STATES]; pkt_count_states(pd, states); - pkt_dbg(2, "i:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n", + pkt_dbg(2, pd, "i:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n", states[0], states[1], states[2], states[3], states[4], states[5]); } @@ -1509,9 +1506,9 @@ static int kcdrwd(void *foobar) min_sleep_time = pkt->sleep_time; } - pkt_dbg(2, "sleeping\n"); + pkt_dbg(2, pd, "sleeping\n"); residue = schedule_timeout(min_sleep_time); - pkt_dbg(2, "wake up\n"); + pkt_dbg(2, pd, "wake up\n"); /* make swsusp happy with our thread */ try_to_freeze(); @@ -1808,7 +1805,7 @@ static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di) case 0x12: /* DVD-RAM */ return 1; default: - pkt_dbg(2, "Wrong disc profile (%x)\n", + pkt_dbg(2, pd, "Wrong disc profile (%x)\n", pd->mmc3_profile); return 0; } @@ -2124,7 +2121,7 @@ static noinline_for_stack int 
pkt_perform_opc(struct pktcdvd_device *pd) struct request_sense sense; int ret; - pkt_dbg(2, "Performing OPC\n"); + pkt_dbg(2, pd, "Performing OPC\n"); init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE); cgc.sense = &sense; @@ -2142,12 +2139,12 @@ static int pkt_open_write(struct pktcdvd_device *pd) unsigned int write_speed, media_write_speed, read_speed; if ((ret = pkt_probe_settings(pd))) { - pkt_dbg(2, "%s failed probe\n", pd->name); + pkt_dbg(2, pd, "failed probe\n"); return ret; } if ((ret = pkt_set_write_settings(pd))) { - pkt_dbg(1, "%s failed saving write settings\n", pd->name); + pkt_dbg(1, pd, "failed saving write settings\n"); return -EIO; } @@ -2159,26 +2156,26 @@ static int pkt_open_write(struct pktcdvd_device *pd) case 0x13: /* DVD-RW */ case 0x1a: /* DVD+RW */ case 0x12: /* DVD-RAM */ - pkt_dbg(1, "write speed %ukB/s\n", write_speed); + pkt_dbg(1, pd, "write speed %ukB/s\n", write_speed); break; default: if ((ret = pkt_media_speed(pd, &media_write_speed))) media_write_speed = 16; write_speed = min(write_speed, media_write_speed * 177); - pkt_dbg(1, "write speed %ux\n", write_speed / 176); + pkt_dbg(1, pd, "write speed %ux\n", write_speed / 176); break; } read_speed = write_speed; if ((ret = pkt_set_speed(pd, write_speed, read_speed))) { - pkt_dbg(1, "%s couldn't set write speed\n", pd->name); + pkt_dbg(1, pd, "couldn't set write speed\n"); return -EIO; } pd->write_speed = write_speed; pd->read_speed = read_speed; if ((ret = pkt_perform_opc(pd))) { - pkt_dbg(1, "%s Optimum Power Calibration failed\n", pd->name); + pkt_dbg(1, pd, "Optimum Power Calibration failed\n"); } return 0; @@ -2255,7 +2252,7 @@ out: static void pkt_release_dev(struct pktcdvd_device *pd, int flush) { if (flush && pkt_flush_cache(pd)) - pkt_dbg(1, "%s not flushing cache\n", pd->name); + pkt_dbg(1, pd, "not flushing cache\n"); pkt_lock_door(pd, 0); @@ -2277,8 +2274,6 @@ static int pkt_open(struct block_device *bdev, fmode_t mode) struct pktcdvd_device *pd = NULL; int ret; - pkt_dbg(2, "entering\n"); - mutex_lock(&pktcdvd_mutex); mutex_lock(&ctl_mutex); pd = pkt_find_dev_from_minor(MINOR(bdev->bd_dev)); @@ -2313,7 +2308,6 @@ static int pkt_open(struct block_device *bdev, fmode_t mode) out_dec: pd->refcnt--; out: - pkt_dbg(2, "failed (%d)\n", ret); mutex_unlock(&ctl_mutex); mutex_unlock(&pktcdvd_mutex); return ret; @@ -2394,7 +2388,7 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) blk_queue_bounce(q, &bio); zone = get_zone(bio->bi_sector, pd); - pkt_dbg(2, "start = %6llx stop = %6llx\n", + pkt_dbg(2, pd, "start = %6llx stop = %6llx\n", (unsigned long long)bio->bi_sector, (unsigned long long)bio_end_sector(bio)); @@ -2649,7 +2643,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) } proc_create_data(pd->name, 0, pkt_proc, &pkt_proc_fops, pd); - pkt_dbg(1, "writer %s mapped to %s\n", pd->name, bdevname(bdev, b)); + pkt_dbg(1, pd, "writer mapped to %s\n", bdevname(bdev, b)); return 0; out_mem: @@ -2664,7 +2658,7 @@ static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, struct pktcdvd_device *pd = bdev->bd_disk->private_data; int ret; - pkt_dbg(2, "cmd %x, dev %d:%d\n", + pkt_dbg(2, pd, "cmd %x, dev %d:%d\n", cmd, MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); mutex_lock(&pktcdvd_mutex); @@ -2689,7 +2683,7 @@ static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, break; default: - pkt_dbg(2, "Unknown ioctl for %s (%x)\n", pd->name, cmd); + pkt_dbg(2, pd, "Unknown ioctl (%x)\n", cmd); ret = -ENOTTY; } mutex_unlock(&pktcdvd_mutex); 
@@ -2838,7 +2832,7 @@ static int pkt_remove_dev(dev_t pkt_dev) break; } if (idx == MAX_WRITERS) { - pkt_dbg(1, "dev not setup\n"); + pkt_dbg(1, pd, "dev not setup\n"); ret = -ENXIO; goto out; } @@ -2858,7 +2852,7 @@ static int pkt_remove_dev(dev_t pkt_dev) blkdev_put(pd->bdev, FMODE_READ | FMODE_NDELAY); remove_proc_entry(pd->name, pkt_proc); - pkt_dbg(1, "writer %s unmapped\n", pd->name); + pkt_dbg(1, pd, "writer unmapped\n"); del_gendisk(pd->disk); blk_cleanup_queue(pd->disk->queue); From fa63c0ab81f55eb5a016c1bcea04fe39c14afbaa Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:25:55 -0700 Subject: [PATCH 272/303] pktcdvd: add struct pktcdvd_device.name to pr_err logging where possible Add a new pkt_err macro to prefix the name to the logging output. Convert pr_err where there is a non-null struct pktcdvd_device. Includes improvements from Andy Shevchenko. Signed-off-by: Joe Perches Cc: Andy Shevchenko Cc: Jiri Kosina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/pktcdvd.c | 44 ++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 5050c7abb6ee..3dbe42fe1bdd 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -71,6 +71,9 @@ #define DRIVER_NAME "pktcdvd" +#define pkt_err(pd, fmt, ...) \ + pr_err("%s: " fmt, pd->name, ##__VA_ARGS__) + #define pkt_dbg(level, pd, fmt, ...) \ do { \ if (level == 2 && PACKET_DEBUG >= 2) \ @@ -938,7 +941,7 @@ static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_que set_bit(PACKET_MERGE_SEGS, &pd->flags); return 0; } else { - pr_err("cdrom max_phys_segments too small\n"); + pkt_err(pd, "cdrom max_phys_segments too small\n"); return -EIO; } } @@ -1743,7 +1746,7 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) /* * paranoia */ - pr_err("write mode wrong %d\n", wp->data_block_type); + pkt_err(pd, "write mode wrong %d\n", wp->data_block_type); return 1; } wp->packet_size = cpu_to_be32(pd->settings.size >> 2); @@ -1787,7 +1790,7 @@ static int pkt_writable_track(struct pktcdvd_device *pd, track_information *ti) if (ti->rt == 1 && ti->blank == 0) return 1; - pr_err("bad state %d-%d-%d\n", ti->rt, ti->blank, ti->packet); + pkt_err(pd, "bad state %d-%d-%d\n", ti->rt, ti->blank, ti->packet); return 0; } @@ -1820,7 +1823,7 @@ static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di) } if (di->disc_type != 0x20 && di->disc_type != 0) { - pr_err("wrong disc type (%x)\n", di->disc_type); + pkt_err(pd, "wrong disc type (%x)\n", di->disc_type); return 0; } @@ -1830,7 +1833,7 @@ static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di) } if (di->border_status == PACKET_SESSION_RESERVED) { - pr_err("can't write to last track (reserved)\n"); + pkt_err(pd, "can't write to last track (reserved)\n"); return 0; } @@ -1855,7 +1858,7 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) memset(&ti, 0, sizeof(track_information)); if ((ret = pkt_get_disc_info(pd, &di))) { - pr_err("failed get_disc\n"); + pkt_err(pd, "failed get_disc\n"); return ret; } @@ -1866,12 +1869,12 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) track = 1; /* (di.last_track_msb << 8) | di.last_track_lsb; */ if ((ret = pkt_get_track_info(pd, track, 1, &ti))) { - pr_err("failed get_track\n"); + pkt_err(pd, "failed get_track\n"); return ret; } if (!pkt_writable_track(pd, &ti)) { - pr_err("can't 
write to this track\n"); + pkt_err(pd, "can't write to this track\n"); return -EROFS; } @@ -1885,7 +1888,7 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) return -ENXIO; } if (pd->settings.size > PACKET_MAX_SECTORS) { - pr_err("packet size is too big\n"); + pkt_err(pd, "packet size is too big\n"); return -EROFS; } pd->settings.fp = ti.fp; @@ -1927,7 +1930,7 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) pd->settings.block_mode = PACKET_BLOCK_MODE2; break; default: - pr_err("unknown data mode\n"); + pkt_err(pd, "unknown data mode\n"); return -EROFS; } return 0; @@ -1961,7 +1964,7 @@ static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd, cgc.buflen = cgc.cmd[8] = 2 + ((buf[0] << 8) | (buf[1] & 0xff)); ret = pkt_mode_select(pd, &cgc); if (ret) { - pr_err("write caching control failed\n"); + pkt_err(pd, "write caching control failed\n"); pkt_dump_sense(&cgc); } else if (!ret && set) pr_notice("enabled write caching on %s\n", pd->name); @@ -2200,7 +2203,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) goto out; if ((ret = pkt_get_last_written(pd, &lba))) { - pr_err("pkt_get_last_written failed\n"); + pkt_err(pd, "pkt_get_last_written failed\n"); goto out_putdev; } @@ -2230,7 +2233,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) if (write) { if (!pkt_grow_pktlist(pd, CONFIG_CDROM_PKTCDVD_BUFFERS)) { - pr_err("not enough memory for buffers\n"); + pkt_err(pd, "not enough memory for buffers\n"); ret = -ENOMEM; goto out_putdev; } @@ -2352,8 +2355,8 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) pd = q->queuedata; if (!pd) { - pr_err("%s incorrect request queue\n", - bdevname(bio->bi_bdev, b)); + pkt_err(pd, "%s incorrect request queue\n", + bdevname(bio->bi_bdev, b)); goto end_io; } @@ -2381,7 +2384,7 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) } if (!bio->bi_size || (bio->bi_size % CD_FRAMESIZE)) { - pr_err("wrong bio size\n"); + pkt_err(pd, "wrong bio size\n"); goto end_io; } @@ -2602,7 +2605,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) struct block_device *bdev; if (pd->pkt_dev == dev) { - pr_err("recursive setup not allowed\n"); + pkt_err(pd, "recursive setup not allowed\n"); return -EBUSY; } for (i = 0; i < MAX_WRITERS; i++) { @@ -2610,11 +2613,12 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) if (!pd2) continue; if (pd2->bdev->bd_dev == dev) { - pr_err("%s already setup\n", bdevname(pd2->bdev, b)); + pkt_err(pd, "%s already setup\n", + bdevname(pd2->bdev, b)); return -EBUSY; } if (pd2->pkt_dev == dev) { - pr_err("can't chain pktcdvd devices\n"); + pkt_err(pd, "can't chain pktcdvd devices\n"); return -EBUSY; } } @@ -2637,7 +2641,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) atomic_set(&pd->cdrw.pending_bios, 0); pd->cdrw.thread = kthread_run(kcdrwd, pd, "%s", pd->name); if (IS_ERR(pd->cdrw.thread)) { - pr_err("can't start kernel thread\n"); + pkt_err(pd, "can't start kernel thread\n"); ret = -ENOMEM; goto out_mem; } From ca73dabc3d1df4aaacb242b6d4e39cd36618444f Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:25:56 -0700 Subject: [PATCH 273/303] pktcdvd: convert pr_notice to pkt_notice Add a new pkt_notice macro to prefix the name to the logging output. 
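For illustration, with this macro a call such as:

	pkt_notice(pd, "disc not erasable\n");

preprocesses to:

	pr_notice("%s: " "disc not erasable\n", pd->name);

The adjacent string literals are concatenated at compile time, and the
GNU ##__VA_ARGS__ extension drops the trailing comma when no extra
arguments are passed.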
Signed-off-by: Joe Perches Cc: Jiri Kosina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/pktcdvd.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 3dbe42fe1bdd..136a04c3a07a 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -73,6 +73,8 @@ #define pkt_err(pd, fmt, ...) \ pr_err("%s: " fmt, pd->name, ##__VA_ARGS__) +#define pkt_notice(pd, fmt, ...) \ + pr_notice("%s: " fmt, pd->name, ##__VA_ARGS__) #define pkt_dbg(level, pd, fmt, ...) \ do { \ @@ -1818,7 +1820,7 @@ static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di) * but i'm not sure, should we leave this to user apps? probably. */ if (di->disc_type == 0xff) { - pr_notice("unknown disc - no track?\n"); + pkt_notice(pd, "unknown disc - no track?\n"); return 0; } @@ -1828,7 +1830,7 @@ static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di) } if (di->erasable == 0) { - pr_notice("disc not erasable\n"); + pkt_notice(pd, "disc not erasable\n"); return 0; } @@ -1884,7 +1886,7 @@ static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd) */ pd->settings.size = be32_to_cpu(ti.fixed_packet_size) << 2; if (pd->settings.size == 0) { - pr_notice("detected zero packet size!\n"); + pkt_notice(pd, "detected zero packet size!\n"); return -ENXIO; } if (pd->settings.size > PACKET_MAX_SECTORS) { @@ -1967,7 +1969,7 @@ static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd, pkt_err(pd, "write caching control failed\n"); pkt_dump_sense(&cgc); } else if (!ret && set) - pr_notice("enabled write caching on %s\n", pd->name); + pkt_notice(pd, "enabled write caching\n"); return ret; } @@ -2082,11 +2084,11 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd, } if (!(buf[6] & 0x40)) { - pr_notice("disc type is not CD-RW\n"); + pkt_notice(pd, "disc type is not CD-RW\n"); return 1; } if (!(buf[6] & 0x4)) { - pr_notice("A1 values on media are not valid, maybe not CDRW?\n"); + pkt_notice(pd, "A1 values on media are not valid, maybe not CDRW?\n"); return 1; } @@ -2106,14 +2108,14 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd, *speed = us_clv_to_speed[sp]; break; default: - pr_notice("unknown disc sub-type %d\n", st); + pkt_notice(pd, "unknown disc sub-type %d\n", st); return 1; } if (*speed) { pr_info("maximum media speed: %d\n", *speed); return 0; } else { - pr_notice("unknown speed %d for sub-type %d\n", sp, st); + pkt_notice(pd, "unknown speed %d for sub-type %d\n", sp, st); return 1; } } @@ -2378,8 +2380,8 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) } if (!test_bit(PACKET_WRITABLE, &pd->flags)) { - pr_notice("WRITE for ro device %s (%llu)\n", - pd->name, (unsigned long long)bio->bi_sector); + pkt_notice(pd, "WRITE for ro device (%llu)\n", + (unsigned long long)bio->bi_sector); goto end_io; } From 0c075d64df3aa9636a8700a710a2f2ada5e453a2 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:25:57 -0700 Subject: [PATCH 274/303] pktcdvd: convert pr_info to pkt_info Add a new pkt_info macro to prefix the name to the logging output. 
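For example, the conversion below turns:

	pr_info("maximum media speed: %d\n", *speed);

into:

	pkt_info(pd, "maximum media speed: %d\n", *speed);

which expands to:

	pr_info("%s: " "maximum media speed: %d\n", pd->name, *speed);

so the device-name prefix costs nothing at run time.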
Signed-off-by: Joe Perches Cc: Jiri Kosina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/pktcdvd.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 136a04c3a07a..1ceafb70fc16 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -75,6 +75,8 @@ pr_err("%s: " fmt, pd->name, ##__VA_ARGS__) #define pkt_notice(pd, fmt, ...) \ pr_notice("%s: " fmt, pd->name, ##__VA_ARGS__) +#define pkt_info(pd, fmt, ...) \ + pr_info("%s: " fmt, pd->name, ##__VA_ARGS__) #define pkt_dbg(level, pd, fmt, ...) \ do { \ @@ -1561,10 +1563,10 @@ work_to_do: static void pkt_print_settings(struct pktcdvd_device *pd) { - pr_info("%s packets, %u blocks, Mode-%c disc\n", - pd->settings.fp ? "Fixed" : "Variable", - pd->settings.size >> 2, - pd->settings.block_mode == 8 ? '1' : '2'); + pkt_info(pd, "%s packets, %u blocks, Mode-%c disc\n", + pd->settings.fp ? "Fixed" : "Variable", + pd->settings.size >> 2, + pd->settings.block_mode == 8 ? '1' : '2'); } static int pkt_mode_sense(struct pktcdvd_device *pd, struct packet_command *cgc, int page_code, int page_control) @@ -2112,7 +2114,7 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd, return 1; } if (*speed) { - pr_info("maximum media speed: %d\n", *speed); + pkt_info(pd, "maximum media speed: %d\n", *speed); return 0; } else { pkt_notice(pd, "unknown speed %d for sub-type %d\n", sp, st); @@ -2239,7 +2241,7 @@ static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write) ret = -ENOMEM; goto out_putdev; } - pr_info("%lukB available on disc\n", lba << 1); + pkt_info(pd, "%lukB available on disc\n", lba << 1); } return 0; From f3ded788bbd95af344a34ae40d1ab578cb76c7fb Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:25:58 -0700 Subject: [PATCH 275/303] pktcdvd: add struct pktcdvd_device * to pkt_dump_sense() Allow the device name to be emitted with pkt_err when logging the sense data. Signed-off-by: Joe Perches Cc: Jiri Kosina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/pktcdvd.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 1ceafb70fc16..29a5194ab147 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -755,17 +755,18 @@ static const char *sense_key_string(__u8 index) * A generic sense dump / resolve mechanism should be implemented across * all ATAPI + SCSI devices. 
*/ -static void pkt_dump_sense(struct packet_command *cgc) +static void pkt_dump_sense(struct pktcdvd_device *pd, + struct packet_command *cgc) { struct request_sense *sense = cgc->sense; if (sense) - pr_err("%*ph - sense %02x.%02x.%02x (%s)\n", - CDROM_PACKET_SIZE, cgc->cmd, - sense->sense_key, sense->asc, sense->ascq, - sense_key_string(sense->sense_key)); + pkt_err(pd, "%*ph - sense %02x.%02x.%02x (%s)\n", + CDROM_PACKET_SIZE, cgc->cmd, + sense->sense_key, sense->asc, sense->ascq, + sense_key_string(sense->sense_key)); else - pr_err("%*ph - no sense\n", CDROM_PACKET_SIZE, cgc->cmd); + pkt_err(pd, "%*ph - no sense\n", CDROM_PACKET_SIZE, cgc->cmd); } /* @@ -808,7 +809,7 @@ static noinline_for_stack int pkt_set_speed(struct pktcdvd_device *pd, cgc.cmd[5] = write_speed & 0xff; if ((ret = pkt_generic_packet(pd, &cgc))) - pkt_dump_sense(&cgc); + pkt_dump_sense(pd, &cgc); return ret; } @@ -1700,7 +1701,7 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) init_cdrom_command(&cgc, buffer, sizeof(*wp), CGC_DATA_READ); cgc.sense = &sense; if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0))) { - pkt_dump_sense(&cgc); + pkt_dump_sense(pd, &cgc); return ret; } @@ -1715,7 +1716,7 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) init_cdrom_command(&cgc, buffer, size, CGC_DATA_READ); cgc.sense = &sense; if ((ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0))) { - pkt_dump_sense(&cgc); + pkt_dump_sense(pd, &cgc); return ret; } @@ -1757,7 +1758,7 @@ static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd) cgc.buflen = cgc.cmd[8] = size; if ((ret = pkt_mode_select(pd, &cgc))) { - pkt_dump_sense(&cgc); + pkt_dump_sense(pd, &cgc); return ret; } @@ -1969,7 +1970,7 @@ static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd, ret = pkt_mode_select(pd, &cgc); if (ret) { pkt_err(pd, "write caching control failed\n"); - pkt_dump_sense(&cgc); + pkt_dump_sense(pd, &cgc); } else if (!ret && set) pkt_notice(pd, "enabled write caching\n"); return ret; @@ -2007,7 +2008,7 @@ static noinline_for_stack int pkt_get_max_speed(struct pktcdvd_device *pd, sizeof(struct mode_page_header); ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0); if (ret) { - pkt_dump_sense(&cgc); + pkt_dump_sense(pd, &cgc); return ret; } } @@ -2066,7 +2067,7 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd, cgc.cmd[8] = 2; ret = pkt_generic_packet(pd, &cgc); if (ret) { - pkt_dump_sense(&cgc); + pkt_dump_sense(pd, &cgc); return ret; } size = ((unsigned int) buf[0]<<8) + buf[1] + 2; @@ -2081,7 +2082,7 @@ static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd, cgc.cmd[8] = size; ret = pkt_generic_packet(pd, &cgc); if (ret) { - pkt_dump_sense(&cgc); + pkt_dump_sense(pd, &cgc); return ret; } @@ -2136,7 +2137,7 @@ static noinline_for_stack int pkt_perform_opc(struct pktcdvd_device *pd) cgc.cmd[0] = GPCMD_SEND_OPC; cgc.cmd[1] = 1; if ((ret = pkt_generic_packet(pd, &cgc))) - pkt_dump_sense(&cgc); + pkt_dump_sense(pd, &cgc); return ret; } From 666dc7c90ab2531f7f0f13ef21afab8cd61478a9 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 11 Sep 2013 14:25:59 -0700 Subject: [PATCH 276/303] pktcdvd: fix defective misuses of pkt_ Fix thinkos where pkt_ needs a valid pktcdvd_device * and the pointer is known to be NULL. Signed-off-by: Joe Perches Reported-by: Dan Carpenter (go smatch!) 
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/block/pktcdvd.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 29a5194ab147..56188475cfd3 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2360,8 +2360,8 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio)
 	pd = q->queuedata;
 	if (!pd) {
-		pkt_err(pd, "%s incorrect request queue\n",
-			bdevname(bio->bi_bdev, b));
+		pr_err("%s incorrect request queue\n",
+		       bdevname(bio->bi_bdev, b));
 		goto end_io;
 	}
@@ -2841,7 +2841,7 @@ static int pkt_remove_dev(dev_t pkt_dev)
 			break;
 	}
 	if (idx == MAX_WRITERS) {
-		pkt_dbg(1, pd, "dev not setup\n");
+		pr_debug("dev not setup\n");
 		ret = -ENXIO;
 		goto out;
 	}

From b67fb086f38c67c0b940d9c2661f14b44c39e67a Mon Sep 17 00:00:00 2001
From: Jingoo Han
Date: Wed, 11 Sep 2013 14:26:00 -0700
Subject: [PATCH 277/303] drivers/pps/clients/pps-gpio.c: remove unnecessary
 platform_set_drvdata()

The driver core clears the driver data to NULL after device_release or
on probe failure. Thus, there is no need to manually clear the device
driver data to NULL.

Signed-off-by: Jingoo Han
Cc: Rodolfo Giometti
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/pps/clients/pps-gpio.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/pps/clients/pps-gpio.c b/drivers/pps/clients/pps-gpio.c
index eae0eda9ff39..9966124ad988 100644
--- a/drivers/pps/clients/pps-gpio.c
+++ b/drivers/pps/clients/pps-gpio.c
@@ -184,7 +184,6 @@ static int pps_gpio_remove(struct platform_device *pdev)
 {
 	struct pps_gpio_device_data *data = platform_get_drvdata(pdev);
 
-	platform_set_drvdata(pdev, NULL);
 	pps_unregister_source(data->pps);
 	dev_info(&pdev->dev, "removed IRQ %d as PPS source\n", data->irq);
 	return 0;

From ccf5a04f70cc2f64c145727de0f049cc67c43ece Mon Sep 17 00:00:00 2001
From: Jingoo Han
Date: Wed, 11 Sep 2013 14:26:01 -0700
Subject: [PATCH 278/303] drivers/memstick/host/rtsx_pci_ms.c: remove
 unnecessary platform_set_drvdata()

The driver core clears the driver data to NULL after device_release or
on probe failure. Thus, there is no need to manually clear the device
driver data to NULL.

Signed-off-by: Jingoo Han
Cc: Maxim Levitsky
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/memstick/host/rtsx_pci_ms.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/memstick/host/rtsx_pci_ms.c b/drivers/memstick/host/rtsx_pci_ms.c
index cf8bd727dfc7..25f8f93decb6 100644
--- a/drivers/memstick/host/rtsx_pci_ms.c
+++ b/drivers/memstick/host/rtsx_pci_ms.c
@@ -612,8 +612,6 @@ static int rtsx_pci_ms_drv_remove(struct platform_device *pdev)
 	memstick_remove_host(msh);
 	memstick_free_host(msh);
 
-	platform_set_drvdata(pdev, NULL);
-
 	dev_dbg(&(pdev->dev),
 		": Realtek PCI-E Memstick controller has been removed\n");

From 0ab30494bc4f3bc1ea4659b7c5d97c5218554a63 Mon Sep 17 00:00:00 2001
From: Maxim Levitsky
Date: Wed, 11 Sep 2013 14:26:02 -0700
Subject: [PATCH 279/303] memstick: add support for legacy memorysticks

Based partially on MS standard spec quotes from Alex Dubov.

As with any code that works with user data, this driver is not
recommended for writing cards that contain valuable data. It does,
however, try its best to avoid data corruption and possible damage to
the card.

Tested on an MS DUO 64 MB card with a Ricoh R592 card reader.
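As a usage sketch (illustrative only; 'src' and the sizes are made up),
the scatterlist helper msb_sg_copy() added below maps a window of an
existing chain into a destination list:

	struct scatterlist dst[2];
	size_t copied;

	sg_init_table(dst, 2);
	/* map 512 bytes, starting 256 bytes into 'src', onto 'dst' */
	copied = msb_sg_copy(src, dst, 2, 256, 512);

'copied' reports how many bytes were actually mapped into 'dst'.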
Signed-off-by: Maxim Levitsky
Cc: Valdis Kletnieks
Cc: Jens Axboe
Cc: Alex Dubov
Cc: Tejun Heo
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 MAINTAINERS                      |    5 +
 drivers/memstick/core/Kconfig    |   12 +
 drivers/memstick/core/Makefile   |    2 +-
 drivers/memstick/core/ms_block.c | 2385 ++++++++++++++++++++++++++++++
 drivers/memstick/core/ms_block.h |  290 ++++
 5 files changed, 2693 insertions(+), 1 deletion(-)
 create mode 100644 drivers/memstick/core/ms_block.c
 create mode 100644 drivers/memstick/core/ms_block.h

diff --git a/MAINTAINERS b/MAINTAINERS
index f8c41ae6c9a4..55969f1f626f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7749,6 +7749,11 @@ W: http://tifmxx.berlios.de/
 S:	Maintained
 F:	drivers/memstick/host/tifm_ms.c
 
+SONY MEMORYSTICK STANDARD SUPPORT
+M:	Maxim Levitsky
+S:	Maintained
+F:	drivers/memstick/core/ms_block.*
+
 SOUND
 M:	Jaroslav Kysela
 M:	Takashi Iwai

diff --git a/drivers/memstick/core/Kconfig b/drivers/memstick/core/Kconfig
index 95f1814b5368..1d389491d5fd 100644
--- a/drivers/memstick/core/Kconfig
+++ b/drivers/memstick/core/Kconfig
@@ -24,3 +24,15 @@ config MSPRO_BLOCK
 	  support. This provides a block device driver, which you can use
 	  to mount the filesystem. Almost everyone wishing MemoryStick
 	  support should say Y or M here.
+
+config MS_BLOCK
+	tristate "MemoryStick Standard device driver"
+	depends on BLOCK
+	help
+	  Say Y here to enable the MemoryStick Standard device driver
+	  support. This provides a block device driver, which you can use
+	  to mount the filesystem.
+	  This driver works with old (bulky) MemoryStick and MemoryStick Duo
+	  but not PRO. Say Y if you have such a card.
+	  The driver is new and not yet well tested, thus it can damage your
+	  card (even permanently).

diff --git a/drivers/memstick/core/Makefile b/drivers/memstick/core/Makefile
index ecd029937738..0d7f90c0ff25 100644
--- a/drivers/memstick/core/Makefile
+++ b/drivers/memstick/core/Makefile
@@ -3,5 +3,5 @@
 #
 
 obj-$(CONFIG_MEMSTICK)		+= memstick.o
-
+obj-$(CONFIG_MS_BLOCK)		+= ms_block.o
 obj-$(CONFIG_MSPRO_BLOCK)	+= mspro_block.o

diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c
new file mode 100644
index 000000000000..08e70232062f
--- /dev/null
+++ b/drivers/memstick/core/ms_block.c
@@ -0,0 +1,2385 @@
+/*
+ * ms_block.c - Sony MemoryStick (legacy) storage support
+ *
+ * Copyright (C) 2013 Maxim Levitsky
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Minor portions of the driver were copied from mspro_block.c which is
+ * Copyright (C) 2007 Alex Dubov
+ *
+ */
+#define DRIVER_NAME "ms_block"
+#define pr_fmt(fmt) DRIVER_NAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/blkdev.h>
+#include <linux/memstick.h>
+#include <linux/idr.h>
+#include <linux/hdreg.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/bitmap.h>
+#include <linux/scatterlist.h>
+#include <linux/jiffies.h>
+#include <linux/workqueue.h>
+#include <linux/mutex.h>
+#include "ms_block.h"
+
+static int debug;
+static int cache_flush_timeout = 1000;
+static bool verify_writes;
+
+#define dbg(format, ...) \
+	do { \
+		if (debug) \
+			pr_err(format "\n", ##__VA_ARGS__); \
+	} while (0)
+
+#define dbg_verbose(format, ...) \
+	do { \
+		if (debug > 1) \
+			pr_err(format "\n", ##__VA_ARGS__); \
+	} while (0)
+
+static int h_msb_default_bad(struct memstick_dev *card,
+			     struct memstick_request **mrq);
+
+/*
+ * Copies a section of 'sg_from' starting at offset 'offset' and with length
+ * 'len' to another scatterlist of to_nents entries
+ */
+static size_t msb_sg_copy(struct scatterlist *sg_from,
+	struct scatterlist *sg_to, int to_nents, size_t offset, size_t len)
+{
+	size_t copied = 0;
+
+	while (offset > 0) {
+		if (offset >= sg_from->length) {
+			if (sg_is_last(sg_from))
+				return 0;
+
+			offset -= sg_from->length;
+			sg_from = sg_next(sg_from);
+			continue;
+		}
+
+		copied = min(len, sg_from->length - offset);
+		sg_set_page(sg_to, sg_page(sg_from),
+			copied, sg_from->offset + offset);
+
+		len -= copied;
+		offset = 0;
+
+		if (sg_is_last(sg_from) || !len)
+			goto out;
+
+		sg_to = sg_next(sg_to);
+		to_nents--;
+		sg_from = sg_next(sg_from);
+	}
+
+	while (len > sg_from->length && to_nents--) {
+		len -= sg_from->length;
+		copied += sg_from->length;
+
+		sg_set_page(sg_to, sg_page(sg_from),
+			sg_from->length, sg_from->offset);
+
+		if (sg_is_last(sg_from) || !len)
+			goto out;
+
+		sg_from = sg_next(sg_from);
+		sg_to = sg_next(sg_to);
+	}
+
+	if (len && to_nents) {
+		sg_set_page(sg_to, sg_page(sg_from), len, sg_from->offset);
+		copied += len;
+	}
+out:
+	sg_mark_end(sg_to);
+	return copied;
+}
+
+/*
+ * Compares a section of 'sg' starting at offset 'offset' and with length
+ * 'len' to a linear buffer of length 'len' at address 'buffer'
+ * Returns 0 if equal and -1 otherwise
+ */
+static int msb_sg_compare_to_buffer(struct scatterlist *sg,
+	size_t offset, u8 *buffer, size_t len)
+{
+	int retval = 0, cmplen;
+	struct sg_mapping_iter miter;
+
+	sg_miter_start(&miter, sg, sg_nents(sg),
+			SG_MITER_ATOMIC | SG_MITER_FROM_SG);
+
+	while (sg_miter_next(&miter) && len > 0) {
+		if (offset >= miter.length) {
+			offset -= miter.length;
+			continue;
+		}
+
+		cmplen = min(miter.length - offset, len);
+		retval = memcmp(miter.addr + offset, buffer, cmplen) ? -1 : 0;
+		if (retval)
+			break;
+
+		buffer += cmplen;
+		len -= cmplen;
+		offset = 0;
+	}
+
+	if (!retval && len)
+		retval = -1;
+
+	sg_miter_stop(&miter);
+	return retval;
+}
+
+
+/* Get the zone in which the block with logical address 'lba' lives.
+ * Flash is broken into zones.
+ * Each zone consists of 512 eraseblocks; the first zone holds 494
+ * logical blocks and every following zone holds 496.
+ * Therefore zone #0 hosts blocks 0-493, zone #1 blocks 494-989, etc...
+*/
+static int msb_get_zone_from_lba(int lba)
+{
+	if (lba < 494)
+		return 0;
+	return ((lba - 494) / 496) + 1;
+}
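+
+/*
+ * Worked example for the zoning rules above: lba 493 is the last block
+ * of zone 0; lba 494 maps to ((494 - 494) / 496) + 1 = zone 1; and
+ * lba 990 maps to ((990 - 494) / 496) + 1 = zone 2.
+ */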
+
+/* Get the zone of a physical block. Trivial */
+static int msb_get_zone_from_pba(int pba)
+{
+	return pba / MS_BLOCKS_IN_ZONE;
+}
+
+/* Debug test to validate free block counts */
+static int msb_validate_used_block_bitmap(struct msb_data *msb)
+{
+	int total_free_blocks = 0;
+	int i;
+
+	if (!debug)
+		return 0;
+
+	for (i = 0; i < msb->zone_count; i++)
+		total_free_blocks += msb->free_block_count[i];
+
+	if (msb->block_count - bitmap_weight(msb->used_blocks_bitmap,
+					msb->block_count) == total_free_blocks)
+		return 0;
+
+	pr_err("BUG: free block counts don't match the bitmap");
+	msb->read_only = true;
+	return -EINVAL;
+}
+
+/* Mark a physical block as used */
+static void msb_mark_block_used(struct msb_data *msb, int pba)
+{
+	int zone = msb_get_zone_from_pba(pba);
+
+	if (test_bit(pba, msb->used_blocks_bitmap)) {
+		pr_err("BUG: attempt to mark already used pba %d as used",
+			pba);
+		msb->read_only = true;
+		return;
+	}
+
+	if (msb_validate_used_block_bitmap(msb))
+		return;
+
+	/* No races because all IO is single threaded */
+	__set_bit(pba, msb->used_blocks_bitmap);
+	msb->free_block_count[zone]--;
+}
+
+/* Mark a physical block as free */
+static void msb_mark_block_unused(struct msb_data *msb, int pba)
+{
+	int zone = msb_get_zone_from_pba(pba);
+
+	if (!test_bit(pba, msb->used_blocks_bitmap)) {
+		pr_err("BUG: attempt to mark already unused pba %d as unused",
+			pba);
+		msb->read_only = true;
+		return;
+	}
+
+	if (msb_validate_used_block_bitmap(msb))
+		return;
+
+	/* No races because all IO is single threaded */
+	__clear_bit(pba, msb->used_blocks_bitmap);
+	msb->free_block_count[zone]++;
+}
+
+/* Invalidate current register window */
+static void msb_invalidate_reg_window(struct msb_data *msb)
+{
+	msb->reg_addr.w_offset = offsetof(struct ms_register, id);
+	msb->reg_addr.w_length = sizeof(struct ms_id_register);
+	msb->reg_addr.r_offset = offsetof(struct ms_register, id);
+	msb->reg_addr.r_length = sizeof(struct ms_id_register);
+	msb->addr_valid = false;
+}
+
+/* Start a state machine */
+static int msb_run_state_machine(struct msb_data *msb, int (*state_func)
+		(struct memstick_dev *card, struct memstick_request **req))
+{
+	struct memstick_dev *card = msb->card;
+
+	WARN_ON(msb->state != -1);
+	msb->int_polling = false;
+	msb->state = 0;
+	msb->exit_error = 0;
+
+	memset(&card->current_mrq, 0, sizeof(card->current_mrq));
+
+	card->next_request = state_func;
+	memstick_new_req(card->host);
+	wait_for_completion(&card->mrq_complete);
+
+	WARN_ON(msb->state != -1);
+	return msb->exit_error;
+}
+
+/* State machine handlers call this to exit */
+static int msb_exit_state_machine(struct msb_data *msb, int error)
+{
+	WARN_ON(msb->state == -1);
+
+	msb->state = -1;
+	msb->exit_error = error;
+	msb->card->next_request = h_msb_default_bad;
+
+	/* Invalidate reg window on errors */
+	if (error)
+		msb_invalidate_reg_window(msb);
+
+	complete(&msb->card->mrq_complete);
+	return -ENXIO;
+}
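+
+/*
+ * Typical use of the two helpers above, with one of the h_msb_*
+ * handlers defined below:
+ *
+ *	error = msb_run_state_machine(msb, h_msb_read_page);
+ *
+ * The handler advances card->current_mrq one step per invocation and
+ * finally reports its result through msb_exit_state_machine().
+ */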
+
+/* read INT register */
+static int msb_read_int_reg(struct msb_data *msb, long timeout)
+{
+	struct memstick_request *mrq = &msb->card->current_mrq;
+
+	WARN_ON(msb->state == -1);
+
+	if (!msb->int_polling) {
+		msb->int_timeout = jiffies +
+			msecs_to_jiffies(timeout == -1 ? 500 : timeout);
+		msb->int_polling = true;
+	} else if (time_after(jiffies, msb->int_timeout)) {
+		mrq->data[0] = MEMSTICK_INT_CMDNAK;
+		return 0;
+	}
+
+	if ((msb->caps & MEMSTICK_CAP_AUTO_GET_INT) &&
+				mrq->need_card_int && !mrq->error) {
+		mrq->data[0] = mrq->int_reg;
+		mrq->need_card_int = false;
+		return 0;
+	} else {
+		memstick_init_req(mrq, MS_TPC_GET_INT, NULL, 1);
+		return 1;
+	}
+}
+
+/* Read a register window */
+static int msb_read_regs(struct msb_data *msb, int offset, int len)
+{
+	struct memstick_request *req = &msb->card->current_mrq;
+
+	if (msb->reg_addr.r_offset != offset ||
+	    msb->reg_addr.r_length != len || !msb->addr_valid) {
+
+		msb->reg_addr.r_offset = offset;
+		msb->reg_addr.r_length = len;
+		msb->addr_valid = true;
+
+		memstick_init_req(req, MS_TPC_SET_RW_REG_ADRS,
+			&msb->reg_addr, sizeof(msb->reg_addr));
+		return 0;
+	}
+
+	memstick_init_req(req, MS_TPC_READ_REG, NULL, len);
+	return 1;
+}
+
+/* Write a card register */
+static int msb_write_regs(struct msb_data *msb, int offset, int len, void *buf)
+{
+	struct memstick_request *req = &msb->card->current_mrq;
+
+	if (msb->reg_addr.w_offset != offset ||
+		msb->reg_addr.w_length != len || !msb->addr_valid) {
+
+		msb->reg_addr.w_offset = offset;
+		msb->reg_addr.w_length = len;
+		msb->addr_valid = true;
+
+		memstick_init_req(req, MS_TPC_SET_RW_REG_ADRS,
+			&msb->reg_addr, sizeof(msb->reg_addr));
+		return 0;
+	}
+
+	memstick_init_req(req, MS_TPC_WRITE_REG, buf, len);
+	return 1;
+}
+
+/* Handler for the case when there is no IO to run */
+static int h_msb_default_bad(struct memstick_dev *card,
+					struct memstick_request **mrq)
+{
+	return -ENXIO;
+}
+
+/*
+ * This function is a handler for reads of one page from the device.
+ * Writes output to msb->current_sg and takes the sector address from
+ * msb->regs.param.
+ * Can also be used to read extra data only; set the params accordingly.
+ */
+static int h_msb_read_page(struct memstick_dev *card,
+					struct memstick_request **out_mrq)
+{
+	struct msb_data *msb = memstick_get_drvdata(card);
+	struct memstick_request *mrq = *out_mrq = &card->current_mrq;
+	struct scatterlist sg[2];
+	u8 command, intreg;
+
+	if (mrq->error) {
+		dbg("read_page, unknown error");
+		return msb_exit_state_machine(msb, mrq->error);
+	}
+again:
+	switch (msb->state) {
+	case MSB_RP_SEND_BLOCK_ADDRESS:
+		/* msb_write_regs sometimes "fails" because it needs to update
+		   the reg window, and thus it returns a request for that.
+		   Then we stay in this state and retry */
+		if (!msb_write_regs(msb,
+			offsetof(struct ms_register, param),
+			sizeof(struct ms_param_register),
+			(unsigned char *)&msb->regs.param))
+			return 0;
+
+		msb->state = MSB_RP_SEND_READ_COMMAND;
+		return 0;
+
+	case MSB_RP_SEND_READ_COMMAND:
+		command = MS_CMD_BLOCK_READ;
+		memstick_init_req(mrq, MS_TPC_SET_CMD, &command, 1);
+		msb->state = MSB_RP_SEND_INT_REQ;
+		return 0;
+
+	case MSB_RP_SEND_INT_REQ:
+		msb->state = MSB_RP_RECEIVE_INT_REQ_RESULT;
+		/* If we don't actually need to send the INT read request
+		   (only in serial mode), just fall through */
+		if (msb_read_int_reg(msb, -1))
+			return 0;
+		/* fallthrough */
+
+	case MSB_RP_RECEIVE_INT_REQ_RESULT:
+		intreg = mrq->data[0];
+		msb->regs.status.interrupt = intreg;
+
+		if (intreg & MEMSTICK_INT_CMDNAK)
+			return msb_exit_state_machine(msb, -EIO);
+
+		if (!(intreg & MEMSTICK_INT_CED)) {
+			msb->state = MSB_RP_SEND_INT_REQ;
+			goto again;
+		}
+
+		msb->int_polling = false;
+		msb->state = (intreg & MEMSTICK_INT_ERR) ?
+			MSB_RP_SEND_READ_STATUS_REG : MSB_RP_SEND_OOB_READ;
+		goto again;
+
+	case MSB_RP_SEND_READ_STATUS_REG:
+		/* read the status register to understand source of the
+		   INT_ERR */
+		if (!msb_read_regs(msb,
+			offsetof(struct ms_register, status),
+			sizeof(struct ms_status_register)))
+			return 0;
+
+		msb->state = MSB_RP_RECIVE_STATUS_REG;
+		return 0;
+
+	case MSB_RP_RECIVE_STATUS_REG:
+		msb->regs.status = *(struct ms_status_register *)mrq->data;
+		msb->state = MSB_RP_SEND_OOB_READ;
+		/* fallthrough */
+
+	case MSB_RP_SEND_OOB_READ:
+		if (!msb_read_regs(msb,
+			offsetof(struct ms_register, extra_data),
+			sizeof(struct ms_extra_data_register)))
+			return 0;
+
+		msb->state = MSB_RP_RECEIVE_OOB_READ;
+		return 0;
+
+	case MSB_RP_RECEIVE_OOB_READ:
+		msb->regs.extra_data =
+			*(struct ms_extra_data_register *) mrq->data;
+		msb->state = MSB_RP_SEND_READ_DATA;
+		/* fallthrough */
+
+	case MSB_RP_SEND_READ_DATA:
+		/* Skip that state if we only read the oob */
+		if (msb->regs.param.cp == MEMSTICK_CP_EXTRA) {
+			msb->state = MSB_RP_RECEIVE_READ_DATA;
+			goto again;
+		}
+
+		sg_init_table(sg, ARRAY_SIZE(sg));
+		msb_sg_copy(msb->current_sg, sg, ARRAY_SIZE(sg),
+			msb->current_sg_offset,
+			msb->page_size);
+
+		memstick_init_req_sg(mrq, MS_TPC_READ_LONG_DATA, sg);
+		msb->state = MSB_RP_RECEIVE_READ_DATA;
+		return 0;
+
+	case MSB_RP_RECEIVE_READ_DATA:
+		if (!(msb->regs.status.interrupt & MEMSTICK_INT_ERR)) {
+			msb->current_sg_offset += msb->page_size;
+			return msb_exit_state_machine(msb, 0);
+		}
+
+		if (msb->regs.status.status1 & MEMSTICK_UNCORR_ERROR) {
+			dbg("read_page: uncorrectable error");
+			return msb_exit_state_machine(msb, -EBADMSG);
+		}
+
+		if (msb->regs.status.status1 & MEMSTICK_CORR_ERROR) {
+			dbg("read_page: correctable error");
+			msb->current_sg_offset += msb->page_size;
+			return msb_exit_state_machine(msb, -EUCLEAN);
+		} else {
+			dbg("read_page: INT error, but no status error bits");
+			return msb_exit_state_machine(msb, -EIO);
+		}
+	}
+
+	BUG();
+}
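+
+/*
+ * Normal flow of the read handler above: SEND_BLOCK_ADDRESS ->
+ * SEND_READ_COMMAND -> SEND_INT_REQ -> RECEIVE_INT_REQ_RESULT ->
+ * SEND_OOB_READ -> RECEIVE_OOB_READ -> SEND_READ_DATA ->
+ * RECEIVE_READ_DATA. On MEMSTICK_INT_ERR the status register is read
+ * first, so that the right error code (-EBADMSG vs -EUCLEAN) can be
+ * picked.
+ */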
+
+/*
+ * Handler for writes of exactly one block.
+ * Takes the address from msb->regs.param and writes the accompanying
+ * extra data from msb->regs.extra_data.
+ * Returns -EBADMSG if the write fails due to an uncorrectable error, or
+ * -EIO if the device refuses to take the command or something else goes
+ * wrong.
+ */
+static int h_msb_write_block(struct memstick_dev *card,
+					struct memstick_request **out_mrq)
+{
+	struct msb_data *msb = memstick_get_drvdata(card);
+	struct memstick_request *mrq = *out_mrq = &card->current_mrq;
+	struct scatterlist sg[2];
+	u8 intreg, command;
+
+	if (mrq->error)
+		return msb_exit_state_machine(msb, mrq->error);
+
+again:
+	switch (msb->state) {
+
+	/* HACK: JMicron handling of TPCs between 8 and
+	 * sizeof(memstick_request.data) is broken due to a hardware
+	 * bug in PIO mode that is used for these TPCs.
+	 * Therefore split the write.
+	 */
+
+	case MSB_WB_SEND_WRITE_PARAMS:
+		if (!msb_write_regs(msb,
+			offsetof(struct ms_register, param),
+			sizeof(struct ms_param_register),
+			&msb->regs.param))
+			return 0;
+
+		msb->state = MSB_WB_SEND_WRITE_OOB;
+		return 0;
+
+	case MSB_WB_SEND_WRITE_OOB:
+		if (!msb_write_regs(msb,
+			offsetof(struct ms_register, extra_data),
+			sizeof(struct ms_extra_data_register),
+			&msb->regs.extra_data))
+			return 0;
+		msb->state = MSB_WB_SEND_WRITE_COMMAND;
+		return 0;
+
+
+	case MSB_WB_SEND_WRITE_COMMAND:
+		command = MS_CMD_BLOCK_WRITE;
+		memstick_init_req(mrq, MS_TPC_SET_CMD, &command, 1);
+		msb->state = MSB_WB_SEND_INT_REQ;
+		return 0;
+
+	case MSB_WB_SEND_INT_REQ:
+		msb->state = MSB_WB_RECEIVE_INT_REQ;
+		if (msb_read_int_reg(msb, -1))
+			return 0;
+		/* fallthrough */
+
+	case MSB_WB_RECEIVE_INT_REQ:
+		intreg = mrq->data[0];
+		msb->regs.status.interrupt = intreg;
+
+		/* errors mean out of here, and fast... */
+		if (intreg & (MEMSTICK_INT_CMDNAK))
+			return msb_exit_state_machine(msb, -EIO);
+
+		if (intreg & MEMSTICK_INT_ERR)
+			return msb_exit_state_machine(msb, -EBADMSG);
+
+
+		/* for the last page we need to poll CED */
+		if (msb->current_page == msb->pages_in_block) {
+			if (intreg & MEMSTICK_INT_CED)
+				return msb_exit_state_machine(msb, 0);
+			msb->state = MSB_WB_SEND_INT_REQ;
+			goto again;
+
+		}
+
+		/* for a non-last page we need BREQ before writing the next
+		   chunk */
+		if (!(intreg & MEMSTICK_INT_BREQ)) {
+			msb->state = MSB_WB_SEND_INT_REQ;
+			goto again;
+		}
+
+		msb->int_polling = false;
+		msb->state = MSB_WB_SEND_WRITE_DATA;
+		/* fallthrough */
+
+	case MSB_WB_SEND_WRITE_DATA:
+		sg_init_table(sg, ARRAY_SIZE(sg));
+
+		if (msb_sg_copy(msb->current_sg, sg, ARRAY_SIZE(sg),
+			msb->current_sg_offset,
+			msb->page_size) < msb->page_size)
+			return msb_exit_state_machine(msb, -EIO);
+
+		memstick_init_req_sg(mrq, MS_TPC_WRITE_LONG_DATA, sg);
+		mrq->need_card_int = 1;
+		msb->state = MSB_WB_RECEIVE_WRITE_CONFIRMATION;
+		return 0;
+
+	case MSB_WB_RECEIVE_WRITE_CONFIRMATION:
+		msb->current_page++;
+		msb->current_sg_offset += msb->page_size;
+		msb->state = MSB_WB_SEND_INT_REQ;
+		goto again;
+	default:
+		BUG();
+	}
+
+	return 0;
+}
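+
+/*
+ * The write handler above feeds the card one page per loop: each
+ * MSB_WB_RECEIVE_WRITE_CONFIRMATION advances current_page and
+ * current_sg_offset, and CED is polled only once the last page of the
+ * block has been sent.
+ */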
+
+/*
+ * This function is used to send simple IO requests to the device that
+ * consist of a register write + command
+ */
+static int h_msb_send_command(struct memstick_dev *card,
+					struct memstick_request **out_mrq)
+{
+	struct msb_data *msb = memstick_get_drvdata(card);
+	struct memstick_request *mrq = *out_mrq = &card->current_mrq;
+	u8 intreg;
+
+	if (mrq->error) {
+		dbg("send_command: unknown error");
+		return msb_exit_state_machine(msb, mrq->error);
+	}
+again:
+	switch (msb->state) {
+
+	/* HACK: see h_msb_write_block */
+	case MSB_SC_SEND_WRITE_PARAMS: /* write param register*/
+		if (!msb_write_regs(msb,
+			offsetof(struct ms_register, param),
+			sizeof(struct ms_param_register),
+			&msb->regs.param))
+			return 0;
+		msb->state = MSB_SC_SEND_WRITE_OOB;
+		return 0;
+
+	case MSB_SC_SEND_WRITE_OOB:
+		if (!msb->command_need_oob) {
+			msb->state = MSB_SC_SEND_COMMAND;
+			goto again;
+		}
+
+		if (!msb_write_regs(msb,
+			offsetof(struct ms_register, extra_data),
+			sizeof(struct ms_extra_data_register),
+			&msb->regs.extra_data))
+			return 0;
+
+		msb->state = MSB_SC_SEND_COMMAND;
+		return 0;
+
+	case MSB_SC_SEND_COMMAND:
+		memstick_init_req(mrq, MS_TPC_SET_CMD, &msb->command_value, 1);
+		msb->state = MSB_SC_SEND_INT_REQ;
+		return 0;
+
+	case MSB_SC_SEND_INT_REQ:
+		msb->state = MSB_SC_RECEIVE_INT_REQ;
+		if (msb_read_int_reg(msb, -1))
+			return 0;
+		/* fallthrough */
+
+	case MSB_SC_RECEIVE_INT_REQ:
+		intreg = mrq->data[0];
+
+		if (intreg & MEMSTICK_INT_CMDNAK)
+			return msb_exit_state_machine(msb, -EIO);
+		if (intreg & MEMSTICK_INT_ERR)
+			return msb_exit_state_machine(msb, -EBADMSG);
+
+		if (!(intreg & MEMSTICK_INT_CED)) {
+			msb->state = MSB_SC_SEND_INT_REQ;
+			goto again;
+		}
+
+		return msb_exit_state_machine(msb, 0);
+	}
+
+	BUG();
+}
+
+/* Small handler for card reset */
+static int h_msb_reset(struct memstick_dev *card,
+					struct memstick_request **out_mrq)
+{
+	u8 command = MS_CMD_RESET;
+	struct msb_data *msb = memstick_get_drvdata(card);
+	struct memstick_request *mrq = *out_mrq = &card->current_mrq;
+
+	if (mrq->error)
+		return msb_exit_state_machine(msb, mrq->error);
+
+	switch (msb->state) {
+	case MSB_RS_SEND:
+		memstick_init_req(mrq, MS_TPC_SET_CMD, &command, 1);
+		mrq->need_card_int = 0;
+		msb->state = MSB_RS_CONFIRM;
+		return 0;
+	case MSB_RS_CONFIRM:
+		return msb_exit_state_machine(msb, 0);
+	}
+	BUG();
+}
+
+/* This handler is used to do the serial -> parallel switch */
+static int h_msb_parallel_switch(struct memstick_dev *card,
+					struct memstick_request **out_mrq)
+{
+	struct msb_data *msb = memstick_get_drvdata(card);
+	struct memstick_request *mrq = *out_mrq = &card->current_mrq;
+	struct memstick_host *host = card->host;
+
+	if (mrq->error) {
+		dbg("parallel_switch: error");
+		msb->regs.param.system &= ~MEMSTICK_SYS_PAM;
+		return msb_exit_state_machine(msb, mrq->error);
+	}
+
+	switch (msb->state) {
+	case MSB_PS_SEND_SWITCH_COMMAND:
+		/* Set the parallel interface on memstick side */
+		msb->regs.param.system |= MEMSTICK_SYS_PAM;
+
+		if (!msb_write_regs(msb,
+			offsetof(struct ms_register, param),
+			1,
+			(unsigned char *)&msb->regs.param))
+			return 0;
+
+		msb->state = MSB_PS_SWICH_HOST;
+		return 0;
+
+	case MSB_PS_SWICH_HOST:
+		/* Set the parallel interface on our side + send a dummy
+		   request to see if the card responds */
+		host->set_param(host, MEMSTICK_INTERFACE, MEMSTICK_PAR4);
+		memstick_init_req(mrq, MS_TPC_GET_INT, NULL, 1);
+		msb->state = MSB_PS_CONFIRM;
+		return 0;
+
+	case MSB_PS_CONFIRM:
+		return msb_exit_state_machine(msb, 0);
+	}
+
+	BUG();
+}
+
+static int msb_switch_to_parallel(struct msb_data *msb);
+
+/* Reset the card, to guard against hw errors being treated as bad blocks */
+static int msb_reset(struct msb_data *msb, bool full)
+{
+
+	bool was_parallel = msb->regs.param.system & MEMSTICK_SYS_PAM;
+	struct memstick_dev *card = msb->card;
+	struct memstick_host *host = card->host;
+	int error;
+
+	/* Reset the card */
+	msb->regs.param.system = MEMSTICK_SYS_BAMD;
+
+	if (full) {
+		error = host->set_param(host,
+					MEMSTICK_POWER, MEMSTICK_POWER_OFF);
+		if (error)
+			goto out_error;
+
+		msb_invalidate_reg_window(msb);
+
+		error = host->set_param(host,
+					MEMSTICK_POWER, MEMSTICK_POWER_ON);
+		if (error)
+			goto out_error;
+
+		error = host->set_param(host,
+					MEMSTICK_INTERFACE, MEMSTICK_SERIAL);
+		if (error) {
+out_error:
+			dbg("Failed to reset the host controller");
+			msb->read_only = true;
+			return -EFAULT;
+		}
+	}
+
+	error = msb_run_state_machine(msb, h_msb_reset);
+	if (error) {
+		dbg("Failed to reset the card");
+		msb->read_only = true;
+		return -ENODEV;
+	}
+
+	/* Set parallel mode */
+	if (was_parallel)
+		msb_switch_to_parallel(msb);
+	return 0;
+}
+
+/* Attempts to switch interface to parallel mode */
+static int msb_switch_to_parallel(struct msb_data *msb)
+{
+	int error;
+
+	error = msb_run_state_machine(msb, h_msb_parallel_switch);
+	if (error) {
+		pr_err("Switch to parallel failed");
+		msb->regs.param.system &= ~MEMSTICK_SYS_PAM;
+		msb_reset(msb, true);
+		return -EFAULT;
+	}
+
+	msb->caps |= MEMSTICK_CAP_AUTO_GET_INT;
+	return 0;
+}
+
+/* Changes overwrite flag on a page */
+static int msb_set_overwrite_flag(struct msb_data *msb,
+						u16 pba, u8 page, u8 flag)
+{
+	if (msb->read_only)
+		return -EROFS;
+
+	msb->regs.param.block_address = cpu_to_be16(pba);
+	msb->regs.param.page_address = page;
+	msb->regs.param.cp = MEMSTICK_CP_OVERWRITE;
+	msb->regs.extra_data.overwrite_flag = flag;
+	msb->command_value = MS_CMD_BLOCK_WRITE;
+	msb->command_need_oob = true;
+
+	dbg_verbose("changing overwrite flag to %02x for sector %d, page %d",
+							flag, pba, page);
+	return msb_run_state_machine(msb, h_msb_send_command);
+}
+
+static int msb_mark_bad(struct msb_data *msb, int pba)
+{
+	pr_notice("marking pba %d as bad", pba);
+	msb_reset(msb, true);
+	return msb_set_overwrite_flag(
+			msb, pba, 0, 0xFF & ~MEMSTICK_OVERWRITE_BKST);
+}
+
+static int msb_mark_page_bad(struct msb_data *msb, int pba, int page)
+{
+	dbg("marking page %d of pba %d as bad", page, pba);
+	msb_reset(msb, true);
+	return msb_set_overwrite_flag(msb,
+		pba, page, ~MEMSTICK_OVERWRITE_PGST0);
+}
+
+/* Erases one physical block */
+static int msb_erase_block(struct msb_data *msb, u16 pba)
+{
+	int error, try;
+
+	if (msb->read_only)
+		return -EROFS;
+
+	dbg_verbose("erasing pba %d", pba);
+
+	for (try = 1; try < 3; try++) {
+		msb->regs.param.block_address = cpu_to_be16(pba);
+		msb->regs.param.page_address = 0;
+		msb->regs.param.cp = MEMSTICK_CP_BLOCK;
+		msb->command_value = MS_CMD_BLOCK_ERASE;
+		msb->command_need_oob = false;
+
+
+		error = msb_run_state_machine(msb, h_msb_send_command);
+		if (!error || msb_reset(msb, true))
+			break;
+	}
+
+	if (error) {
+		pr_err("erase failed, marking pba %d as bad", pba);
+		msb_mark_bad(msb, pba);
+	}
+
+	dbg_verbose("erase success, marking pba %d as unused", pba);
+	msb_mark_block_unused(msb, pba);
+	__set_bit(pba, msb->erased_blocks_bitmap);
+	return error;
+}
+
+/* Reads one page from the device */
+static int msb_read_page(struct msb_data *msb,
+	u16 pba, u8 page, struct ms_extra_data_register *extra,
+					struct scatterlist *sg,  int offset)
+{
+	int try, error;
+
+	if (pba == MS_BLOCK_INVALID) {
+		unsigned long flags;
+		struct sg_mapping_iter miter;
+		size_t len = msb->page_size;
+
returning 0xFF"); + + local_irq_save(flags); + sg_miter_start(&miter, sg, sg_nents(sg), + SG_MITER_ATOMIC | SG_MITER_TO_SG); + + while (sg_miter_next(&miter) && len > 0) { + + int chunklen; + + if (offset && offset >= miter.length) { + offset -= miter.length; + continue; + } + + chunklen = min(miter.length - offset, len); + memset(miter.addr + offset, 0xFF, chunklen); + len -= chunklen; + offset = 0; + } + + sg_miter_stop(&miter); + local_irq_restore(flags); + + if (offset) + return -EFAULT; + + if (extra) + memset(extra, 0xFF, sizeof(*extra)); + return 0; + } + + if (pba >= msb->block_count) { + pr_err("BUG: attempt to read beyond the end of the card at pba %d", pba); + return -EINVAL; + } + + for (try = 1; try < 3; try++) { + msb->regs.param.block_address = cpu_to_be16(pba); + msb->regs.param.page_address = page; + msb->regs.param.cp = MEMSTICK_CP_PAGE; + + msb->current_sg = sg; + msb->current_sg_offset = offset; + error = msb_run_state_machine(msb, h_msb_read_page); + + + if (error == -EUCLEAN) { + pr_notice("correctable error on pba %d, page %d", + pba, page); + error = 0; + } + + if (!error && extra) + *extra = msb->regs.extra_data; + + if (!error || msb_reset(msb, true)) + break; + + } + + /* Mark bad pages */ + if (error == -EBADMSG) { + pr_err("uncorrectable error on read of pba %d, page %d", + pba, page); + + if (msb->regs.extra_data.overwrite_flag & + MEMSTICK_OVERWRITE_PGST0) + msb_mark_page_bad(msb, pba, page); + return -EBADMSG; + } + + if (error) + pr_err("read of pba %d, page %d failed with error %d", + pba, page, error); + return error; +} + +/* Reads oob of page only */ +static int msb_read_oob(struct msb_data *msb, u16 pba, u16 page, + struct ms_extra_data_register *extra) +{ + int error; + + BUG_ON(!extra); + msb->regs.param.block_address = cpu_to_be16(pba); + msb->regs.param.page_address = page; + msb->regs.param.cp = MEMSTICK_CP_EXTRA; + + if (pba > msb->block_count) { + pr_err("BUG: attempt to read beyond the end of card at pba %d", pba); + return -EINVAL; + } + + error = msb_run_state_machine(msb, h_msb_read_page); + *extra = msb->regs.extra_data; + + if (error == -EUCLEAN) { + pr_notice("correctable error on pba %d, page %d", + pba, page); + return 0; + } + + return error; +} + +/* Reads a block and compares it with data contained in scatterlist orig_sg */ +static int msb_verify_block(struct msb_data *msb, u16 pba, + struct scatterlist *orig_sg, int offset) +{ + struct scatterlist sg; + int page = 0, error; + + sg_init_one(&sg, msb->block_buffer, msb->block_size); + + while (page < msb->pages_in_block) { + + error = msb_read_page(msb, pba, page, + NULL, &sg, page * msb->page_size); + if (error) + return error; + page++; + } + + if (msb_sg_compare_to_buffer(orig_sg, offset, + msb->block_buffer, msb->block_size)) + return -EIO; + return 0; +} + +/* Writes exectly one block + oob */ +static int msb_write_block(struct msb_data *msb, + u16 pba, u32 lba, struct scatterlist *sg, int offset) +{ + int error, current_try = 1; + BUG_ON(sg->length < msb->page_size); + + if (msb->read_only) + return -EROFS; + + if (pba == MS_BLOCK_INVALID) { + pr_err( + "BUG: write: attempt to write MS_BLOCK_INVALID block"); + return -EINVAL; + } + + if (pba >= msb->block_count || lba >= msb->logical_block_count) { + pr_err( + "BUG: write: attempt to write beyond the end of device"); + return -EINVAL; + } + + if (msb_get_zone_from_lba(lba) != msb_get_zone_from_pba(pba)) { + pr_err("BUG: write: lba zone mismatch"); + return -EINVAL; + } + + if (pba == msb->boot_block_locations[0] || + pba == 
+		pba == msb->boot_block_locations[1]) {
+		pr_err("BUG: write: attempt to write to boot blocks!");
+		return -EINVAL;
+	}
+
+	while (1) {
+
+		if (msb->read_only)
+			return -EROFS;
+
+		msb->regs.param.cp = MEMSTICK_CP_BLOCK;
+		msb->regs.param.page_address = 0;
+		msb->regs.param.block_address = cpu_to_be16(pba);
+
+		msb->regs.extra_data.management_flag = 0xFF;
+		msb->regs.extra_data.overwrite_flag = 0xF8;
+		msb->regs.extra_data.logical_address = cpu_to_be16(lba);
+
+		msb->current_sg = sg;
+		msb->current_sg_offset = offset;
+		msb->current_page = 0;
+
+		error = msb_run_state_machine(msb, h_msb_write_block);
+
+		/* The block we just wrote to is assumed erased, since its
+		   pba was erased. If it wasn't actually erased, the write
+		   still succeeds and merely clears the bits that were
+		   already set in the block, so verify that what we have
+		   written matches what we expect.
+		   We do trust the blocks that we erased ourselves */
+		if (!error && (verify_writes ||
+				!test_bit(pba, msb->erased_blocks_bitmap)))
+			error = msb_verify_block(msb, pba, sg, offset);
+
+		if (!error)
+			break;
+
+		if (current_try > 1 || msb_reset(msb, true))
+			break;
+
+		pr_err("write failed, trying to erase the pba %d", pba);
+		error = msb_erase_block(msb, pba);
+		if (error)
+			break;
+
+		current_try++;
+	}
+	return error;
+}
+
+/* Finds a free block for write replacement */
+static u16 msb_get_free_block(struct msb_data *msb, int zone)
+{
+	u16 pos;
+	int pba = zone * MS_BLOCKS_IN_ZONE;
+	int i;
+
+	get_random_bytes(&pos, sizeof(pos));
+
+	if (!msb->free_block_count[zone]) {
+		pr_err("NO free blocks in zone %d to use for a write (media is WORN out), switching to RO mode", zone);
+		msb->read_only = true;
+		return MS_BLOCK_INVALID;
+	}
+
+	pos %= msb->free_block_count[zone];
+
+	dbg_verbose("have %d choices for a free block, selected randomly: %d",
+		msb->free_block_count[zone], pos);
+
+	pba = find_next_zero_bit(msb->used_blocks_bitmap,
+			msb->block_count, pba);
+	for (i = 0; i < pos; ++i)
+		pba = find_next_zero_bit(msb->used_blocks_bitmap,
+			msb->block_count, pba + 1);
+
+	dbg_verbose("result of the free blocks scan: pba %d", pba);
+
+	if (pba == msb->block_count || (msb_get_zone_from_pba(pba)) != zone) {
+		pr_err("BUG: can't get a free block");
+		msb->read_only = true;
+		return MS_BLOCK_INVALID;
+	}
+
+	msb_mark_block_used(msb, pba);
+	return pba;
+}
+
+static int msb_update_block(struct msb_data *msb, u16 lba,
+	struct scatterlist *sg, int offset)
+{
+	u16 pba, new_pba;
+	int error, try;
+
+	pba = msb->lba_to_pba_table[lba];
+	dbg_verbose("start of a block update at lba  %d, pba %d", lba, pba);
+
+	if (pba != MS_BLOCK_INVALID) {
+		dbg_verbose("setting the update flag on the block");
+		msb_set_overwrite_flag(msb, pba, 0,
+				0xFF & ~MEMSTICK_OVERWRITE_UDST);
+	}
+
+	for (try = 0; try < 3; try++) {
+		new_pba = msb_get_free_block(msb,
+			msb_get_zone_from_lba(lba));
+
+		if (new_pba == MS_BLOCK_INVALID) {
+			error = -EIO;
+			goto out;
+		}
+
+		dbg_verbose("block update: writing updated block to pba %d",
+								new_pba);
+		error = msb_write_block(msb, new_pba, lba, sg, offset);
+		if (error == -EBADMSG) {
+			msb_mark_bad(msb, new_pba);
+			continue;
+		}
+
+		if (error)
+			goto out;
+
+		dbg_verbose("block update: erasing the old block");
+		msb_erase_block(msb, pba);
+		msb->lba_to_pba_table[lba] = new_pba;
+		return 0;
+	}
+out:
+	if (error) {
+		pr_err("block update error after %d tries,  switching to r/o mode", try);
+		msb->read_only = true;
+	}
+	return error;
+}
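+
+/*
+ * msb_update_block() above is a classic out-of-place flash update:
+ * program the update flag (UDST) on the old block, write the data into
+ * a freshly picked free block, then erase the old block and repoint
+ * lba_to_pba_table. If power is lost mid-update, the mount-time scan
+ * (msb_ftl_scan) uses the UDST bits to pick one of the two copies.
+ */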
+
+/* Converts endianness in the boot block for easy use */
+static void msb_fix_boot_page_endianness(struct ms_boot_page *p)
+{
+	p->header.block_id = be16_to_cpu(p->header.block_id);
+	p->header.format_reserved = be16_to_cpu(p->header.format_reserved);
+	p->entry.disabled_block.start_addr
+		= be32_to_cpu(p->entry.disabled_block.start_addr);
+	p->entry.disabled_block.data_size
+		= be32_to_cpu(p->entry.disabled_block.data_size);
+	p->entry.cis_idi.start_addr
+		= be32_to_cpu(p->entry.cis_idi.start_addr);
+	p->entry.cis_idi.data_size
+		= be32_to_cpu(p->entry.cis_idi.data_size);
+	p->attr.block_size = be16_to_cpu(p->attr.block_size);
+	p->attr.number_of_blocks = be16_to_cpu(p->attr.number_of_blocks);
+	p->attr.number_of_effective_blocks
+		= be16_to_cpu(p->attr.number_of_effective_blocks);
+	p->attr.page_size = be16_to_cpu(p->attr.page_size);
+	p->attr.memory_manufacturer_code
+		= be16_to_cpu(p->attr.memory_manufacturer_code);
+	p->attr.memory_device_code = be16_to_cpu(p->attr.memory_device_code);
+	p->attr.implemented_capacity
+		= be16_to_cpu(p->attr.implemented_capacity);
+	p->attr.controller_number = be16_to_cpu(p->attr.controller_number);
+	p->attr.controller_function = be16_to_cpu(p->attr.controller_function);
+}
+
+static int msb_read_boot_blocks(struct msb_data *msb)
+{
+	int pba = 0;
+	struct scatterlist sg;
+	struct ms_extra_data_register extra;
+	struct ms_boot_page *page;
+
+	msb->boot_block_locations[0] = MS_BLOCK_INVALID;
+	msb->boot_block_locations[1] = MS_BLOCK_INVALID;
+	msb->boot_block_count = 0;
+
+	dbg_verbose("Start of a scan for the boot blocks");
+
+	if (!msb->boot_page) {
+		page = kmalloc(sizeof(struct ms_boot_page)*2, GFP_KERNEL);
+		if (!page)
+			return -ENOMEM;
+
+		msb->boot_page = page;
+	} else
+		page = msb->boot_page;
+
+	msb->block_count = MS_BLOCK_MAX_BOOT_ADDR;
+
+	for (pba = 0; pba < MS_BLOCK_MAX_BOOT_ADDR; pba++) {
+
+		sg_init_one(&sg, page, sizeof(*page));
+		if (msb_read_page(msb, pba, 0, &extra, &sg, 0)) {
+			dbg("boot scan: can't read pba %d", pba);
+			continue;
+		}
+
+		if (extra.management_flag & MEMSTICK_MANAGEMENT_SYSFLG) {
+			dbg("management flag doesn't indicate boot block %d",
+									pba);
+			continue;
+		}
+
+		if (be16_to_cpu(page->header.block_id) != MS_BLOCK_BOOT_ID) {
+			dbg("the pba at %d doesn't contain boot block ID", pba);
+			continue;
+		}
+
+		msb_fix_boot_page_endianness(page);
+		msb->boot_block_locations[msb->boot_block_count] = pba;
+
+		page++;
+		msb->boot_block_count++;
+
+		if (msb->boot_block_count == 2)
+			break;
+	}
+
+	if (!msb->boot_block_count) {
+		pr_err("media doesn't contain master page, aborting");
+		return -EIO;
+	}
+
+	dbg_verbose("End of scan for boot blocks");
+	return 0;
+}
+
+static int msb_read_bad_block_table(struct msb_data *msb, int block_nr)
+{
+	struct ms_boot_page *boot_block;
+	struct scatterlist sg;
+	u16 *buffer = NULL;
+	int offset = 0;
+	int i, error = 0;
+	int data_size, data_offset, page, page_offset, size_to_read;
+	u16 pba;
+
+	BUG_ON(block_nr > 1);
+	boot_block = &msb->boot_page[block_nr];
+	pba = msb->boot_block_locations[block_nr];
+
+	if (msb->boot_block_locations[block_nr] == MS_BLOCK_INVALID)
+		return -EINVAL;
+
+	data_size = boot_block->entry.disabled_block.data_size;
+	data_offset = sizeof(struct ms_boot_page) +
+			boot_block->entry.disabled_block.start_addr;
+	if (!data_size)
+		return 0;
+
+	page = data_offset / msb->page_size;
+	page_offset = data_offset % msb->page_size;
+	size_to_read =
+		DIV_ROUND_UP(data_size + page_offset, msb->page_size) *
+			msb->page_size;
+
+	dbg("reading bad block of boot block at pba %d, offset %d len %d",
+		pba, data_offset, data_size);
+
+	buffer = kzalloc(size_to_read, GFP_KERNEL);
-ENOMEM; + + /* Read the buffer */ + sg_init_one(&sg, buffer, size_to_read); + + while (offset < size_to_read) { + error = msb_read_page(msb, pba, page, NULL, &sg, offset); + if (error) + goto out; + + page++; + offset += msb->page_size; + + if (page == msb->pages_in_block) { + pr_err( + "bad block table extends beyond the boot block"); + break; + } + } + + /* Process the bad block table */ + for (i = page_offset; i < data_size / sizeof(u16); i++) { + + u16 bad_block = be16_to_cpu(buffer[i]); + + if (bad_block >= msb->block_count) { + dbg("bad block table contains invalid block %d", + bad_block); + continue; + } + + if (test_bit(bad_block, msb->used_blocks_bitmap)) { + dbg("duplicate bad block %d in the table", + bad_block); + continue; + } + + dbg("block %d is marked as factory bad", bad_block); + msb_mark_block_used(msb, bad_block); + } +out: + kfree(buffer); + return error; +} + +static int msb_ftl_initialize(struct msb_data *msb) +{ + int i; + + if (msb->ftl_initialized) + return 0; + + msb->zone_count = msb->block_count / MS_BLOCKS_IN_ZONE; + msb->logical_block_count = msb->zone_count * 496 - 2; + + msb->used_blocks_bitmap = kzalloc(msb->block_count / 8, GFP_KERNEL); + msb->erased_blocks_bitmap = kzalloc(msb->block_count / 8, GFP_KERNEL); + msb->lba_to_pba_table = + kmalloc(msb->logical_block_count * sizeof(u16), GFP_KERNEL); + + if (!msb->used_blocks_bitmap || !msb->lba_to_pba_table || + !msb->erased_blocks_bitmap) { + kfree(msb->used_blocks_bitmap); + kfree(msb->lba_to_pba_table); + kfree(msb->erased_blocks_bitmap); + return -ENOMEM; + } + + for (i = 0; i < msb->zone_count; i++) + msb->free_block_count[i] = MS_BLOCKS_IN_ZONE; + + memset(msb->lba_to_pba_table, MS_BLOCK_INVALID, + msb->logical_block_count * sizeof(u16)); + + dbg("initial FTL tables created. 
Zone count = %d, Logical block count = %d",
+			msb->zone_count, msb->logical_block_count);
+
+	msb->ftl_initialized = true;
+	return 0;
+}
+
+static int msb_ftl_scan(struct msb_data *msb)
+{
+	u16 pba, lba, other_block;
+	u8 overwrite_flag, management_flag, other_overwrite_flag;
+	int error;
+	struct ms_extra_data_register extra;
+	u8 *overwrite_flags = kzalloc(msb->block_count, GFP_KERNEL);
+
+	if (!overwrite_flags)
+		return -ENOMEM;
+
+	dbg("Start of media scanning");
+	for (pba = 0; pba < msb->block_count; pba++) {
+
+		if (pba == msb->boot_block_locations[0] ||
+			pba == msb->boot_block_locations[1]) {
+			dbg_verbose("pba %05d -> [boot block]", pba);
+			msb_mark_block_used(msb, pba);
+			continue;
+		}
+
+		if (test_bit(pba, msb->used_blocks_bitmap)) {
+			dbg_verbose("pba %05d -> [factory bad]", pba);
+			continue;
+		}
+
+		memset(&extra, 0, sizeof(extra));
+		error = msb_read_oob(msb, pba, 0, &extra);
+
+		/* can't trust the page if we can't read the oob */
+		if (error == -EBADMSG) {
+			pr_notice(
+			"oob of pba %d damaged, will try to erase it", pba);
+			msb_mark_block_used(msb, pba);
+			msb_erase_block(msb, pba);
+			continue;
+		} else if (error) {
+			pr_err("unknown error %d on read of oob of pba %d - aborting",
+				error, pba);
+
+			kfree(overwrite_flags);
+			return error;
+		}
+
+		lba = be16_to_cpu(extra.logical_address);
+		management_flag = extra.management_flag;
+		overwrite_flag = extra.overwrite_flag;
+		overwrite_flags[pba] = overwrite_flag;
+
+		/* Skip bad blocks */
+		if (!(overwrite_flag & MEMSTICK_OVERWRITE_BKST)) {
+			dbg("pba %05d -> [BAD]", pba);
+			msb_mark_block_used(msb, pba);
+			continue;
+		}
+
+		/* Skip system/drm blocks */
+		if ((management_flag & MEMSTICK_MANAGMENT_FLAG_NORMAL) !=
+			MEMSTICK_MANAGMENT_FLAG_NORMAL) {
+			dbg("pba %05d -> [reserved management flag %02x]",
+						pba, management_flag);
+			msb_mark_block_used(msb, pba);
+			continue;
+		}
+
+		/* Erase temporary tables */
+		if (!(management_flag & MEMSTICK_MANAGEMENT_ATFLG)) {
+			dbg("pba %05d -> [temp table] - will erase", pba);
+
+			msb_mark_block_used(msb, pba);
+			msb_erase_block(msb, pba);
+			continue;
+		}
+
+		if (lba == MS_BLOCK_INVALID) {
+			dbg_verbose("pba %05d -> [free]", pba);
+			continue;
+		}
+
+		msb_mark_block_used(msb, pba);
+
+		/* Block has an LBA that doesn't match its zone */
+		if (msb_get_zone_from_lba(lba) != msb_get_zone_from_pba(pba)) {
+			pr_notice("pba %05d -> [bad lba %05d] - will erase",
+								pba, lba);
+			msb_erase_block(msb, pba);
+			continue;
+		}
+
+		/* No collisions - great */
+		if (msb->lba_to_pba_table[lba] == MS_BLOCK_INVALID) {
+			dbg_verbose("pba %05d -> [lba %05d]", pba, lba);
+			msb->lba_to_pba_table[lba] = pba;
+			continue;
+		}
+
+		other_block = msb->lba_to_pba_table[lba];
+		other_overwrite_flag = overwrite_flags[other_block];
+
+		pr_notice("Collision between pba %d and pba %d",
+			pba, other_block);
+
+		if (!(overwrite_flag & MEMSTICK_OVERWRITE_UDST)) {
+			pr_notice("pba %d is marked as stable, use it", pba);
+			msb_erase_block(msb, other_block);
+			msb->lba_to_pba_table[lba] = pba;
+			continue;
+		}
+
+		if (!(other_overwrite_flag & MEMSTICK_OVERWRITE_UDST)) {
+			pr_notice("pba %d is marked as stable, use it",
+								other_block);
+			msb_erase_block(msb, pba);
+			continue;
+		}
+
+		pr_notice("collision between blocks %d and %d, with the stable flag set on neither, erasing pba %d",
+				pba, other_block, other_block);
+
+		msb_erase_block(msb, other_block);
+		msb->lba_to_pba_table[lba] = pba;
+	}
+
+	dbg("End of media scanning");
+	kfree(overwrite_flags);
+	return 0;
+}
+
+static void msb_cache_flush_timer(unsigned long data)
+{
+	struct msb_data *msb =
(struct msb_data *)data; + msb->need_flush_cache = true; + queue_work(msb->io_queue, &msb->io_work); +} + + +static void msb_cache_discard(struct msb_data *msb) +{ + if (msb->cache_block_lba == MS_BLOCK_INVALID) + return; + + del_timer_sync(&msb->cache_flush_timer); + + dbg_verbose("Discarding the write cache"); + msb->cache_block_lba = MS_BLOCK_INVALID; + bitmap_zero(&msb->valid_cache_bitmap, msb->pages_in_block); +} + +static int msb_cache_init(struct msb_data *msb) +{ + setup_timer(&msb->cache_flush_timer, msb_cache_flush_timer, + (unsigned long)msb); + + if (!msb->cache) + msb->cache = kzalloc(msb->block_size, GFP_KERNEL); + if (!msb->cache) + return -ENOMEM; + + msb_cache_discard(msb); + return 0; +} + +static int msb_cache_flush(struct msb_data *msb) +{ + struct scatterlist sg; + struct ms_extra_data_register extra; + int page, offset, error; + u16 pba, lba; + + if (msb->read_only) + return -EROFS; + + if (msb->cache_block_lba == MS_BLOCK_INVALID) + return 0; + + lba = msb->cache_block_lba; + pba = msb->lba_to_pba_table[lba]; + + dbg_verbose("Flushing the write cache of pba %d (LBA %d)", + pba, msb->cache_block_lba); + + sg_init_one(&sg, msb->cache , msb->block_size); + + /* Read all missing pages in cache */ + for (page = 0; page < msb->pages_in_block; page++) { + + if (test_bit(page, &msb->valid_cache_bitmap)) + continue; + + offset = page * msb->page_size; + + dbg_verbose("reading non-present sector %d of cache block %d", + page, lba); + error = msb_read_page(msb, pba, page, &extra, &sg, offset); + + /* Bad pages are copied with 00 page status */ + if (error == -EBADMSG) { + pr_err("read error on sector %d, contents probably damaged", page); + continue; + } + + if (error) + return error; + + if ((extra.overwrite_flag & MEMSTICK_OV_PG_NORMAL) != + MEMSTICK_OV_PG_NORMAL) { + dbg("page %d is marked as bad", page); + continue; + } + + set_bit(page, &msb->valid_cache_bitmap); + } + + /* Write the cache now */ + error = msb_update_block(msb, msb->cache_block_lba, &sg, 0); + pba = msb->lba_to_pba_table[msb->cache_block_lba]; + + /* Mark invalid pages */ + if (!error) { + for (page = 0; page < msb->pages_in_block; page++) { + + if (test_bit(page, &msb->valid_cache_bitmap)) + continue; + + dbg("marking page %d as containing damaged data", + page); + msb_set_overwrite_flag(msb, + pba , page, 0xFF & ~MEMSTICK_OV_PG_NORMAL); + } + } + + msb_cache_discard(msb); + return error; +} + +static int msb_cache_write(struct msb_data *msb, int lba, + int page, bool add_to_cache_only, struct scatterlist *sg, int offset) +{ + int error; + struct scatterlist sg_tmp[10]; + + if (msb->read_only) + return -EROFS; + + if (msb->cache_block_lba == MS_BLOCK_INVALID || + lba != msb->cache_block_lba) + if (add_to_cache_only) + return 0; + + /* If we need to write different block */ + if (msb->cache_block_lba != MS_BLOCK_INVALID && + lba != msb->cache_block_lba) { + dbg_verbose("first flush the cache"); + error = msb_cache_flush(msb); + if (error) + return error; + } + + if (msb->cache_block_lba == MS_BLOCK_INVALID) { + msb->cache_block_lba = lba; + mod_timer(&msb->cache_flush_timer, + jiffies + msecs_to_jiffies(cache_flush_timeout)); + } + + dbg_verbose("Write of LBA %d page %d to cache ", lba, page); + + sg_init_table(sg_tmp, ARRAY_SIZE(sg_tmp)); + msb_sg_copy(sg, sg_tmp, ARRAY_SIZE(sg_tmp), offset, msb->page_size); + + sg_copy_to_buffer(sg_tmp, sg_nents(sg_tmp), + msb->cache + page * msb->page_size, msb->page_size); + + set_bit(page, &msb->valid_cache_bitmap); + return 0; +} + +static int msb_cache_read(struct 
msb_data *msb, int lba,
+				int page, struct scatterlist *sg, int offset)
+{
+	int pba = msb->lba_to_pba_table[lba];
+	struct scatterlist sg_tmp[10];
+	int error = 0;
+
+	if (lba == msb->cache_block_lba &&
+	    test_bit(page, &msb->valid_cache_bitmap)) {
+
+		dbg_verbose("Read of LBA %d (pba %d) sector %d from cache",
+							lba, pba, page);
+
+		sg_init_table(sg_tmp, ARRAY_SIZE(sg_tmp));
+		msb_sg_copy(sg, sg_tmp, ARRAY_SIZE(sg_tmp),
+			offset, msb->page_size);
+		sg_copy_from_buffer(sg_tmp, sg_nents(sg_tmp),
+			msb->cache + msb->page_size * page,
+							msb->page_size);
+	} else {
+		dbg_verbose("Read of LBA %d (pba %d) sector %d from device",
+							lba, pba, page);
+
+		error = msb_read_page(msb, pba, page, NULL, sg, offset);
+		if (error)
+			return error;
+
+		msb_cache_write(msb, lba, page, true, sg, offset);
+	}
+	return error;
+}
+
+/* Emulated geometry table
+ * The exact contents of this table aren't that important,
+ * one could put different values here, provided that they still
+ * cover the whole disk.
+ * The 64 MB entry is what Windows reports for my 64M memstick */
+
+static const struct chs_entry chs_table[] = {
+/*        size sectors cylinders heads */
+	{ 4,    16,    247,       2  },
+	{ 8,    16,    495,       2  },
+	{ 16,   16,    495,       4  },
+	{ 32,   16,    991,       4  },
+	{ 64,   16,    991,       8  },
+	{128,   16,    991,       16 },
+	{ 0 }
+};
+
+/* Load information about the card */
+static int msb_init_card(struct memstick_dev *card)
+{
+	struct msb_data *msb = memstick_get_drvdata(card);
+	struct memstick_host *host = card->host;
+	struct ms_boot_page *boot_block;
+	int error = 0, i, raw_size_in_megs;
+
+	msb->caps = 0;
+
+	if (card->id.class >= MEMSTICK_CLASS_ROM &&
+				card->id.class <= MEMSTICK_CLASS_ROM)
+		msb->read_only = true;
+
+	msb->state = -1;
+	error = msb_reset(msb, false);
+	if (error)
+		return error;
+
+	/* Due to a bug in the Jmicron driver written by Alex Dubov,
+	   its serial mode barely works,
+	   so we switch to parallel mode right away */
+	if (host->caps & MEMSTICK_CAP_PAR4)
+		msb_switch_to_parallel(msb);
+
+	msb->page_size = sizeof(struct ms_boot_page);
+
+	/* Read the boot page */
+	error = msb_read_boot_blocks(msb);
+	if (error)
+		return -EIO;
+
+	boot_block = &msb->boot_page[0];
+
+	/* Save interesting attributes from boot page */
+	msb->block_count = boot_block->attr.number_of_blocks;
+	msb->page_size = boot_block->attr.page_size;
+
+	msb->pages_in_block = boot_block->attr.block_size * 2;
+	msb->block_size = msb->page_size * msb->pages_in_block;
+
+	if (msb->page_size > PAGE_SIZE) {
+		/* this isn't supported by Linux at all, anyway */
+		dbg("device page size %d isn't supported", msb->page_size);
+		return -EINVAL;
+	}
+
+	msb->block_buffer = kzalloc(msb->block_size, GFP_KERNEL);
+	if (!msb->block_buffer)
+		return -ENOMEM;
+
+	raw_size_in_megs = (msb->block_size * msb->block_count) >> 20;
+
+	for (i = 0; chs_table[i].size; i++) {
+
+		if (chs_table[i].size != raw_size_in_megs)
+			continue;
+
+		msb->geometry.cylinders = chs_table[i].cyl;
+		msb->geometry.heads = chs_table[i].head;
+		msb->geometry.sectors = chs_table[i].sec;
+		break;
+	}
+
+	if (boot_block->attr.transfer_supporting == 1)
+		msb->caps |= MEMSTICK_CAP_PAR4;
+
+	if (boot_block->attr.device_type & 0x03)
+		msb->read_only = true;
+
+	dbg("Total block count = %d", msb->block_count);
+	dbg("Each block consists of %d pages", msb->pages_in_block);
+	dbg("Page size = %d bytes", msb->page_size);
+	dbg("Parallel mode supported: %d", !!(msb->caps & MEMSTICK_CAP_PAR4));
+	dbg("Read only: %d", msb->read_only);
+
+#if 0
+	/* Now we can switch the interface */
+	if (host->caps & msb->caps & MEMSTICK_CAP_PAR4)
+		msb_switch_to_parallel(msb);
+#endif
+
+	error = msb_cache_init(msb);
+	if (error)
+		return error;
+
+	error = msb_ftl_initialize(msb);
+	if (error)
+		return error;
+
+
+	/* Read the bad block table */
+	error = msb_read_bad_block_table(msb, 0);
+
+	if (error && error != -ENOMEM) {
+		dbg("failed to read bad block table from primary boot block, trying from backup");
+		error = msb_read_bad_block_table(msb, 1);
+	}
+
+	if (error)
+		return error;
+
+	/* *drum roll* Scan the media */
+	error = msb_ftl_scan(msb);
+	if (error) {
+		pr_err("Scan of media failed");
+		return error;
+	}
+
+	return 0;
+
+}
+
+static int msb_do_write_request(struct msb_data *msb, int lba,
+	int page, struct scatterlist *sg, size_t len, int *successfully_written)
+{
+	int error = 0;
+	off_t offset = 0;
+	*successfully_written = 0;
+
+	while (offset < len) {
+		if (page == 0 && len - offset >= msb->block_size) {
+
+			if (msb->cache_block_lba == lba)
+				msb_cache_discard(msb);
+
+			dbg_verbose("Writing whole lba %d", lba);
+			error = msb_update_block(msb, lba, sg, offset);
+			if (error)
+				return error;
+
+			offset += msb->block_size;
+			*successfully_written += msb->block_size;
+			lba++;
+			continue;
+		}
+
+		error = msb_cache_write(msb, lba, page, false, sg, offset);
+		if (error)
+			return error;
+
+		offset += msb->page_size;
+		*successfully_written += msb->page_size;
+
+		page++;
+		if (page == msb->pages_in_block) {
+			page = 0;
+			lba++;
+		}
+	}
+	return 0;
+}
+
+static int msb_do_read_request(struct msb_data *msb, int lba,
+		int page, struct scatterlist *sg, int len, int *successfully_read)
+{
+	int error = 0;
+	int offset = 0;
+	*successfully_read = 0;
+
+	while (offset < len) {
+
+		error = msb_cache_read(msb, lba, page, sg, offset);
+		if (error)
+			return error;
+
+		offset += msb->page_size;
+		*successfully_read += msb->page_size;
+
+		page++;
+		if (page == msb->pages_in_block) {
+			page = 0;
+			lba++;
+		}
+	}
+	return 0;
+}
+
+static void msb_io_work(struct work_struct *work)
+{
+	struct msb_data *msb = container_of(work, struct msb_data, io_work);
+	int page, error, len;
+	sector_t lba;
+	unsigned long flags;
+	struct scatterlist *sg = msb->prealloc_sg;
+
+	dbg_verbose("IO: work started");
+
+	while (1) {
+		spin_lock_irqsave(&msb->q_lock, flags);
+
+		if (msb->need_flush_cache) {
+			msb->need_flush_cache = false;
+			spin_unlock_irqrestore(&msb->q_lock, flags);
+			msb_cache_flush(msb);
+			continue;
+		}
+
+		if (!msb->req) {
+			msb->req = blk_fetch_request(msb->queue);
+			if (!msb->req) {
+				dbg_verbose("IO: no more requests, exiting");
+				spin_unlock_irqrestore(&msb->q_lock, flags);
+				return;
+			}
+		}
+
+		spin_unlock_irqrestore(&msb->q_lock, flags);
+
+		/* If card was removed meanwhile */
+		if (!msb->req)
+			return;
+
+		/* process the request */
+		dbg_verbose("IO: processing new request");
+		blk_rq_map_sg(msb->queue, msb->req, sg);
+
+		lba = blk_rq_pos(msb->req);
+
+		sector_div(lba, msb->page_size / 512);
+		page = do_div(lba, msb->pages_in_block);
+
+		if (rq_data_dir(msb->req) == READ)
+			error = msb_do_read_request(msb, lba, page, sg,
+				blk_rq_bytes(msb->req), &len);
+		else
+			error = msb_do_write_request(msb, lba, page, sg,
+				blk_rq_bytes(msb->req), &len);
+
+		spin_lock_irqsave(&msb->q_lock, flags);
+
+		if (len)
+			if (!__blk_end_request(msb->req, 0, len))
+				msb->req = NULL;
+
+		if (error && msb->req) {
+			dbg_verbose("IO: ending one sector of the request with error");
+			if (!__blk_end_request(msb->req, error, msb->page_size))
+				msb->req = NULL;
+		}
+
+		if (msb->req)
+			dbg_verbose("IO: request still pending");
+
+		spin_unlock_irqrestore(&msb->q_lock, flags);
+	}
+}
+
+static DEFINE_IDR(msb_disk_idr); /* set of used disk numbers */
+static DEFINE_MUTEX(msb_disk_lock); /* protects against races in open/release */
+
+static int msb_bd_open(struct block_device *bdev, fmode_t mode)
+{
+	struct gendisk *disk = bdev->bd_disk;
+	struct msb_data *msb = disk->private_data;
+
+	dbg_verbose("block device open");
+
+	mutex_lock(&msb_disk_lock);
+
+	if (msb && msb->card)
+		msb->usage_count++;
+
+	mutex_unlock(&msb_disk_lock);
+	return 0;
+}
+
+static void msb_data_clear(struct msb_data *msb)
+{
+	kfree(msb->boot_page);
+	kfree(msb->used_blocks_bitmap);
+	kfree(msb->lba_to_pba_table);
+	kfree(msb->cache);
+	msb->card = NULL;
+}
+
+static int msb_disk_release(struct gendisk *disk)
+{
+	struct msb_data *msb = disk->private_data;
+
+	dbg_verbose("block device release");
+	mutex_lock(&msb_disk_lock);
+
+	if (msb) {
+		if (msb->usage_count)
+			msb->usage_count--;
+
+		if (!msb->usage_count) {
+			disk->private_data = NULL;
+			idr_remove(&msb_disk_idr, msb->disk_id);
+			put_disk(disk);
+			kfree(msb);
+		}
+	}
+	mutex_unlock(&msb_disk_lock);
+	return 0;
+}
+
+static void msb_bd_release(struct gendisk *disk, fmode_t mode)
+{
+	msb_disk_release(disk);
+}
+
+static int msb_bd_getgeo(struct block_device *bdev,
+				struct hd_geometry *geo)
+{
+	struct msb_data *msb = bdev->bd_disk->private_data;
+	*geo = msb->geometry;
+	return 0;
+}
+
+static int msb_prepare_req(struct request_queue *q, struct request *req)
+{
+	if (req->cmd_type != REQ_TYPE_FS &&
+				req->cmd_type != REQ_TYPE_BLOCK_PC) {
+		blk_dump_rq_flags(req, "MS unsupported request");
+		return BLKPREP_KILL;
+	}
+	req->cmd_flags |= REQ_DONTPREP;
+	return BLKPREP_OK;
+}
+
+static void msb_submit_req(struct request_queue *q)
+{
+	struct memstick_dev *card = q->queuedata;
+	struct msb_data *msb = memstick_get_drvdata(card);
+	struct request *req = NULL;
+
+	dbg_verbose("Submit request");
+
+	if (msb->card_dead) {
+		dbg("Refusing requests on removed card");
+
+		WARN_ON(!msb->io_queue_stopped);
+
+		while ((req = blk_fetch_request(q)) != NULL)
+			__blk_end_request_all(req, -ENODEV);
+		return;
+	}
+
+	if (msb->req)
+		return;
+
+	if (!msb->io_queue_stopped)
+		queue_work(msb->io_queue, &msb->io_work);
+}
+
+static int msb_check_card(struct memstick_dev *card)
+{
+	struct msb_data *msb = memstick_get_drvdata(card);
+	return (msb->card_dead == 0);
+}
+
+static void msb_stop(struct memstick_dev *card)
+{
+	struct msb_data *msb = memstick_get_drvdata(card);
+	unsigned long flags;
+
+	dbg("Stopping all msblock IO");
+
+	spin_lock_irqsave(&msb->q_lock, flags);
+	blk_stop_queue(msb->queue);
+	msb->io_queue_stopped = true;
+	spin_unlock_irqrestore(&msb->q_lock, flags);
+
+	del_timer_sync(&msb->cache_flush_timer);
+	flush_workqueue(msb->io_queue);
+
+	if (msb->req) {
+		spin_lock_irqsave(&msb->q_lock, flags);
+		blk_requeue_request(msb->queue, msb->req);
+		msb->req = NULL;
+		spin_unlock_irqrestore(&msb->q_lock, flags);
+	}
+
+}
+
+static void msb_start(struct memstick_dev *card)
+{
+	struct msb_data *msb = memstick_get_drvdata(card);
+	unsigned long flags;
+
+	dbg("Resuming IO from msblock");
+
+	msb_invalidate_reg_window(msb);
+
+	spin_lock_irqsave(&msb->q_lock, flags);
+	if (!msb->io_queue_stopped || msb->card_dead) {
+		spin_unlock_irqrestore(&msb->q_lock, flags);
+		return;
+	}
+	spin_unlock_irqrestore(&msb->q_lock, flags);
+
+	/* Kick the cache flush anyway, it's harmless */
+	msb->need_flush_cache = true;
+	msb->io_queue_stopped = false;
+
+	spin_lock_irqsave(&msb->q_lock, flags);
+	blk_start_queue(msb->queue);
+	spin_unlock_irqrestore(&msb->q_lock,
flags); + + queue_work(msb->io_queue, &msb->io_work); + +} + +static const struct block_device_operations msb_bdops = { + .open = msb_bd_open, + .release = msb_bd_release, + .getgeo = msb_bd_getgeo, + .owner = THIS_MODULE +}; + +/* Registers the block device */ +static int msb_init_disk(struct memstick_dev *card) +{ + struct msb_data *msb = memstick_get_drvdata(card); + struct memstick_host *host = card->host; + int rc; + u64 limit = BLK_BOUNCE_HIGH; + unsigned long capacity; + + if (host->dev.dma_mask && *(host->dev.dma_mask)) + limit = *(host->dev.dma_mask); + + mutex_lock(&msb_disk_lock); + msb->disk_id = idr_alloc(&msb_disk_idr, card, 0, 256, GFP_KERNEL); + mutex_unlock(&msb_disk_lock); + + if (msb->disk_id < 0) + return msb->disk_id; + + msb->disk = alloc_disk(0); + if (!msb->disk) { + rc = -ENOMEM; + goto out_release_id; + } + + msb->queue = blk_init_queue(msb_submit_req, &msb->q_lock); + if (!msb->queue) { + rc = -ENOMEM; + goto out_put_disk; + } + + msb->queue->queuedata = card; + blk_queue_prep_rq(msb->queue, msb_prepare_req); + + blk_queue_bounce_limit(msb->queue, limit); + blk_queue_max_hw_sectors(msb->queue, MS_BLOCK_MAX_PAGES); + blk_queue_max_segments(msb->queue, MS_BLOCK_MAX_SEGS); + blk_queue_max_segment_size(msb->queue, + MS_BLOCK_MAX_PAGES * msb->page_size); + blk_queue_logical_block_size(msb->queue, msb->page_size); + + sprintf(msb->disk->disk_name, "msblk%d", msb->disk_id); + msb->disk->fops = &msb_bdops; + msb->disk->private_data = msb; + msb->disk->queue = msb->queue; + msb->disk->driverfs_dev = &card->dev; + msb->disk->flags |= GENHD_FL_EXT_DEVT; + + capacity = msb->pages_in_block * msb->logical_block_count; + capacity *= (msb->page_size / 512); + set_capacity(msb->disk, capacity); + dbg("Set total disk size to %lu sectors", capacity); + + msb->usage_count = 1; + msb->io_queue = alloc_ordered_workqueue("ms_block", WQ_MEM_RECLAIM); + INIT_WORK(&msb->io_work, msb_io_work); + sg_init_table(msb->prealloc_sg, MS_BLOCK_MAX_SEGS+1); + + if (msb->read_only) + set_disk_ro(msb->disk, 1); + + msb_start(card); + add_disk(msb->disk); + dbg("Disk added"); + return 0; + +out_put_disk: + put_disk(msb->disk); +out_release_id: + mutex_lock(&msb_disk_lock); + idr_remove(&msb_disk_idr, msb->disk_id); + mutex_unlock(&msb_disk_lock); + return rc; +} + +static int msb_probe(struct memstick_dev *card) +{ + struct msb_data *msb; + int rc = 0; + + msb = kzalloc(sizeof(struct msb_data), GFP_KERNEL); + if (!msb) + return -ENOMEM; + memstick_set_drvdata(card, msb); + msb->card = card; + spin_lock_init(&msb->q_lock); + + rc = msb_init_card(card); + if (rc) + goto out_free; + + rc = msb_init_disk(card); + if (!rc) { + card->check = msb_check_card; + card->stop = msb_stop; + card->start = msb_start; + return 0; + } +out_free: + memstick_set_drvdata(card, NULL); + msb_data_clear(msb); + kfree(msb); + return rc; +} + +static void msb_remove(struct memstick_dev *card) +{ + struct msb_data *msb = memstick_get_drvdata(card); + unsigned long flags; + + if (!msb->io_queue_stopped) + msb_stop(card); + + dbg("Removing the disk device"); + + /* Take care of unhandled + new requests from now on */ + spin_lock_irqsave(&msb->q_lock, flags); + msb->card_dead = true; + blk_start_queue(msb->queue); + spin_unlock_irqrestore(&msb->q_lock, flags); + + /* Remove the disk */ + del_gendisk(msb->disk); + blk_cleanup_queue(msb->queue); + msb->queue = NULL; + + mutex_lock(&msb_disk_lock); + msb_data_clear(msb); + mutex_unlock(&msb_disk_lock); + + msb_disk_release(msb->disk); + memstick_set_drvdata(card, NULL); +} + +#ifdef 
CONFIG_PM + +static int msb_suspend(struct memstick_dev *card, pm_message_t state) +{ + msb_stop(card); + return 0; +} + +static int msb_resume(struct memstick_dev *card) +{ + struct msb_data *msb = memstick_get_drvdata(card); + struct msb_data *new_msb = NULL; + bool card_dead = true; + +#ifndef CONFIG_MEMSTICK_UNSAFE_RESUME + msb->card_dead = true; + return 0; +#endif + mutex_lock(&card->host->lock); + + new_msb = kzalloc(sizeof(struct msb_data), GFP_KERNEL); + if (!new_msb) + goto out; + + new_msb->card = card; + memstick_set_drvdata(card, new_msb); + spin_lock_init(&new_msb->q_lock); + sg_init_table(msb->prealloc_sg, MS_BLOCK_MAX_SEGS+1); + + if (msb_init_card(card)) + goto out; + + if (msb->block_size != new_msb->block_size) + goto out; + + if (memcmp(msb->boot_page, new_msb->boot_page, + sizeof(struct ms_boot_page))) + goto out; + + if (msb->logical_block_count != new_msb->logical_block_count || + memcmp(msb->lba_to_pba_table, new_msb->lba_to_pba_table, + msb->logical_block_count)) + goto out; + + if (msb->block_count != new_msb->block_count || + memcmp(msb->used_blocks_bitmap, new_msb->used_blocks_bitmap, + msb->block_count / 8)) + goto out; + + card_dead = false; +out: + if (card_dead) + dbg("Card was removed/replaced during suspend"); + + msb->card_dead = card_dead; + memstick_set_drvdata(card, msb); + + if (new_msb) { + msb_data_clear(new_msb); + kfree(new_msb); + } + + msb_start(card); + mutex_unlock(&card->host->lock); + return 0; +} +#else + +#define msb_suspend NULL +#define msb_resume NULL + +#endif /* CONFIG_PM */ + +static struct memstick_device_id msb_id_tbl[] = { + {MEMSTICK_MATCH_ALL, MEMSTICK_TYPE_LEGACY, MEMSTICK_CATEGORY_STORAGE, + MEMSTICK_CLASS_FLASH}, + + {MEMSTICK_MATCH_ALL, MEMSTICK_TYPE_LEGACY, MEMSTICK_CATEGORY_STORAGE, + MEMSTICK_CLASS_ROM}, + + {MEMSTICK_MATCH_ALL, MEMSTICK_TYPE_LEGACY, MEMSTICK_CATEGORY_STORAGE, + MEMSTICK_CLASS_RO}, + + {MEMSTICK_MATCH_ALL, MEMSTICK_TYPE_LEGACY, MEMSTICK_CATEGORY_STORAGE, + MEMSTICK_CLASS_WP}, + + {MEMSTICK_MATCH_ALL, MEMSTICK_TYPE_DUO, MEMSTICK_CATEGORY_STORAGE_DUO, + MEMSTICK_CLASS_DUO}, + {} +}; +MODULE_DEVICE_TABLE(memstick, msb_id_tbl); + + +static struct memstick_driver msb_driver = { + .driver = { + .name = DRIVER_NAME, + .owner = THIS_MODULE + }, + .id_table = msb_id_tbl, + .probe = msb_probe, + .remove = msb_remove, + .suspend = msb_suspend, + .resume = msb_resume +}; + +static int major; + +static int __init msb_init(void) +{ + int rc = register_blkdev(0, DRIVER_NAME); + + if (rc < 0) { + pr_err("failed to register major (error %d)\n", rc); + return rc; + } + + major = rc; + rc = memstick_register_driver(&msb_driver); + if (rc) { + unregister_blkdev(major, DRIVER_NAME); + pr_err("failed to register memstick driver (error %d)\n", rc); + } + + return rc; +} + +static void __exit msb_exit(void) +{ + memstick_unregister_driver(&msb_driver); + unregister_blkdev(major, DRIVER_NAME); + idr_destroy(&msb_disk_idr); +} + +module_init(msb_init); +module_exit(msb_exit); + +module_param(cache_flush_timeout, int, S_IRUGO); +MODULE_PARM_DESC(cache_flush_timeout, + "Cache flush timeout in msec (1000 default)"); +module_param(debug, int, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(debug, "Debug level (0-2)"); + +module_param(verify_writes, bool, S_IRUGO); +MODULE_PARM_DESC(verify_writes, "Read back and check all data that is written"); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Maxim Levitsky"); +MODULE_DESCRIPTION("Sony MemoryStick block device driver"); diff --git a/drivers/memstick/core/ms_block.h b/drivers/memstick/core/ms_block.h 
new file mode 100644 index 000000000000..96e637550988 --- /dev/null +++ b/drivers/memstick/core/ms_block.h @@ -0,0 +1,290 @@ +/* + * ms_block.h - Sony MemoryStick (legacy) storage support + + * Copyright (C) 2013 Maxim Levitsky + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Minor portions of the driver are copied from mspro_block.c which is + * Copyright (C) 2007 Alex Dubov + * + * Also ms structures were copied from old broken driver by same author + * These probably come from MS spec + * + */ + +#ifndef MS_BLOCK_NEW_H +#define MS_BLOCK_NEW_H + +#define MS_BLOCK_MAX_SEGS 32 +#define MS_BLOCK_MAX_PAGES ((2 << 16) - 1) + +#define MS_BLOCK_MAX_BOOT_ADDR 0x000c +#define MS_BLOCK_BOOT_ID 0x0001 +#define MS_BLOCK_INVALID 0xffff +#define MS_MAX_ZONES 16 +#define MS_BLOCKS_IN_ZONE 512 + +#define MS_BLOCK_MAP_LINE_SZ 16 +#define MS_BLOCK_PART_SHIFT 3 + + +#define MEMSTICK_UNCORR_ERROR (MEMSTICK_STATUS1_UCFG | \ + MEMSTICK_STATUS1_UCEX | MEMSTICK_STATUS1_UCDT) + +#define MEMSTICK_CORR_ERROR (MEMSTICK_STATUS1_FGER | MEMSTICK_STATUS1_EXER | \ + MEMSTICK_STATUS1_DTER) + +#define MEMSTICK_INT_ERROR (MEMSTICK_INT_CMDNAK | MEMSTICK_INT_ERR) + +#define MEMSTICK_OVERWRITE_FLAG_NORMAL \ + (MEMSTICK_OVERWRITE_PGST1 | \ + MEMSTICK_OVERWRITE_PGST0 | \ + MEMSTICK_OVERWRITE_BKST) + +#define MEMSTICK_OV_PG_NORMAL \ + (MEMSTICK_OVERWRITE_PGST1 | MEMSTICK_OVERWRITE_PGST0) + +#define MEMSTICK_MANAGMENT_FLAG_NORMAL \ + (MEMSTICK_MANAGEMENT_SYSFLG | \ + MEMSTICK_MANAGEMENT_SCMS1 | \ + MEMSTICK_MANAGEMENT_SCMS0) \ + +struct ms_boot_header { + unsigned short block_id; + unsigned short format_reserved; + unsigned char reserved0[184]; + unsigned char data_entry; + unsigned char reserved1[179]; +} __packed; + + +struct ms_system_item { + unsigned int start_addr; + unsigned int data_size; + unsigned char data_type_id; + unsigned char reserved[3]; +} __packed; + +struct ms_system_entry { + struct ms_system_item disabled_block; + struct ms_system_item cis_idi; + unsigned char reserved[24]; +} __packed; + +struct ms_boot_attr_info { + unsigned char memorystick_class; + unsigned char format_unique_value1; + unsigned short block_size; + unsigned short number_of_blocks; + unsigned short number_of_effective_blocks; + unsigned short page_size; + unsigned char extra_data_size; + unsigned char format_unique_value2; + unsigned char assembly_time[8]; + unsigned char format_unique_value3; + unsigned char serial_number[3]; + unsigned char assembly_manufacturer_code; + unsigned char assembly_model_code[3]; + unsigned short memory_manufacturer_code; + unsigned short memory_device_code; + unsigned short implemented_capacity; + unsigned char format_unique_value4[2]; + unsigned char vcc; + unsigned char vpp; + unsigned short controller_number; + unsigned short controller_function; + unsigned char reserved0[9]; + unsigned char transfer_supporting; + unsigned short format_unique_value5; + unsigned char format_type; + unsigned char memorystick_application; + unsigned char device_type; + unsigned char reserved1[22]; + unsigned char format_uniqure_value6[2]; + unsigned char reserved2[15]; +} __packed; + +struct ms_cis_idi { + unsigned short general_config; + unsigned short logical_cylinders; + unsigned short reserved0; + unsigned short logical_heads; + unsigned short track_size; + unsigned short page_size; + unsigned short pages_per_track; + unsigned short msw; + unsigned short lsw; + unsigned 
short reserved1; + unsigned char serial_number[20]; + unsigned short buffer_type; + unsigned short buffer_size_increments; + unsigned short long_command_ecc; + unsigned char firmware_version[28]; + unsigned char model_name[18]; + unsigned short reserved2[5]; + unsigned short pio_mode_number; + unsigned short dma_mode_number; + unsigned short field_validity; + unsigned short current_logical_cylinders; + unsigned short current_logical_heads; + unsigned short current_pages_per_track; + unsigned int current_page_capacity; + unsigned short mutiple_page_setting; + unsigned int addressable_pages; + unsigned short single_word_dma; + unsigned short multi_word_dma; + unsigned char reserved3[128]; +} __packed; + + +struct ms_boot_page { + struct ms_boot_header header; + struct ms_system_entry entry; + struct ms_boot_attr_info attr; +} __packed; + +struct msb_data { + unsigned int usage_count; + struct memstick_dev *card; + struct gendisk *disk; + struct request_queue *queue; + spinlock_t q_lock; + struct hd_geometry geometry; + struct attribute_group attr_group; + struct request *req; + int caps; + int disk_id; + + /* IO */ + struct workqueue_struct *io_queue; + bool io_queue_stopped; + struct work_struct io_work; + bool card_dead; + + /* Media properties */ + struct ms_boot_page *boot_page; + u16 boot_block_locations[2]; + int boot_block_count; + + bool read_only; + unsigned short page_size; + int block_size; + int pages_in_block; + int zone_count; + int block_count; + int logical_block_count; + + /* FTL tables */ + unsigned long *used_blocks_bitmap; + unsigned long *erased_blocks_bitmap; + u16 *lba_to_pba_table; + int free_block_count[MS_MAX_ZONES]; + bool ftl_initialized; + + /* Cache */ + unsigned char *cache; + unsigned long valid_cache_bitmap; + int cache_block_lba; + bool need_flush_cache; + struct timer_list cache_flush_timer; + + /* Preallocated buffers */ + unsigned char *block_buffer; + struct scatterlist prealloc_sg[MS_BLOCK_MAX_SEGS+1]; + + + /* handler's local data */ + struct ms_register_addr reg_addr; + bool addr_valid; + + u8 command_value; + bool command_need_oob; + struct scatterlist *current_sg; + int current_sg_offset; + + struct ms_register regs; + int current_page; + + int state; + int exit_error; + bool int_polling; + unsigned long int_timeout; + +}; + +enum msb_readpage_states { + MSB_RP_SEND_BLOCK_ADDRESS = 0, + MSB_RP_SEND_READ_COMMAND, + + MSB_RP_SEND_INT_REQ, + MSB_RP_RECEIVE_INT_REQ_RESULT, + + MSB_RP_SEND_READ_STATUS_REG, + MSB_RP_RECIVE_STATUS_REG, + + MSB_RP_SEND_OOB_READ, + MSB_RP_RECEIVE_OOB_READ, + + MSB_RP_SEND_READ_DATA, + MSB_RP_RECEIVE_READ_DATA, +}; + +enum msb_write_block_states { + MSB_WB_SEND_WRITE_PARAMS = 0, + MSB_WB_SEND_WRITE_OOB, + MSB_WB_SEND_WRITE_COMMAND, + + MSB_WB_SEND_INT_REQ, + MSB_WB_RECEIVE_INT_REQ, + + MSB_WB_SEND_WRITE_DATA, + MSB_WB_RECEIVE_WRITE_CONFIRMATION, +}; + +enum msb_send_command_states { + MSB_SC_SEND_WRITE_PARAMS, + MSB_SC_SEND_WRITE_OOB, + MSB_SC_SEND_COMMAND, + + MSB_SC_SEND_INT_REQ, + MSB_SC_RECEIVE_INT_REQ, + +}; + +enum msb_reset_states { + MSB_RS_SEND, + MSB_RS_CONFIRM, +}; + +enum msb_par_switch_states { + MSB_PS_SEND_SWITCH_COMMAND, + MSB_PS_SWICH_HOST, + MSB_PS_CONFIRM, +}; + +struct chs_entry { + unsigned long size; + unsigned char sec; + unsigned short cyl; + unsigned char head; +}; + +static int msb_reset(struct msb_data *msb, bool full); + +static int h_msb_default_bad(struct memstick_dev *card, + struct memstick_request **mrq); + +#define __dbg(level, format, ...) 
\
+	do { \
+		if (debug >= level) \
+			pr_err(format "\n", ## __VA_ARGS__); \
+	} while (0)
+
+
+#define dbg(format, ...)		__dbg(1, format, ## __VA_ARGS__)
+#define dbg_verbose(format, ...)	__dbg(2, format, ## __VA_ARGS__)
+
+#endif
From bf4228f0ef662f11252f8cde7fa92979e4ac6b69 Mon Sep 17 00:00:00 2001
From: Jingoo Han
Date: Wed, 11 Sep 2013 14:26:03 -0700
Subject: [PATCH 280/303] drivers/w1/w1.c: replace strict_strtol() with
 kstrtol()

The usage of strict_strtol() is not preferred, because strict_strtol() is
obsolete. Thus, kstrtol() should be used.

Signed-off-by: Jingoo Han
Cc: Evgeniy Polyakov
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/w1/w1.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/w1/w1.c b/drivers/w1/w1.c
index 22013ca2119c..c7c64f18773d 100644
--- a/drivers/w1/w1.c
+++ b/drivers/w1/w1.c
@@ -234,9 +234,11 @@ static ssize_t w1_master_attribute_store_search(struct device * dev,
 {
 	long tmp;
 	struct w1_master *md = dev_to_w1_master(dev);
+	int ret;
 
-	if (strict_strtol(buf, 0, &tmp) == -EINVAL)
-		return -EINVAL;
+	ret = kstrtol(buf, 0, &tmp);
+	if (ret)
+		return ret;
 
 	mutex_lock(&md->mutex);
 	md->search_count = tmp;
@@ -266,9 +268,11 @@ static ssize_t w1_master_attribute_store_pullup(struct device *dev,
 {
 	long tmp;
 	struct w1_master *md = dev_to_w1_master(dev);
+	int ret;
 
-	if (strict_strtol(buf, 0, &tmp) == -EINVAL)
-		return -EINVAL;
+	ret = kstrtol(buf, 0, &tmp);
+	if (ret)
+		return ret;
 
 	mutex_lock(&md->mutex);
 	md->enable_pullup = tmp;
From 4b39248365e09fb8268b6fecd1704907ffc3d980 Mon Sep 17 00:00:00 2001
From: Jingoo Han
Date: Wed, 11 Sep 2013 14:26:04 -0700
Subject: [PATCH 281/303] drivers/w1/masters/mxc_w1.c: remove unnecessary
 platform_set_drvdata()

The driver core clears the driver data to NULL after device_release or on
probe failure. Thus, it is not needed to manually clear the device
driver data to NULL.

Signed-off-by: Jingoo Han
Cc: Evgeniy Polyakov
Cc: Greg KH
Acked-by: Shawn Guo
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/w1/masters/mxc_w1.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/w1/masters/mxc_w1.c b/drivers/w1/masters/mxc_w1.c
index 47e12cfc2a57..15c7251b0556 100644
--- a/drivers/w1/masters/mxc_w1.c
+++ b/drivers/w1/masters/mxc_w1.c
@@ -152,8 +152,6 @@ static int mxc_w1_remove(struct platform_device *pdev)
 
 	clk_disable_unprepare(mdev->clk);
 
-	platform_set_drvdata(pdev, NULL);
-
 	return 0;
 }
From 5e4c0d974139a98741b829b27cf38dc8f9284490 Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Wed, 11 Sep 2013 14:26:05 -0700
Subject: [PATCH 282/303] lib/radix-tree.c: make radix_tree_node_alloc() work
 correctly within interrupt

With users of radix_tree_preload() run from interrupt (block/blk-ioc.c is
one such possible user), the following race can happen:

radix_tree_preload()
...
radix_tree_insert()
  radix_tree_node_alloc()
    if (rtp->nr) {
      ret = rtp->nodes[rtp->nr - 1];
<interrupt>
...
radix_tree_preload()
...
radix_tree_insert()
  radix_tree_node_alloc()
    if (rtp->nr) {
      ret = rtp->nodes[rtp->nr - 1];

And we give out one radix tree node twice. That clearly results in radix
tree corruption with different results (usually OOPS) depending on which
two users of radix tree race. We fix the problem by making
radix_tree_node_alloc() always allocate fresh radix tree nodes when in
interrupt.
Using preloading when in interrupt doesn't make sense since all the
allocations have to be atomic anyway and we cannot steal nodes from
process-context users because some users rely on radix_tree_insert()
succeeding after radix_tree_preload(). The in_interrupt() check is somewhat
ugly but we cannot simply key off the passed gfp_mask as that is acquired
from root_gfp_mask() and thus the same for all preload users.

Another part of the fix is to avoid node preallocation in
radix_tree_preload() when the passed gfp_mask doesn't allow waiting. Again,
preallocation in such a case doesn't make sense and when preallocation
would happen in interrupt we could possibly leak some allocated nodes.
However, some users of radix_tree_preload() require the following
radix_tree_insert() to succeed. To avoid unexpected effects for these
users, radix_tree_preload() only warns if the passed gfp mask doesn't allow
waiting and we provide a new function radix_tree_maybe_preload() for
those users which get different gfp masks from different call sites and
which are prepared to handle radix_tree_insert() failure.

Signed-off-by: Jan Kara
Cc: Jens Axboe
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 block/blk-ioc.c            |  2 +-
 fs/fscache/page.c          |  2 +-
 include/linux/radix-tree.h |  1 +
 lib/radix-tree.c           | 41 ++++++++++++++++++++++++++++++++++++--
 mm/filemap.c               |  2 +-
 mm/shmem.c                 |  2 +-
 mm/swap_state.c            |  4 ++--
 7 files changed, 46 insertions(+), 8 deletions(-)

diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 4464c823cff2..46cd7bd18b34 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -367,7 +367,7 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
 	if (!icq)
 		return NULL;
 
-	if (radix_tree_preload(gfp_mask) < 0) {
+	if (radix_tree_maybe_preload(gfp_mask) < 0) {
 		kmem_cache_free(et->icq_cache, icq);
 		return NULL;
 	}
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 8702b732109a..73899c1c3449 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -913,7 +913,7 @@ int __fscache_write_page(struct fscache_cookie *cookie,
 		(1 << FSCACHE_OP_WAITING) |
 		(1 << FSCACHE_OP_UNUSE_COOKIE);
 
-	ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
+	ret = radix_tree_maybe_preload(gfp & ~__GFP_HIGHMEM);
 	if (ret < 0)
 		goto nomem_free;
 
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index ffc444c38b0a..403940787be1 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -231,6 +231,7 @@ unsigned long radix_tree_next_hole(struct radix_tree_root *root,
 unsigned long radix_tree_prev_hole(struct radix_tree_root *root,
 				unsigned long index, unsigned long max_scan);
 int radix_tree_preload(gfp_t gfp_mask);
+int radix_tree_maybe_preload(gfp_t gfp_mask);
 void radix_tree_init(void);
 void *radix_tree_tag_set(struct radix_tree_root *root,
 			unsigned long index, unsigned int tag);
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index e7964296fd50..7811ed3b4e70 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -32,6 +32,7 @@
 #include
 #include
 #include
+#include <linux/hardirq.h>	/* in_interrupt() */
 
 #ifdef __KERNEL__
@@ -207,7 +208,12 @@ radix_tree_node_alloc(struct radix_tree_root *root)
 	struct radix_tree_node *ret = NULL;
 	gfp_t gfp_mask = root_gfp_mask(root);
 
-	if (!(gfp_mask & __GFP_WAIT)) {
+	/*
+	 * Preload code isn't irq safe and it doesn't make sense to use
+	 * preloading in the interrupt anyway as all the allocations have to
+	 * be atomic. So just do normal allocation when in interrupt.
+ */ + if (!(gfp_mask & __GFP_WAIT) && !in_interrupt()) { struct radix_tree_preload *rtp; /* @@ -264,7 +270,7 @@ radix_tree_node_free(struct radix_tree_node *node) * To make use of this facility, the radix tree must be initialised without * __GFP_WAIT being passed to INIT_RADIX_TREE(). */ -int radix_tree_preload(gfp_t gfp_mask) +static int __radix_tree_preload(gfp_t gfp_mask) { struct radix_tree_preload *rtp; struct radix_tree_node *node; @@ -288,8 +294,39 @@ int radix_tree_preload(gfp_t gfp_mask) out: return ret; } + +/* + * Load up this CPU's radix_tree_node buffer with sufficient objects to + * ensure that the addition of a single element in the tree cannot fail. On + * success, return zero, with preemption disabled. On error, return -ENOMEM + * with preemption not disabled. + * + * To make use of this facility, the radix tree must be initialised without + * __GFP_WAIT being passed to INIT_RADIX_TREE(). + */ +int radix_tree_preload(gfp_t gfp_mask) +{ + /* Warn on non-sensical use... */ + WARN_ON_ONCE(!(gfp_mask & __GFP_WAIT)); + return __radix_tree_preload(gfp_mask); +} EXPORT_SYMBOL(radix_tree_preload); +/* + * The same as above function, except we don't guarantee preloading happens. + * We do it, if we decide it helps. On success, return zero with preemption + * disabled. On error, return -ENOMEM with preemption not disabled. + */ +int radix_tree_maybe_preload(gfp_t gfp_mask) +{ + if (gfp_mask & __GFP_WAIT) + return __radix_tree_preload(gfp_mask); + /* Preloading doesn't help anything with this gfp mask, skip it */ + preempt_disable(); + return 0; +} +EXPORT_SYMBOL(radix_tree_maybe_preload); + /* * Return the maximum key which can be store into a * radix tree with height HEIGHT. diff --git a/mm/filemap.c b/mm/filemap.c index 731a2c24532d..e607728db4a8 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -469,7 +469,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, if (error) goto out; - error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM); + error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); if (error == 0) { page_cache_get(page); page->mapping = mapping; diff --git a/mm/shmem.c b/mm/shmem.c index 526149846d0a..a1b8bf4391c2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1205,7 +1205,7 @@ repeat: gfp & GFP_RECLAIM_MASK); if (error) goto decused; - error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); + error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, gfp, NULL); diff --git a/mm/swap_state.c b/mm/swap_state.c index f24ab0dff554..e6f15f8ca2af 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -122,7 +122,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) { int error; - error = radix_tree_preload(gfp_mask); + error = radix_tree_maybe_preload(gfp_mask); if (!error) { error = __add_to_swap_cache(page, entry); radix_tree_preload_end(); @@ -328,7 +328,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* * call radix_tree_preload() while we can wait. */ - err = radix_tree_preload(gfp_mask & GFP_KERNEL); + err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL); if (err) break; From 137fdcc18a5979b53c0a1379b25fc68724e98a45 Mon Sep 17 00:00:00 2001 From: Rob Landley Date: Wed, 11 Sep 2013 14:26:06 -0700 Subject: [PATCH 283/303] initmpfs: replace MS_NOUSER in initramfs Mounting MS_NOUSER prevents --bind mounts from rootfs. Prevent new rootfs mounts with a different mechanism that doesn't affect bind mounts. 
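[Illustration, not part of the patch: the replacement mechanism is the
one-shot guard added to rootfs_mount() in the hunk below. test_and_set_bit()
returns the previous value of the bit, so only the first rootfs mount sees 0
and proceeds; every later attempt gets -ENODEV, while bind mounts never go
through this ->mount callback and are therefore unaffected. Sketch:

	static unsigned long once;

	if (test_and_set_bit(1, &once))	/* 0 on the first call, 1 after */
		return ERR_PTR(-ENODEV);
]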
Signed-off-by: Rob Landley Cc: Jeff Layton Cc: Jens Axboe Cc: Stephen Warren Cc: Rusty Russell Cc: Jim Cromie Cc: Sam Ravnborg Cc: Greg Kroah-Hartman Cc: "Eric W. Biederman" Cc: Alexander Viro Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ramfs/inode.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index c24f1e10b946..8f7fe323e049 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -247,7 +247,12 @@ struct dentry *ramfs_mount(struct file_system_type *fs_type, static struct dentry *rootfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_nodev(fs_type, flags|MS_NOUSER, data, ramfs_fill_super); + static unsigned long once; + + if (test_and_set_bit(1, &once)) + return ERR_PTR(-ENODEV); + + return mount_nodev(fs_type, flags, data, ramfs_fill_super); } static void ramfs_kill_sb(struct super_block *sb) From 4bbee76bc986af326be0a84ad661000cf89b29f6 Mon Sep 17 00:00:00 2001 From: Rob Landley Date: Wed, 11 Sep 2013 14:26:08 -0700 Subject: [PATCH 284/303] initmpfs: move bdi setup from init_rootfs to init_ramfs Even though ramfs hasn't got a backing device, commit e0bf68ddec4f ("mm: bdi init hooks") added one anyway, and put the initialization in init_rootfs() since that's the first user, leaving it out of init_ramfs() to avoid duplication. But initmpfs uses init_tmpfs() instead, so move the init into the filesystem's init function, add a "once" guard to prevent duplicate initialization, and call the filesystem init from rootfs init. This goes part of the way to allowing ramfs to be built as a module. [akpm@linux-foundation.org; using bit 1 was odd] Signed-off-by: Rob Landley Cc: Jeff Layton Cc: Jens Axboe Cc: Stephen Warren Cc: Rusty Russell Cc: Jim Cromie Cc: Sam Ravnborg Cc: Greg Kroah-Hartman Cc: "Eric W. Biederman" Cc: Alexander Viro Cc: "H. 
Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ramfs/inode.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 8f7fe323e049..fb99863598be 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -249,7 +249,7 @@ static struct dentry *rootfs_mount(struct file_system_type *fs_type, { static unsigned long once; - if (test_and_set_bit(1, &once)) + if (test_and_set_bit(0, &once)) return ERR_PTR(-ENODEV); return mount_nodev(fs_type, flags, data, ramfs_fill_super); @@ -275,21 +275,34 @@ static struct file_system_type rootfs_fs_type = { static int __init init_ramfs_fs(void) { - return register_filesystem(&ramfs_fs_type); -} -module_init(init_ramfs_fs) - -int __init init_rootfs(void) -{ + static unsigned long once; int err; + if (test_and_set_bit(0, &once)) + return 0; + err = bdi_init(&ramfs_backing_dev_info); if (err) return err; - err = register_filesystem(&rootfs_fs_type); + err = register_filesystem(&ramfs_fs_type); if (err) bdi_destroy(&ramfs_backing_dev_info); return err; } +module_init(init_ramfs_fs) + +int __init init_rootfs(void) +{ + int err = register_filesystem(&rootfs_fs_type); + + if (err) + return err; + + err = init_ramfs_fs(); + if (err) + unregister_filesystem(&rootfs_fs_type); + + return err; +} From 57f150a58c40cda598c31af8bceb8598f43c3e5f Mon Sep 17 00:00:00 2001 From: Rob Landley Date: Wed, 11 Sep 2013 14:26:10 -0700 Subject: [PATCH 285/303] initmpfs: move rootfs code from fs/ramfs/ to init/ When the rootfs code was a wrapper around ramfs, having them in the same file made sense. Now that it can wrap another filesystem type, move it in with the init code instead. This also allows a subsequent patch to access rootfstype= command line arg. Signed-off-by: Rob Landley Cc: Jeff Layton Cc: Jens Axboe Cc: Stephen Warren Cc: Rusty Russell Cc: Jim Cromie Cc: Sam Ravnborg Cc: Greg Kroah-Hartman Cc: "Eric W. Biederman" Cc: Alexander Viro Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namespace.c | 2 +- fs/ramfs/inode.c | 32 +------------------------------- include/linux/init.h | 1 + include/linux/ramfs.h | 2 +- init/do_mounts.c | 32 ++++++++++++++++++++++++++++++++ 5 files changed, 36 insertions(+), 33 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 25845d1b300b..da5c49483430 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -17,7 +17,7 @@ #include #include #include /* acct_auto_close_mnt */ -#include /* init_rootfs */ +#include /* init_rootfs */ #include /* get_fs_root et.al. 
*/ #include /* fsnotify_vfsmount_delete */ #include diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index fb99863598be..39d14659a8d3 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -244,17 +244,6 @@ struct dentry *ramfs_mount(struct file_system_type *fs_type, return mount_nodev(fs_type, flags, data, ramfs_fill_super); } -static struct dentry *rootfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - static unsigned long once; - - if (test_and_set_bit(0, &once)) - return ERR_PTR(-ENODEV); - - return mount_nodev(fs_type, flags, data, ramfs_fill_super); -} - static void ramfs_kill_sb(struct super_block *sb) { kfree(sb->s_fs_info); @@ -267,13 +256,8 @@ static struct file_system_type ramfs_fs_type = { .kill_sb = ramfs_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; -static struct file_system_type rootfs_fs_type = { - .name = "rootfs", - .mount = rootfs_mount, - .kill_sb = kill_litter_super, -}; -static int __init init_ramfs_fs(void) +int __init init_ramfs_fs(void) { static unsigned long once; int err; @@ -292,17 +276,3 @@ static int __init init_ramfs_fs(void) return err; } module_init(init_ramfs_fs) - -int __init init_rootfs(void) -{ - int err = register_filesystem(&rootfs_fs_type); - - if (err) - return err; - - err = init_ramfs_fs(); - if (err) - unregister_filesystem(&rootfs_fs_type); - - return err; -} diff --git a/include/linux/init.h b/include/linux/init.h index e73f2b708525..f1c27a71d03c 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -153,6 +153,7 @@ extern unsigned int reset_devices; void setup_arch(char **); void prepare_namespace(void); void __init load_default_modules(void); +int __init init_rootfs(void); extern void (*late_time_init)(void); diff --git a/include/linux/ramfs.h b/include/linux/ramfs.h index 69e37c2d1ea5..753207c8ce20 100644 --- a/include/linux/ramfs.h +++ b/include/linux/ramfs.h @@ -25,7 +25,7 @@ extern int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma); extern const struct file_operations ramfs_file_operations; extern const struct vm_operations_struct generic_file_vm_ops; -extern int __init init_rootfs(void); +extern int __init init_ramfs_fs(void); int ramfs_fill_super(struct super_block *sb, void *data, int silent); diff --git a/init/do_mounts.c b/init/do_mounts.c index 816014c4627e..5d8d48fd0ee4 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -588,3 +589,34 @@ out: sys_mount(".", "/", NULL, MS_MOVE, NULL); sys_chroot("."); } + +static struct dentry *rootfs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + static unsigned long once; + + if (test_and_set_bit(0, &once)) + return ERR_PTR(-ENODEV); + + return mount_nodev(fs_type, flags, data, ramfs_fill_super); +} + +static struct file_system_type rootfs_fs_type = { + .name = "rootfs", + .mount = rootfs_mount, + .kill_sb = kill_litter_super, +}; + +int __init init_rootfs(void) +{ + int err = register_filesystem(&rootfs_fs_type); + + if (err) + return err; + + err = init_ramfs_fs(); + if (err) + unregister_filesystem(&rootfs_fs_type); + + return err; +} From 16203a7a9422315bc929461503e3a046459ea5ff Mon Sep 17 00:00:00 2001 From: Rob Landley Date: Wed, 11 Sep 2013 14:26:12 -0700 Subject: [PATCH 286/303] initmpfs: make rootfs use tmpfs when CONFIG_TMPFS enabled Conditionally call the appropriate fs_init function and fill_super functions. Add a use once guard to shmem_init() to simply succeed on a second call. 
(Note that IS_ENABLED() is a compile time constant so dead code elimination removes unused function calls when CONFIG_TMPFS is disabled.) Signed-off-by: Rob Landley Cc: Jeff Layton Cc: Jens Axboe Cc: Stephen Warren Cc: Rusty Russell Cc: Jim Cromie Cc: Sam Ravnborg Cc: Greg Kroah-Hartman Cc: "Eric W. Biederman" Cc: Alexander Viro Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/do_mounts.c | 10 ++++++++-- mm/shmem.c | 4 ++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/init/do_mounts.c b/init/do_mounts.c index 5d8d48fd0ee4..e27908b949d4 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -598,7 +599,8 @@ static struct dentry *rootfs_mount(struct file_system_type *fs_type, if (test_and_set_bit(0, &once)) return ERR_PTR(-ENODEV); - return mount_nodev(fs_type, flags, data, ramfs_fill_super); + return mount_nodev(fs_type, flags, data, + IS_ENABLED(CONFIG_TMPFS) ? shmem_fill_super : ramfs_fill_super); } static struct file_system_type rootfs_fs_type = { @@ -614,7 +616,11 @@ int __init init_rootfs(void) if (err) return err; - err = init_ramfs_fs(); + if (IS_ENABLED(CONFIG_TMPFS)) + err = shmem_init(); + else + err = init_ramfs_fs(); + if (err) unregister_filesystem(&rootfs_fs_type); diff --git a/mm/shmem.c b/mm/shmem.c index a1b8bf4391c2..8297623fcaed 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2819,6 +2819,10 @@ int __init shmem_init(void) { int error; + /* If rootfs called this, don't re-init */ + if (shmem_inode_cachep) + return 0; + error = bdi_init(&shmem_backing_dev_info); if (error) goto out4; From 6e19eded3684dc184181093af3bff2ff440f5b53 Mon Sep 17 00:00:00 2001 From: Rob Landley Date: Wed, 11 Sep 2013 14:26:13 -0700 Subject: [PATCH 287/303] initmpfs: use initramfs if rootfstype= or root= specified Command line option rootfstype=ramfs to obtain old initramfs behavior, and use ramfs instead of tmpfs for stub when root= defined (for cosmetic reasons). [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Rob Landley Cc: Jeff Layton Cc: Jens Axboe Cc: Stephen Warren Cc: Rusty Russell Cc: Jim Cromie Cc: Sam Ravnborg Cc: Greg Kroah-Hartman Cc: "Eric W. Biederman" Cc: Alexander Viro Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../filesystems/ramfs-rootfs-initramfs.txt | 4 ++++ init/do_mounts.c | 15 +++++++++++---- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/Documentation/filesystems/ramfs-rootfs-initramfs.txt b/Documentation/filesystems/ramfs-rootfs-initramfs.txt index 59b4a0962e0f..b176928e6963 100644 --- a/Documentation/filesystems/ramfs-rootfs-initramfs.txt +++ b/Documentation/filesystems/ramfs-rootfs-initramfs.txt @@ -79,6 +79,10 @@ to just make sure certain lists can't become empty. Most systems just mount another filesystem over rootfs and ignore it. The amount of space an empty instance of ramfs takes up is tiny. +If CONFIG_TMPFS is enabled, rootfs will use tmpfs instead of ramfs by +default. To force ramfs, add "rootfstype=ramfs" to the kernel command +line. + What is initramfs? 
------------------ diff --git a/init/do_mounts.c b/init/do_mounts.c index e27908b949d4..a51cddc2ff8c 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -591,16 +591,20 @@ out: sys_chroot("."); } +static bool is_tmpfs; static struct dentry *rootfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { static unsigned long once; + void *fill = ramfs_fill_super; if (test_and_set_bit(0, &once)) return ERR_PTR(-ENODEV); - return mount_nodev(fs_type, flags, data, - IS_ENABLED(CONFIG_TMPFS) ? shmem_fill_super : ramfs_fill_super); + if (IS_ENABLED(CONFIG_TMPFS) && is_tmpfs) + fill = shmem_fill_super; + + return mount_nodev(fs_type, flags, data, fill); } static struct file_system_type rootfs_fs_type = { @@ -616,10 +620,13 @@ int __init init_rootfs(void) if (err) return err; - if (IS_ENABLED(CONFIG_TMPFS)) + if (IS_ENABLED(CONFIG_TMPFS) && !saved_root_name[0] && + (!root_fs_names || strstr(root_fs_names, "tmpfs"))) { err = shmem_init(); - else + is_tmpfs = true; + } else { err = init_ramfs_fs(); + } if (err) unregister_filesystem(&rootfs_fs_type); From 8b8d52ac382b17a19906b930cd69e2edb0aca8ba Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:15 -0700 Subject: [PATCH 288/303] ipc,shm: introduce lockless functions to obtain the ipc object This is the third and final patchset that deals with reducing the amount of contention we impose on the ipc lock (kern_ipc_perm.lock). These changes mostly deal with shared memory, previous work has already been done for semaphores and message queues: http://lkml.org/lkml/2013/3/20/546 (sems) http://lkml.org/lkml/2013/5/15/584 (mqueues) With these patches applied, a custom shm microbenchmark stressing shmctl doing IPC_STAT with 4 threads a million times, reduces the execution time by 50%. A similar run, this time with IPC_SET, reduces the execution time from 3 mins and 35 secs to 27 seconds. Patches 1-8: replaces blindly taking the ipc lock for a smarter combination of rcu and ipc_obtain_object, only acquiring the spinlock when updating. Patch 9: renames the ids rw_mutex to rwsem, which is what it already was. Patch 10: is a trivial mqueue leftover cleanup Patch 11: adds a brief lock scheme description, requested by Andrew. This patch: Add shm_obtain_object() and shm_obtain_object_check(), which will allow us to get the ipc object without acquiring the lock. Just as with other forms of ipc, these functions are basically wrappers around ipc_obtain_object*(). 
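[Illustration, not part of the patch: a sketch of the read-side pattern
these wrappers enable. The caller below is hypothetical, but the later
patches in this series use essentially this shape:

	rcu_read_lock();
	shp = shm_obtain_object_check(ns, shmid);
	if (IS_ERR(shp)) {
		err = PTR_ERR(shp);
		goto out_unlock;
	}
	/* read-mostly fields can be inspected under RCU alone; the
	 * fine-grained ipc lock is taken only to update the object */
	ipc_lock_object(&shp->shm_perm);
	/* ... modify shp ... */
	ipc_unlock_object(&shp->shm_perm);
out_unlock:
	rcu_read_unlock();
]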
Signed-off-by: Davidlohr Bueso Tested-by: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/ipc/shm.c b/ipc/shm.c index c6b4ad5ce3b7..216ae727a936 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -124,6 +124,26 @@ void __init shm_init (void) IPC_SHM_IDS, sysvipc_shm_proc_show); } +static inline struct shmid_kernel *shm_obtain_object(struct ipc_namespace *ns, int id) +{ + struct kern_ipc_perm *ipcp = ipc_obtain_object(&shm_ids(ns), id); + + if (IS_ERR(ipcp)) + return ERR_CAST(ipcp); + + return container_of(ipcp, struct shmid_kernel, shm_perm); +} + +static inline struct shmid_kernel *shm_obtain_object_check(struct ipc_namespace *ns, int id) +{ + struct kern_ipc_perm *ipcp = ipc_obtain_object_check(&shm_ids(ns), id); + + if (IS_ERR(ipcp)) + return ERR_CAST(ipcp); + + return container_of(ipcp, struct shmid_kernel, shm_perm); +} + /* * shm_lock_(check_) routines are called in the paths where the rw_mutex * is not necessarily held. From 79ccf0f8c8e04e8b9eda6645ba0f63b0915a3075 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:16 -0700 Subject: [PATCH 289/303] ipc,shm: shorten critical region in shmctl_down Instead of holding the ipc lock for the entire function, use ipcctl_pre_down_nolock() and only acquire the lock for specific commands: RMID and SET. Signed-off-by: Davidlohr Bueso Tested-by: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/ipc/shm.c b/ipc/shm.c index 216ae727a936..22cffd78dbb1 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -780,11 +780,10 @@ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, down_write(&shm_ids(ns).rw_mutex); rcu_read_lock(); - ipcp = ipcctl_pre_down(ns, &shm_ids(ns), shmid, cmd, - &shmid64.shm_perm, 0); + ipcp = ipcctl_pre_down_nolock(ns, &shm_ids(ns), shmid, cmd, + &shmid64.shm_perm, 0); if (IS_ERR(ipcp)) { err = PTR_ERR(ipcp); - /* the ipc lock is not held upon failure */ goto out_unlock1; } @@ -792,14 +791,16 @@ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, err = security_shm_shmctl(shp, cmd); if (err) - goto out_unlock0; + goto out_unlock1; switch (cmd) { case IPC_RMID: + ipc_lock_object(&shp->shm_perm); /* do_shm_rmid unlocks the ipc object and rcu */ do_shm_rmid(ns, ipcp); goto out_up; case IPC_SET: + ipc_lock_object(&shp->shm_perm); err = ipc_update_perm(&shmid64.shm_perm, ipcp); if (err) goto out_unlock0; @@ -807,6 +808,7 @@ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, break; default: err = -EINVAL; + goto out_unlock1; } out_unlock0: From 3b1c4ad37741e53804ffe0a30dd01e08b2ab6241 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:17 -0700 Subject: [PATCH 290/303] ipc: drop ipcctl_pre_down Now that sem, msgque and shm, through *_down(), all use the lockless variant of ipcctl_pre_down(), go ahead and delete it.
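For reference, the caller-side pattern that replaces ipcctl_pre_down() looks roughly like this (a condensed sketch of the *_down() functions converted above; ids, perm64 and the labels stand in for each caller's actual variables):

	down_write(&ids->rw_mutex);
	rcu_read_lock();

	ipcp = ipcctl_pre_down_nolock(ns, ids, id, cmd, &perm64, 0);
	if (IS_ERR(ipcp)) {
		err = PTR_ERR(ipcp);
		goto out_unlock1;	/* no ipc lock to undo on failure */
	}

	/* audit/permission/security checks run under rcu only */

	ipc_lock_object(ipcp);	/* taken just for the commands that update */
	/* ... IPC_RMID / IPC_SET style updates ... */
	ipc_unlock_object(ipcp);
out_unlock1:
	rcu_read_unlock();
	up_write(&ids->rw_mutex);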
[akpm@linux-foundation.org: fix function name in kerneldoc, cleanups] Signed-off-by: Davidlohr Bueso Tested-by: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/util.c | 24 ++++-------------------- ipc/util.h | 3 --- 2 files changed, 4 insertions(+), 23 deletions(-) diff --git a/ipc/util.c b/ipc/util.c index 4704223bfad4..2c8a93b380ba 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -733,7 +733,7 @@ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out) } /** - * ipcctl_pre_down - retrieve an ipc and check permissions for some IPC_XXX cmd + * ipcctl_pre_down_nolock - retrieve an ipc and check permissions for some IPC_XXX cmd * @ns: the ipc namespace * @ids: the table of ids where to look for the ipc * @id: the id of the ipc to retrieve @@ -746,29 +746,13 @@ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out) * It must be called without any lock held and * - retrieves the ipc with the given id in the given table. * - performs some audit and permission check, depending on the given cmd - * - returns the ipc with the ipc lock held in case of success - * or an err-code without any lock held otherwise. + * - returns a pointer to the ipc object or otherwise, the corresponding error. * * Call holding the both the rw_mutex and the rcu read lock. */ -struct kern_ipc_perm *ipcctl_pre_down(struct ipc_namespace *ns, - struct ipc_ids *ids, int id, int cmd, - struct ipc64_perm *perm, int extra_perm) -{ - struct kern_ipc_perm *ipcp; - - ipcp = ipcctl_pre_down_nolock(ns, ids, id, cmd, perm, extra_perm); - if (IS_ERR(ipcp)) - goto out; - - spin_lock(&ipcp->lock); -out: - return ipcp; -} - struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns, - struct ipc_ids *ids, int id, int cmd, - struct ipc64_perm *perm, int extra_perm) + struct ipc_ids *ids, int id, int cmd, + struct ipc64_perm *perm, int extra_perm) { kuid_t euid; int err = -EPERM; diff --git a/ipc/util.h b/ipc/util.h index b6a6a88f3002..41a6c4d26399 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -131,9 +131,6 @@ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out); struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns, struct ipc_ids *ids, int id, int cmd, struct ipc64_perm *perm, int extra_perm); -struct kern_ipc_perm *ipcctl_pre_down(struct ipc_namespace *ns, - struct ipc_ids *ids, int id, int cmd, - struct ipc64_perm *perm, int extra_perm); #ifndef CONFIG_ARCH_WANT_IPC_PARSE_VERSION /* On IA-64, we always use the "64-bit version" of the IPC structures. */ From 68eccc1dc345539d589ae78ee43b835c1a06a134 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:18 -0700 Subject: [PATCH 291/303] ipc,shm: introduce shmctl_nolock Similar to semctl and msgctl, when calling shmctl, the *_INFO and *_STAT commands can be performed without acquiring the ipc object lock. Add a shmctl_nolock() function and move the logic of *_INFO and *_STAT out of shmctl(). Since we are just moving functionality, this change still takes the lock and it will be properly lockless in the next patch.
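Abridged, the dispatch that shmctl() converges on once this and the following patches are applied looks like the sketch below (the real code is in the subsequent diffs):

	SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf)
	{
		...
		switch (cmd) {
		case IPC_INFO:
		case SHM_INFO:
		case SHM_STAT:
		case IPC_STAT:
			/* read-only: no ipc spinlock needed */
			return shmctl_nolock(ns, shmid, cmd, version, buf);
		case IPC_RMID:
		case IPC_SET:
			/* updates: rw_mutex writer plus the ipc lock */
			return shmctl_down(ns, shmid, cmd, buf, version);
		case SHM_LOCK:
		case SHM_UNLOCK:
			/* handled inline: rcu lookup and checks, then the ipc lock */
			...
		}
	}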
Signed-off-by: Davidlohr Bueso Tested-by: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 61 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 20 deletions(-) diff --git a/ipc/shm.c b/ipc/shm.c index 22cffd78dbb1..3e123987f054 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -820,28 +820,23 @@ out_up: return err; } -SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) +static int shmctl_nolock(struct ipc_namespace *ns, int shmid, + int cmd, int version, void __user *buf) { + int err; struct shmid_kernel *shp; - int err, version; - struct ipc_namespace *ns; - - if (cmd < 0 || shmid < 0) { - err = -EINVAL; - goto out; - } - - version = ipc_parse_version(&cmd); - ns = current->nsproxy->ipc_ns; - - switch (cmd) { /* replace with proc interface ? */ - case IPC_INFO: - { - struct shminfo64 shminfo; + /* preliminary security checks for *_INFO */ + if (cmd == IPC_INFO || cmd == SHM_INFO) { err = security_shm_shmctl(NULL, cmd); if (err) return err; + } + + switch (cmd) { + case IPC_INFO: + { + struct shminfo64 shminfo; memset(&shminfo, 0, sizeof(shminfo)); shminfo.shmmni = shminfo.shmseg = ns->shm_ctlmni; @@ -864,10 +859,6 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) { struct shm_info shm_info; - err = security_shm_shmctl(NULL, cmd); - if (err) - return err; - memset(&shm_info, 0, sizeof(shm_info)); down_read(&shm_ids(ns).rw_mutex); shm_info.used_ids = shm_ids(ns).in_use; @@ -928,6 +919,36 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) err = result; goto out; } + default: + return -EINVAL; + } + +out_unlock: + shm_unlock(shp); +out: + return err; +} + +SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) +{ + struct shmid_kernel *shp; + int err, version; + struct ipc_namespace *ns; + + if (cmd < 0 || shmid < 0) { + err = -EINVAL; + goto out; + } + + version = ipc_parse_version(&cmd); + ns = current->nsproxy->ipc_ns; + + switch (cmd) { + case IPC_INFO: + case SHM_INFO: + case SHM_STAT: + case IPC_STAT: + return shmctl_nolock(ns, shmid, cmd, version, buf); case SHM_LOCK: case SHM_UNLOCK: { From c97cb9ccab8c85428ec21eff690642ad2ce1fa8a Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:20 -0700 Subject: [PATCH 292/303] ipc,shm: make shmctl_nolock lockless While the INFO cmd doesn't take the ipc lock, the STAT commands do acquire it unnecessarily. We can do the permissions and security checks only holding the rcu lock. 
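The key to the conversion, condensed from the diff below: the relevant fields are snapshotted into a local shmid64_ds while under rcu, and the possibly-sleeping copy_to_user() only runs after rcu_read_unlock() (a sketch with error paths trimmed):

	rcu_read_lock();
	shp = shm_obtain_object_check(ns, shmid);	/* no spinlock taken */

	/* permission and security checks only read the object */
	err = -EACCES;
	if (ipcperms(ns, &shp->shm_perm, S_IRUGO))
		goto out_unlock;

	memset(&tbuf, 0, sizeof(tbuf));
	kernel_to_ipc64_perm(&shp->shm_perm, &tbuf.shm_perm);	/* snapshot */
	/* ... the remaining shm_* fields are copied the same way ... */
	rcu_read_unlock();

	/* copy_to_user() may sleep, so it happens outside the rcu section */
	if (copy_shmid_to_user(buf, &tbuf, version))
		err = -EFAULT;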
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Davidlohr Bueso Tested-by: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/ipc/shm.c b/ipc/shm.c index 3e123987f054..a493639550d9 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -882,27 +882,31 @@ static int shmctl_nolock(struct ipc_namespace *ns, int shmid, struct shmid64_ds tbuf; int result; + rcu_read_lock(); if (cmd == SHM_STAT) { - shp = shm_lock(ns, shmid); + shp = shm_obtain_object(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); - goto out; + goto out_unlock; } result = shp->shm_perm.id; } else { - shp = shm_lock_check(ns, shmid); + shp = shm_obtain_object_check(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); - goto out; + goto out_unlock; } result = 0; } + err = -EACCES; if (ipcperms(ns, &shp->shm_perm, S_IRUGO)) goto out_unlock; + err = security_shm_shmctl(shp, cmd); if (err) goto out_unlock; + memset(&tbuf, 0, sizeof(tbuf)); kernel_to_ipc64_perm(&shp->shm_perm, &tbuf.shm_perm); tbuf.shm_segsz = shp->shm_segsz; @@ -912,8 +916,9 @@ static int shmctl_nolock(struct ipc_namespace *ns, int shmid, tbuf.shm_cpid = shp->shm_cprid; tbuf.shm_lpid = shp->shm_lprid; tbuf.shm_nattch = shp->shm_nattch; - shm_unlock(shp); - if(copy_shmid_to_user (buf, &tbuf, version)) + rcu_read_unlock(); + + if (copy_shmid_to_user(buf, &tbuf, version)) err = -EFAULT; else err = result; @@ -924,7 +929,7 @@ static int shmctl_nolock(struct ipc_namespace *ns, int shmid, } out_unlock: - shm_unlock(shp); + rcu_read_unlock(); out: return err; } From 2caacaa82a51b78fc0c800e206473874094287ed Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:21 -0700 Subject: [PATCH 293/303] ipc,shm: shorten critical region for shmctl With the *_INFO, *_STAT, IPC_RMID and IPC_SET commands already optimized, deal with the remaining SHM_LOCK and SHM_UNLOCK commands. Take the shm_perm lock after doing the initial auditing and security checks. The rest of the logic remains unchanged. 
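Abridged, the new SHM_LOCK/SHM_UNLOCK ordering is (a sketch of the diff below, error paths trimmed):

	rcu_read_lock();
	shp = shm_obtain_object_check(ns, shmid);

	audit_ipc_obj(&shp->shm_perm);
	err = security_shm_shmctl(shp, cmd);	/* potentially expensive; no spinlock held */

	ipc_lock_object(&shp->shm_perm);	/* only now take kern_ipc_perm.lock */
	/* ... permission re-checks and SHM_LOCK/SHM_UNLOCK state updates ... */
	ipc_unlock_object(&shp->shm_perm);
	rcu_read_unlock();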
Signed-off-by: Davidlohr Bueso Tested-by: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 49 +++++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/ipc/shm.c b/ipc/shm.c index a493639550d9..8ec381085dec 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -940,10 +940,8 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) int err, version; struct ipc_namespace *ns; - if (cmd < 0 || shmid < 0) { - err = -EINVAL; - goto out; - } + if (cmd < 0 || shmid < 0) + return -EINVAL; version = ipc_parse_version(&cmd); ns = current->nsproxy->ipc_ns; @@ -954,36 +952,40 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) case SHM_STAT: case IPC_STAT: return shmctl_nolock(ns, shmid, cmd, version, buf); + case IPC_RMID: + case IPC_SET: + return shmctl_down(ns, shmid, cmd, buf, version); case SHM_LOCK: case SHM_UNLOCK: { struct file *shm_file; - shp = shm_lock_check(ns, shmid); + rcu_read_lock(); + shp = shm_obtain_object_check(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); - goto out; + goto out_unlock1; } audit_ipc_obj(&(shp->shm_perm)); + err = security_shm_shmctl(shp, cmd); + if (err) + goto out_unlock1; + ipc_lock_object(&shp->shm_perm); if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) { kuid_t euid = current_euid(); err = -EPERM; if (!uid_eq(euid, shp->shm_perm.uid) && !uid_eq(euid, shp->shm_perm.cuid)) - goto out_unlock; + goto out_unlock0; if (cmd == SHM_LOCK && !rlimit(RLIMIT_MEMLOCK)) - goto out_unlock; + goto out_unlock0; } - err = security_shm_shmctl(shp, cmd); - if (err) - goto out_unlock; - shm_file = shp->shm_file; if (is_file_hugepages(shm_file)) - goto out_unlock; + goto out_unlock0; if (cmd == SHM_LOCK) { struct user_struct *user = current_user(); @@ -992,32 +994,31 @@ SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf) shp->shm_perm.mode |= SHM_LOCKED; shp->mlock_user = user; } - goto out_unlock; + goto out_unlock0; } /* SHM_UNLOCK */ if (!(shp->shm_perm.mode & SHM_LOCKED)) - goto out_unlock; + goto out_unlock0; shmem_lock(shm_file, 0, shp->mlock_user); shp->shm_perm.mode &= ~SHM_LOCKED; shp->mlock_user = NULL; get_file(shm_file); - shm_unlock(shp); + ipc_unlock_object(&shp->shm_perm); + rcu_read_unlock(); shmem_unlock_mapping(shm_file->f_mapping); + fput(shm_file); - goto out; - } - case IPC_RMID: - case IPC_SET: - err = shmctl_down(ns, shmid, cmd, buf, version); return err; + } default: return -EINVAL; } -out_unlock: - shm_unlock(shp); -out: +out_unlock0: + ipc_unlock_object(&shp->shm_perm); +out_unlock1: + rcu_read_unlock(); return err; } From f42569b1388b1408b574a5e93a23a663647d4181 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:22 -0700 Subject: [PATCH 294/303] ipc,shm: cleanup do_shmat pasta Clean up some of the messy do_shmat() spaghetti code, getting rid of out_free and out_put_dentry labels. This makes shortening the critical region of this function in the next patch a little easier to do and read. 
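Reduced to its essence, the cleanup turns shared tail labels into failure sites that release exactly what they themselves acquired (an illustrative reduction of the hunks below):

	/* before: failure paths funnel through chained cleanup labels */
	if (!sfd)
		goto out_put_dentry;
	...
out_free:
	kfree(sfd);
out_put_dentry:
	path_put(&path);
	goto out_nattch;

	/* after: each failure site cleans up inline and jumps straight out */
	if (!sfd) {
		path_put(&path);
		goto out_nattch;
	}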
Signed-off-by: Davidlohr Bueso Tested-by: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/ipc/shm.c b/ipc/shm.c index 8ec381085dec..115dccebc63e 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -1108,16 +1108,21 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, err = -ENOMEM; sfd = kzalloc(sizeof(*sfd), GFP_KERNEL); - if (!sfd) - goto out_put_dentry; + if (!sfd) { + path_put(&path); + goto out_nattch; + } file = alloc_file(&path, f_mode, is_file_hugepages(shp->shm_file) ? &shm_file_operations_huge : &shm_file_operations); err = PTR_ERR(file); - if (IS_ERR(file)) - goto out_free; + if (IS_ERR(file)) { + kfree(sfd); + path_put(&path); + goto out_nattch; + } file->private_data = sfd; file->f_mapping = shp->shm_file->f_mapping; @@ -1143,7 +1148,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, addr > current->mm->start_stack - size - PAGE_SIZE * 5) goto invalid; } - + addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate); *raddr = addr; err = 0; @@ -1167,19 +1172,12 @@ out_nattch: else shm_unlock(shp); up_write(&shm_ids(ns).rw_mutex); - -out: return err; out_unlock: shm_unlock(shp); - goto out; - -out_free: - kfree(sfd); -out_put_dentry: - path_put(&path); - goto out_nattch; +out: return err; } SYSCALL_DEFINE3(shmat, int, shmid, char __user *, shmaddr, int, shmflg) From c2c737a0461e61a34676bd0bd1bc1a70a1b4e396 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:23 -0700 Subject: [PATCH 295/303] ipc,shm: shorten critical region for shmat Similar to other system calls, acquire the kern_ipc_perm lock after doing the initial permission and security checks. [sasha.levin@oracle.com: don't leave do_shmat with rcu lock held] Signed-off-by: Davidlohr Bueso Tested-by: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/ipc/shm.c b/ipc/shm.c index 115dccebc63e..28d19f4ece4b 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -19,6 +19,9 @@ * namespaces support * OpenVZ, SWsoft Inc. * Pavel Emelianov + * + * Better ipc lock (kern_ipc_perm.lock) handling + * Davidlohr Bueso , June 2013. */ #include @@ -1086,10 +1089,11 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, * additional creator id...
*/ ns = current->nsproxy->ipc_ns; - shp = shm_lock_check(ns, shmid); + rcu_read_lock(); + shp = shm_obtain_object_check(ns, shmid); if (IS_ERR(shp)) { err = PTR_ERR(shp); - goto out; + goto out_unlock; } err = -EACCES; @@ -1100,11 +1104,13 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, if (err) goto out_unlock; + ipc_lock_object(&shp->shm_perm); path = shp->shm_file->f_path; path_get(&path); shp->shm_nattch++; size = i_size_read(path.dentry->d_inode); - shm_unlock(shp); + ipc_unlock_object(&shp->shm_perm); + rcu_read_unlock(); err = -ENOMEM; sfd = kzalloc(sizeof(*sfd), GFP_KERNEL); @@ -1175,7 +1181,7 @@ out_nattch: return err; out_unlock: - shm_unlock(shp); + rcu_read_unlock(); out: return err; } From d9a605e40b1376eb02b067d7690580255a0df68f Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:24 -0700 Subject: [PATCH 296/303] ipc: rename ids->rw_mutex Since in some situations the lock can be shared for readers, we shouldn't be calling it a mutex, rename it to rwsem. Signed-off-by: Davidlohr Bueso Tested-by: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ipc_namespace.h | 2 +- ipc/msg.c | 20 ++++++------- ipc/namespace.c | 4 +-- ipc/sem.c | 24 +++++++-------- ipc/shm.c | 56 +++++++++++++++++------------------ ipc/util.c | 28 +++++++++--------- ipc/util.h | 4 +-- 7 files changed, 69 insertions(+), 69 deletions(-) diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index c4d870b0d5e6..19c19a5eee29 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -22,7 +22,7 @@ struct ipc_ids { int in_use; unsigned short seq; unsigned short seq_max; - struct rw_semaphore rw_mutex; + struct rw_semaphore rwsem; struct idr ipcs_idr; int next_id; }; diff --git a/ipc/msg.c b/ipc/msg.c index b65fdf1a09dd..8203e71bcfbc 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -172,7 +172,7 @@ static inline void msg_rmid(struct ipc_namespace *ns, struct msg_queue *s) * @ns: namespace * @params: ptr to the structure that contains the key and msgflg * - * Called with msg_ids.rw_mutex held (writer) + * Called with msg_ids.rwsem held (writer) */ static int newque(struct ipc_namespace *ns, struct ipc_params *params) { @@ -259,8 +259,8 @@ static void expunge_all(struct msg_queue *msq, int res) * removes the message queue from message queue ID IDR, and cleans up all the * messages associated with this queue. * - * msg_ids.rw_mutex (writer) and the spinlock for this message queue are held - * before freeque() is called. msg_ids.rw_mutex remains locked on exit. + * msg_ids.rwsem (writer) and the spinlock for this message queue are held + * before freeque() is called. msg_ids.rwsem remains locked on exit. */ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) { @@ -282,7 +282,7 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) } /* - * Called with msg_ids.rw_mutex and ipcp locked. + * Called with msg_ids.rwsem and ipcp locked. */ static inline int msg_security(struct kern_ipc_perm *ipcp, int msgflg) { @@ -386,9 +386,9 @@ copy_msqid_from_user(struct msqid64_ds *out, void __user *buf, int version) } /* - * This function handles some msgctl commands which require the rw_mutex + * This function handles some msgctl commands which require the rwsem * to be held in write mode. - * NOTE: no locks must be held, the rw_mutex is taken inside this function. 
+ * NOTE: no locks must be held, the rwsem is taken inside this function. */ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, struct msqid_ds __user *buf, int version) @@ -403,7 +403,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, return -EFAULT; } - down_write(&msg_ids(ns).rw_mutex); + down_write(&msg_ids(ns).rwsem); rcu_read_lock(); ipcp = ipcctl_pre_down_nolock(ns, &msg_ids(ns), msqid, cmd, @@ -459,7 +459,7 @@ out_unlock0: out_unlock1: rcu_read_unlock(); out_up: - up_write(&msg_ids(ns).rw_mutex); + up_write(&msg_ids(ns).rwsem); return err; } @@ -494,7 +494,7 @@ static int msgctl_nolock(struct ipc_namespace *ns, int msqid, msginfo.msgmnb = ns->msg_ctlmnb; msginfo.msgssz = MSGSSZ; msginfo.msgseg = MSGSEG; - down_read(&msg_ids(ns).rw_mutex); + down_read(&msg_ids(ns).rwsem); if (cmd == MSG_INFO) { msginfo.msgpool = msg_ids(ns).in_use; msginfo.msgmap = atomic_read(&ns->msg_hdrs); @@ -505,7 +505,7 @@ static int msgctl_nolock(struct ipc_namespace *ns, int msqid, msginfo.msgtql = MSGTQL; } max_id = ipc_get_maxid(&msg_ids(ns)); - up_read(&msg_ids(ns).rw_mutex); + up_read(&msg_ids(ns).rwsem); if (copy_to_user(buf, &msginfo, sizeof(struct msginfo))) return -EFAULT; return (max_id < 0) ? 0 : max_id; diff --git a/ipc/namespace.c b/ipc/namespace.c index 4be6581d3b7f..d43d9384bb2d 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -81,7 +81,7 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, int next_id; int total, in_use; - down_write(&ids->rw_mutex); + down_write(&ids->rwsem); in_use = ids->in_use; @@ -93,7 +93,7 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, free(ns, perm); total++; } - up_write(&ids->rw_mutex); + up_write(&ids->rwsem); } static void free_ipc_ns(struct ipc_namespace *ns) diff --git a/ipc/sem.c b/ipc/sem.c index 41088899783d..69b6a21f3844 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -322,7 +322,7 @@ static inline void sem_unlock(struct sem_array *sma, int locknum) } /* - * sem_lock_(check_) routines are called in the paths where the rw_mutex + * sem_lock_(check_) routines are called in the paths where the rwsem * is not held. * * The caller holds the RCU read lock. @@ -426,7 +426,7 @@ static inline void sem_rmid(struct ipc_namespace *ns, struct sem_array *s) * @ns: namespace * @params: ptr to the structure that contains key, semflg and nsems * - * Called with sem_ids.rw_mutex held (as a writer) + * Called with sem_ids.rwsem held (as a writer) */ static int newary(struct ipc_namespace *ns, struct ipc_params *params) @@ -492,7 +492,7 @@ static int newary(struct ipc_namespace *ns, struct ipc_params *params) /* - * Called with sem_ids.rw_mutex and ipcp locked. + * Called with sem_ids.rwsem and ipcp locked. */ static inline int sem_security(struct kern_ipc_perm *ipcp, int semflg) { @@ -503,7 +503,7 @@ static inline int sem_security(struct kern_ipc_perm *ipcp, int semflg) } /* - * Called with sem_ids.rw_mutex and ipcp locked. + * Called with sem_ids.rwsem and ipcp locked. */ static inline int sem_more_checks(struct kern_ipc_perm *ipcp, struct ipc_params *params) @@ -994,8 +994,8 @@ static int count_semzcnt (struct sem_array * sma, ushort semnum) return semzcnt; } -/* Free a semaphore set. freeary() is called with sem_ids.rw_mutex locked - * as a writer and the spinlock for this semaphore set hold. sem_ids.rw_mutex +/* Free a semaphore set. freeary() is called with sem_ids.rwsem locked + * as a writer and the spinlock for this semaphore set hold. sem_ids.rwsem * remains locked on exit. 
*/ static void freeary(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) @@ -1116,7 +1116,7 @@ static int semctl_nolock(struct ipc_namespace *ns, int semid, seminfo.semmnu = SEMMNU; seminfo.semmap = SEMMAP; seminfo.semume = SEMUME; - down_read(&sem_ids(ns).rw_mutex); + down_read(&sem_ids(ns).rwsem); if (cmd == SEM_INFO) { seminfo.semusz = sem_ids(ns).in_use; seminfo.semaem = ns->used_sems; @@ -1125,7 +1125,7 @@ static int semctl_nolock(struct ipc_namespace *ns, int semid, seminfo.semaem = SEMAEM; } max_id = ipc_get_maxid(&sem_ids(ns)); - up_read(&sem_ids(ns).rw_mutex); + up_read(&sem_ids(ns).rwsem); if (copy_to_user(p, &seminfo, sizeof(struct seminfo))) return -EFAULT; return (max_id < 0) ? 0: max_id; @@ -1431,9 +1431,9 @@ copy_semid_from_user(struct semid64_ds *out, void __user *buf, int version) } /* - * This function handles some semctl commands which require the rw_mutex + * This function handles some semctl commands which require the rwsem * to be held in write mode. - * NOTE: no locks must be held, the rw_mutex is taken inside this function. + * NOTE: no locks must be held, the rwsem is taken inside this function. */ static int semctl_down(struct ipc_namespace *ns, int semid, int cmd, int version, void __user *p) @@ -1448,7 +1448,7 @@ static int semctl_down(struct ipc_namespace *ns, int semid, return -EFAULT; } - down_write(&sem_ids(ns).rw_mutex); + down_write(&sem_ids(ns).rwsem); rcu_read_lock(); ipcp = ipcctl_pre_down_nolock(ns, &sem_ids(ns), semid, cmd, @@ -1487,7 +1487,7 @@ out_unlock0: out_unlock1: rcu_read_unlock(); out_up: - up_write(&sem_ids(ns).rw_mutex); + up_write(&sem_ids(ns).rwsem); return err; } diff --git a/ipc/shm.c b/ipc/shm.c index 28d19f4ece4b..cb2cedaa8808 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -83,8 +83,8 @@ void shm_init_ns(struct ipc_namespace *ns) } /* - * Called with shm_ids.rw_mutex (writer) and the shp structure locked. - * Only shm_ids.rw_mutex remains locked on exit. + * Called with shm_ids.rwsem (writer) and the shp structure locked. + * Only shm_ids.rwsem remains locked on exit. */ static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) { @@ -148,7 +148,7 @@ static inline struct shmid_kernel *shm_obtain_object_check(struct ipc_namespace } /* - * shm_lock_(check_) routines are called in the paths where the rw_mutex + * shm_lock_(check_) routines are called in the paths where the rwsem * is not necessarily held. */ static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id) @@ -205,7 +205,7 @@ static void shm_open(struct vm_area_struct *vma) * @ns: namespace * @shp: struct to free * - * It has to be called with shp and shm_ids.rw_mutex (writer) locked, + * It has to be called with shp and shm_ids.rwsem (writer) locked, * but returns with shp unlocked and freed. 
*/ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) @@ -253,7 +253,7 @@ static void shm_close(struct vm_area_struct *vma) struct shmid_kernel *shp; struct ipc_namespace *ns = sfd->ns; - down_write(&shm_ids(ns).rw_mutex); + down_write(&shm_ids(ns).rwsem); /* remove from the list of attaches of the shm segment */ shp = shm_lock(ns, sfd->id); BUG_ON(IS_ERR(shp)); @@ -264,10 +264,10 @@ static void shm_close(struct vm_area_struct *vma) shm_destroy(ns, shp); else shm_unlock(shp); - up_write(&shm_ids(ns).rw_mutex); + up_write(&shm_ids(ns).rwsem); } -/* Called with ns->shm_ids(ns).rw_mutex locked */ +/* Called with ns->shm_ids(ns).rwsem locked */ static int shm_try_destroy_current(int id, void *p, void *data) { struct ipc_namespace *ns = data; @@ -298,7 +298,7 @@ static int shm_try_destroy_current(int id, void *p, void *data) return 0; } -/* Called with ns->shm_ids(ns).rw_mutex locked */ +/* Called with ns->shm_ids(ns).rwsem locked */ static int shm_try_destroy_orphaned(int id, void *p, void *data) { struct ipc_namespace *ns = data; @@ -309,7 +309,7 @@ static int shm_try_destroy_orphaned(int id, void *p, void *data) * We want to destroy segments without users and with already * exit'ed originating process. * - * As shp->* are changed under rw_mutex, it's safe to skip shp locking. + * As shp->* are changed under rwsem, it's safe to skip shp locking. */ if (shp->shm_creator != NULL) return 0; @@ -323,10 +323,10 @@ static int shm_try_destroy_orphaned(int id, void *p, void *data) void shm_destroy_orphaned(struct ipc_namespace *ns) { - down_write(&shm_ids(ns).rw_mutex); + down_write(&shm_ids(ns).rwsem); if (shm_ids(ns).in_use) idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_orphaned, ns); - up_write(&shm_ids(ns).rw_mutex); + up_write(&shm_ids(ns).rwsem); } @@ -338,10 +338,10 @@ void exit_shm(struct task_struct *task) return; /* Destroy all already created segments, but not mapped yet */ - down_write(&shm_ids(ns).rw_mutex); + down_write(&shm_ids(ns).rwsem); if (shm_ids(ns).in_use) idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_current, ns); - up_write(&shm_ids(ns).rw_mutex); + up_write(&shm_ids(ns).rwsem); } static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) @@ -475,7 +475,7 @@ static const struct vm_operations_struct shm_vm_ops = { * @ns: namespace * @params: ptr to the structure that contains key, size and shmflg * - * Called with shm_ids.rw_mutex held as a writer. + * Called with shm_ids.rwsem held as a writer. */ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) @@ -583,7 +583,7 @@ no_file: } /* - * Called with shm_ids.rw_mutex and ipcp locked. + * Called with shm_ids.rwsem and ipcp locked. */ static inline int shm_security(struct kern_ipc_perm *ipcp, int shmflg) { @@ -594,7 +594,7 @@ static inline int shm_security(struct kern_ipc_perm *ipcp, int shmflg) } /* - * Called with shm_ids.rw_mutex and ipcp locked. + * Called with shm_ids.rwsem and ipcp locked. */ static inline int shm_more_checks(struct kern_ipc_perm *ipcp, struct ipc_params *params) @@ -707,7 +707,7 @@ static inline unsigned long copy_shminfo_to_user(void __user *buf, struct shminf /* * Calculate and add used RSS and swap pages of a shm. 
- * Called with shm_ids.rw_mutex held as a reader + * Called with shm_ids.rwsem held as a reader */ static void shm_add_rss_swap(struct shmid_kernel *shp, unsigned long *rss_add, unsigned long *swp_add) @@ -734,7 +734,7 @@ static void shm_add_rss_swap(struct shmid_kernel *shp, } /* - * Called with shm_ids.rw_mutex held as a reader + * Called with shm_ids.rwsem held as a reader */ static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss, unsigned long *swp) @@ -763,9 +763,9 @@ static void shm_get_stat(struct ipc_namespace *ns, unsigned long *rss, } /* - * This function handles some shmctl commands which require the rw_mutex + * This function handles some shmctl commands which require the rwsem * to be held in write mode. - * NOTE: no locks must be held, the rw_mutex is taken inside this function. + * NOTE: no locks must be held, the rwsem is taken inside this function. */ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, struct shmid_ds __user *buf, int version) @@ -780,7 +780,7 @@ static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd, return -EFAULT; } - down_write(&shm_ids(ns).rw_mutex); + down_write(&shm_ids(ns).rwsem); rcu_read_lock(); ipcp = ipcctl_pre_down_nolock(ns, &shm_ids(ns), shmid, cmd, @@ -819,7 +819,7 @@ out_unlock0: out_unlock1: rcu_read_unlock(); out_up: - up_write(&shm_ids(ns).rw_mutex); + up_write(&shm_ids(ns).rwsem); return err; } @@ -850,9 +850,9 @@ static int shmctl_nolock(struct ipc_namespace *ns, int shmid, if(copy_shminfo_to_user (buf, &shminfo, version)) return -EFAULT; - down_read(&shm_ids(ns).rw_mutex); + down_read(&shm_ids(ns).rwsem); err = ipc_get_maxid(&shm_ids(ns)); - up_read(&shm_ids(ns).rw_mutex); + up_read(&shm_ids(ns).rwsem); if(err<0) err = 0; @@ -863,14 +863,14 @@ static int shmctl_nolock(struct ipc_namespace *ns, int shmid, struct shm_info shm_info; memset(&shm_info, 0, sizeof(shm_info)); - down_read(&shm_ids(ns).rw_mutex); + down_read(&shm_ids(ns).rwsem); shm_info.used_ids = shm_ids(ns).in_use; shm_get_stat (ns, &shm_info.shm_rss, &shm_info.shm_swp); shm_info.shm_tot = ns->shm_tot; shm_info.swap_attempts = 0; shm_info.swap_successes = 0; err = ipc_get_maxid(&shm_ids(ns)); - up_read(&shm_ids(ns).rw_mutex); + up_read(&shm_ids(ns).rwsem); if (copy_to_user(buf, &shm_info, sizeof(shm_info))) { err = -EFAULT; goto out; @@ -1169,7 +1169,7 @@ out_fput: fput(file); out_nattch: - down_write(&shm_ids(ns).rw_mutex); + down_write(&shm_ids(ns).rwsem); shp = shm_lock(ns, shmid); BUG_ON(IS_ERR(shp)); shp->shm_nattch--; @@ -1177,7 +1177,7 @@ out_nattch: shm_destroy(ns, shp); else shm_unlock(shp); - up_write(&shm_ids(ns).rw_mutex); + up_write(&shm_ids(ns).rwsem); return err; out_unlock: diff --git a/ipc/util.c b/ipc/util.c index 2c8a93b380ba..9a1d779a20e2 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -119,7 +119,7 @@ __initcall(ipc_init); void ipc_init_ids(struct ipc_ids *ids) { - init_rwsem(&ids->rw_mutex); + init_rwsem(&ids->rwsem); ids->in_use = 0; ids->seq = 0; @@ -174,7 +174,7 @@ void __init ipc_init_proc_interface(const char *path, const char *header, * @ids: Identifier set * @key: The key to find * - * Requires ipc_ids.rw_mutex locked. + * Requires ipc_ids.rwsem locked. * Returns the LOCKED pointer to the ipc structure if found or NULL * if not. * If key is found ipc points to the owning ipc structure @@ -208,7 +208,7 @@ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key) * ipc_get_maxid - get the last assigned id * @ids: IPC identifier set * - * Called with ipc_ids.rw_mutex held. 
+ * Called with ipc_ids.rwsem held. */ int ipc_get_maxid(struct ipc_ids *ids) @@ -246,7 +246,7 @@ int ipc_get_maxid(struct ipc_ids *ids) * is returned. The 'new' entry is returned in a locked state on success. * On failure the entry is not locked and a negative err-code is returned. * - * Called with writer ipc_ids.rw_mutex held. + * Called with writer ipc_ids.rwsem held. */ int ipc_addid(struct ipc_ids* ids, struct kern_ipc_perm* new, int size) { @@ -312,9 +312,9 @@ static int ipcget_new(struct ipc_namespace *ns, struct ipc_ids *ids, { int err; - down_write(&ids->rw_mutex); + down_write(&ids->rwsem); err = ops->getnew(ns, params); - up_write(&ids->rw_mutex); + up_write(&ids->rwsem); return err; } @@ -331,7 +331,7 @@ static int ipcget_new(struct ipc_namespace *ns, struct ipc_ids *ids, * * On success, the IPC id is returned. * - * It is called with ipc_ids.rw_mutex and ipcp->lock held. + * It is called with ipc_ids.rwsem and ipcp->lock held. */ static int ipc_check_perms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, @@ -376,7 +376,7 @@ static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids, * Take the lock as a writer since we are potentially going to add * a new entry + read locks are not "upgradable" */ - down_write(&ids->rw_mutex); + down_write(&ids->rwsem); ipcp = ipc_findkey(ids, params->key); if (ipcp == NULL) { /* key not used */ @@ -402,7 +402,7 @@ static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids, } ipc_unlock(ipcp); } - up_write(&ids->rw_mutex); + up_write(&ids->rwsem); return err; } @@ -413,7 +413,7 @@ static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids, * @ids: IPC identifier set * @ipcp: ipc perm structure containing the identifier to remove * - * ipc_ids.rw_mutex (as a writer) and the spinlock for this ID are held + * ipc_ids.rwsem (as a writer) and the spinlock for this ID are held * before this function is called, and remain locked on the exit. */ @@ -621,7 +621,7 @@ struct kern_ipc_perm *ipc_obtain_object(struct ipc_ids *ids, int id) } /** - * ipc_lock - Lock an ipc structure without rw_mutex held + * ipc_lock - Lock an ipc structure without rwsem held * @ids: IPC identifier set * @id: ipc id to look for * @@ -748,7 +748,7 @@ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out) * - performs some audit and permission check, depending on the given cmd * - returns a pointer to the ipc object or otherwise, the corresponding error. * - * Call holding the both the rw_mutex and the rcu read lock. + * Call holding the both the rwsem and the rcu read lock. */ struct kern_ipc_perm *ipcctl_pre_down_nolock(struct ipc_namespace *ns, struct ipc_ids *ids, int id, int cmd, @@ -868,7 +868,7 @@ static void *sysvipc_proc_start(struct seq_file *s, loff_t *pos) * Take the lock - this will be released by the corresponding * call to stop(). 
*/ - down_read(&ids->rw_mutex); + down_read(&ids->rwsem); /* pos < 0 is invalid */ if (*pos < 0) @@ -895,7 +895,7 @@ static void sysvipc_proc_stop(struct seq_file *s, void *it) ids = &iter->ns->ids[iface->ids]; /* Release the lock we took in start() */ - up_read(&ids->rw_mutex); + up_read(&ids->rwsem); } static int sysvipc_proc_show(struct seq_file *s, void *it) diff --git a/ipc/util.h b/ipc/util.h index 41a6c4d26399..0a362ffca972 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -94,10 +94,10 @@ void __init ipc_init_proc_interface(const char *path, const char *header, #define ipcid_to_idx(id) ((id) % SEQ_MULTIPLIER) #define ipcid_to_seqx(id) ((id) / SEQ_MULTIPLIER) -/* must be called with ids->rw_mutex acquired for writing */ +/* must be called with ids->rwsem acquired for writing */ int ipc_addid(struct ipc_ids *, struct kern_ipc_perm *, int); -/* must be called with ids->rw_mutex acquired for reading */ +/* must be called with ids->rwsem acquired for reading */ int ipc_get_maxid(struct ipc_ids *); /* must be called with both locks acquired. */ From 4718787d1f626f45ddb239912bc07266b9880044 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:25 -0700 Subject: [PATCH 297/303] ipc,msg: drop msg_unlock There is only one user left, drop this function and just call ipc_unlock_object() and rcu_read_unlock(). Signed-off-by: Davidlohr Bueso Tested-by: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/msg.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ipc/msg.c b/ipc/msg.c index 8203e71bcfbc..b0d541d42677 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -70,8 +70,6 @@ struct msg_sender { #define msg_ids(ns) ((ns)->ids[IPC_MSG_IDS]) -#define msg_unlock(msq) ipc_unlock(&(msq)->q_perm) - static void freeque(struct ipc_namespace *, struct kern_ipc_perm *); static int newque(struct ipc_namespace *, struct ipc_params *); #ifdef CONFIG_PROC_FS @@ -270,7 +268,8 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) expunge_all(msq, -EIDRM); ss_wakeup(&msq->q_senders, 1); msg_rmid(ns, msq); - msg_unlock(msq); + ipc_unlock_object(&msq->q_perm); + rcu_read_unlock(); list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) { atomic_dec(&ns->msg_hdrs); From 05603c44a7627793219b0bd9a7b236099dc9cd9d Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:26 -0700 Subject: [PATCH 298/303] ipc: document general ipc locking scheme As suggested by Andrew, add a generic initial locking scheme used throughout all sysv ipc mechanisms. Documenting the ids rwsem, how rcu can be enough to do the initial checks and when to actually acquire the kern_ipc_perm.lock spinlock. I found that adding it to util.c was generic enough. Signed-off-by: Davidlohr Bueso Tested-by: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/util.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ipc/util.c b/ipc/util.c index 9a1d779a20e2..1ddadcf9a2ab 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -15,6 +15,14 @@ * Jun 2006 - namespaces ssupport * OpenVZ, SWsoft Inc. * Pavel Emelianov + * + * General sysv ipc locking scheme: + * when doing ipc id lookups, take the ids->rwsem + * rcu_read_lock() + * obtain the ipc object (kern_ipc_perm) + * perform security, capabilities, auditing and permission checks, etc. 
+ * acquire the ipc lock (kern_ipc_perm.lock) through ipc_lock_object() + * perform data updates (ie: SET, RMID, LOCK/UNLOCK commands) */ #include From 530fcd16d87cd2417c472a581ba5a1e501556c86 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:28 -0700 Subject: [PATCH 299/303] ipc, shm: guard against non-existent vma in shmdt(2) When !CONFIG_MMU there's a chance we can dereference a NULL pointer when the VM area isn't found - check the return value of find_vma(). Also, remove the redundant -EINVAL return: retval is set to the proper return code and *only* changed to 0 when we actually unmap the segments. Signed-off-by: Davidlohr Bueso Cc: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ipc/shm.c b/ipc/shm.c index cb2cedaa8808..a0ed957cefc9 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -1288,8 +1288,7 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr) #else /* CONFIG_MMU */ /* under NOMMU conditions, the exact address to be destroyed must be * given */ - retval = -EINVAL; - if (vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) { + if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) { do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start); retval = 0; } From 32a2750010981216fb788c5190fb0e646abfab30 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:29 -0700 Subject: [PATCH 300/303] ipc: drop ipc_lock_by_ptr After previous cleanups and optimizations, this function is no longer heavily used and we don't have a good reason to keep it. Update the few remaining callers and get rid of it. Signed-off-by: Davidlohr Bueso Cc: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/namespace.c | 3 ++- ipc/util.c | 6 ++++-- ipc/util.h | 6 ------ 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/ipc/namespace.c b/ipc/namespace.c index d43d9384bb2d..59451c1e214d 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -89,7 +89,8 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, perm = idr_find(&ids->ipcs_idr, next_id); if (perm == NULL) continue; - ipc_lock_by_ptr(perm); + rcu_read_lock(); + ipc_lock_object(perm); free(ns, perm); total++; } diff --git a/ipc/util.c b/ipc/util.c index 1ddadcf9a2ab..9f6aa30d2e0f 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -205,7 +205,8 @@ static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key) continue; } - ipc_lock_by_ptr(ipc); + rcu_read_lock(); + ipc_lock_object(ipc); return ipc; } @@ -838,7 +839,8 @@ static struct kern_ipc_perm *sysvipc_find_ipc(struct ipc_ids *ids, loff_t pos, ipc = idr_find(&ids->ipcs_idr, pos); if (ipc != NULL) { *new_pos = pos + 1; - ipc_lock_by_ptr(ipc); + rcu_read_lock(); + ipc_lock_object(ipc); return ipc; } } diff --git a/ipc/util.h b/ipc/util.h index 0a362ffca972..14b0a2adba08 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -171,12 +171,6 @@ static inline void ipc_assert_locked_object(struct kern_ipc_perm *perm) assert_spin_locked(&perm->lock); } -static inline void ipc_lock_by_ptr(struct kern_ipc_perm *perm) -{ - rcu_read_lock(); - ipc_lock_object(perm); -} - static inline void ipc_unlock(struct kern_ipc_perm *perm) { ipc_unlock_object(perm); From 7a25dd9e042b2b94202a67e5551112f4ac87285a Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:30 -0700 Subject: [PATCH 301/303] ipc, shm: drop shm_lock_check This function was
replaced by the lockless shm_obtain_object_check(), and no longer has any users. Signed-off-by: Davidlohr Bueso Cc: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/shm.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/ipc/shm.c b/ipc/shm.c index a0ed957cefc9..2821cdf93adb 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -167,17 +167,6 @@ static inline void shm_lock_by_ptr(struct shmid_kernel *ipcp) ipc_lock_object(&ipcp->shm_perm); } -static inline struct shmid_kernel *shm_lock_check(struct ipc_namespace *ns, - int id) -{ - struct kern_ipc_perm *ipcp = ipc_lock_check(&shm_ids(ns), id); - - if (IS_ERR(ipcp)) - return (struct shmid_kernel *)ipcp; - - return container_of(ipcp, struct shmid_kernel, shm_perm); -} - static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s) { ipc_rmid(&shm_ids(ns), &s->shm_perm); From 20b8875abcf2daa1dda5cf70bd6369df5e85d4c1 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Sep 2013 14:26:31 -0700 Subject: [PATCH 302/303] ipc: drop ipc_lock_check No remaining users; we now use ipc_obtain_object_check(). Signed-off-by: Davidlohr Bueso Cc: Sedat Dilek Cc: Rik van Riel Cc: Manfred Spraul Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/util.c | 16 ---------------- ipc/util.h | 1 - 2 files changed, 17 deletions(-) diff --git a/ipc/util.c b/ipc/util.c index 9f6aa30d2e0f..e829da9ed01f 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -686,22 +686,6 @@ out: return out; } -struct kern_ipc_perm *ipc_lock_check(struct ipc_ids *ids, int id) -{ - struct kern_ipc_perm *out; - - out = ipc_lock(ids, id); - if (IS_ERR(out)) - return out; - - if (ipc_checkid(out, id)) { - ipc_unlock(out); - return ERR_PTR(-EIDRM); - } - - return out; -} - /** * ipcget - Common sys_*get() code * @ns : namsepace diff --git a/ipc/util.h b/ipc/util.h index 14b0a2adba08..c5f3338ba1fa 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -177,7 +177,6 @@ static inline void ipc_unlock(struct kern_ipc_perm *perm) rcu_read_unlock(); } -struct kern_ipc_perm *ipc_lock_check(struct ipc_ids *ids, int id); struct kern_ipc_perm *ipc_obtain_object_check(struct ipc_ids *ids, int id); int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids, struct ipc_ops *ops, struct ipc_params *params); From b34081f1cd59585451efaa69e1dff1b9507e6c89 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 11 Sep 2013 14:26:32 -0700 Subject: [PATCH 303/303] lz4: fix compression/decompression signedness mismatch LZ4 compression and decompression functions require input/output parameters that differ in signedness: unsigned char for compression and signed char for decompression. Change the decompression API to require "(const) unsigned char *". Signed-off-by: Sergey Senozhatsky Cc: Kyungsik Lee Cc: Geert Uytterhoeven Cc: Yann Collet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/lz4.h | 8 ++++---- lib/lz4/lz4_decompress.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/lz4.h b/include/linux/lz4.h index d21c13f10a64..4356686b0a39 100644 --- a/include/linux/lz4.h +++ b/include/linux/lz4.h @@ -67,8 +67,8 @@ int lz4hc_compress(const unsigned char *src, size_t src_len, * note : Destination buffer must be already allocated.
* slightly faster than lz4_decompress_unknownoutputsize() */ -int lz4_decompress(const char *src, size_t *src_len, char *dest, - size_t actual_dest_len); +int lz4_decompress(const unsigned char *src, size_t *src_len, + unsigned char *dest, size_t actual_dest_len); /* * lz4_decompress_unknownoutputsize() @@ -82,6 +82,6 @@ int lz4_decompress(const char *src, size_t *src_len, char *dest, * Error if return (< 0) * note : Destination buffer must be already allocated. */ -int lz4_decompress_unknownoutputsize(const char *src, size_t src_len, - char *dest, size_t *dest_len); +int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len, + unsigned char *dest, size_t *dest_len); #endif diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c index 411be80ddb46..df6839e3ce08 100644 --- a/lib/lz4/lz4_decompress.c +++ b/lib/lz4/lz4_decompress.c @@ -283,8 +283,8 @@ _output_error: return (int) (-(((char *) ip) - source)); } -int lz4_decompress(const char *src, size_t *src_len, char *dest, - size_t actual_dest_len) +int lz4_decompress(const unsigned char *src, size_t *src_len, + unsigned char *dest, size_t actual_dest_len) { int ret = -1; int input_len = 0; @@ -302,8 +302,8 @@ exit_0: EXPORT_SYMBOL(lz4_decompress); #endif -int lz4_decompress_unknownoutputsize(const char *src, size_t src_len, - char *dest, size_t *dest_len) +int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len, + unsigned char *dest, size_t *dest_len) { int ret = -1; int out_len = 0;
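With the prototypes above fixed, in-kernel callers can feed ordinary byte buffers to the decompressor without casts. A minimal sketch of a hypothetical user (example_unpack() and its parameter names are illustrative, not part of the kernel API):

	#include <linux/lz4.h>

	/* decompress comp[] into out[]; the original size out_len is known */
	static int example_unpack(const unsigned char *comp, size_t comp_len,
				  unsigned char *out, size_t out_len)
	{
		size_t src_len = comp_len;

		/* both buffers are unsigned char now, matching lz4_compress() */
		return lz4_decompress(comp, &src_len, out, out_len);
	}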