From 38cb47ba0187c481aa949d3bbf149e014e8cacda Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 4 Feb 2008 16:47:54 +0100 Subject: [PATCH 01/78] x86: relax RAM check in ioremap() Kevin Winchester reported the loss of direct rendering, due to: [ 0.588184] agpgart: Detected AGP bridge 0 [ 0.588184] agpgart: unable to get memory for graphics translation table. [ 0.588184] agpgart: agp_backend_initialize() failed. [ 0.588207] agpgart-amd64: probe of 0000:00:00.0 failed with error -12 and bisected it down to: commit 266b9f8727976769e2ed2dad77ac9295f37e321e Author: Thomas Gleixner Date: Wed Jan 30 13:34:06 2008 +0100 x86: fix ioremap RAM check this check was too strict and caused an ioremap() failure. the problem is due to the somewhat unclean way of how the GART code reserves a memory range for its aperture, and how it utilizes it later on. Allow RAM pages to be ioremap()-ed too, as long as they are reserved. Bisected-by: Kevin Winchester Signed-off-by: Ingo Molnar Tested-by: Kevin Winchester Signed-off-by: Thomas Gleixner --- arch/x86/mm/ioremap.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index c004d94608fd..1a88d1572a77 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -116,7 +116,7 @@ static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size, { void __iomem *addr; struct vm_struct *area; - unsigned long offset, last_addr; + unsigned long pfn, offset, last_addr; pgprot_t prot; /* Don't allow wraparound or zero size */ @@ -133,9 +133,10 @@ static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size, /* * Don't allow anybody to remap normal RAM that we're using.. 
*/ - for (offset = phys_addr >> PAGE_SHIFT; offset < max_pfn_mapped && - (offset << PAGE_SHIFT) < last_addr; offset++) { - if (page_is_ram(offset)) + for (pfn = phys_addr >> PAGE_SHIFT; pfn < max_pfn_mapped && + (pfn << PAGE_SHIFT) < last_addr; pfn++) { + if (page_is_ram(pfn) && pfn_valid(pfn) && + !PageReserved(pfn_to_page(pfn))) return NULL; } From 262d5ee27271703a0396d63649430f43f3b5deb3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 4 Feb 2008 16:47:54 +0100 Subject: [PATCH 02/78] x86: VMI fix Jeff Chua bisected down a vmware guest boot breakage (hang) to this paravirt change: commit 8d947344c47a40626730bb80d136d8daac9f2060 Author: Glauber de Oliveira Costa Date: Wed Jan 30 13:31:12 2008 +0100 x86: change write_idt_entry signature fix the off-by-one indexing bug ... Bisected-by: Jeff Chua Tested-by: Jeff Chua Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmi_32.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 4525bc2c2e19..12affe1f9bce 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -220,21 +220,21 @@ static void vmi_set_tr(void) static void vmi_write_idt_entry(gate_desc *dt, int entry, const gate_desc *g) { u32 *idt_entry = (u32 *)g; - vmi_ops.write_idt_entry(dt, entry, idt_entry[0], idt_entry[2]); + vmi_ops.write_idt_entry(dt, entry, idt_entry[0], idt_entry[1]); } static void vmi_write_gdt_entry(struct desc_struct *dt, int entry, const void *desc, int type) { u32 *gdt_entry = (u32 *)desc; - vmi_ops.write_gdt_entry(dt, entry, gdt_entry[0], gdt_entry[2]); + vmi_ops.write_gdt_entry(dt, entry, gdt_entry[0], gdt_entry[1]); } static void vmi_write_ldt_entry(struct desc_struct *dt, int entry, const void *desc) { u32 *ldt_entry = (u32 *)desc; - vmi_ops.write_idt_entry(dt, entry, ldt_entry[0], ldt_entry[2]); + vmi_ops.write_idt_entry(dt, entry, ldt_entry[0], ldt_entry[1]); } static void vmi_load_sp0(struct tss_struct *tss, From 
3a900d89db35c133bc0874e71d9156b22db362b4 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Mon, 4 Feb 2008 16:47:55 +0100 Subject: [PATCH 03/78] x86: restore correct module name for apm The apm module was renamed to apm_32 during the merge of 32 and 64 bit x86 which is unfortunate. As apm is 32 bit specific we like to keep the _32 in the filename but the module should be named apm. Fix this in the Makefile. Reported-by: "A.E.Lawrence" Signed-off-by: Sam Ravnborg Cc: Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: "A.E.Lawrence" Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 6f813009d44b..f08063581804 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -37,7 +37,8 @@ obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_PCI) += early-quirks.o -obj-$(CONFIG_APM) += apm_32.o +apm-y := apm_32.o +obj-$(CONFIG_APM) += apm.o obj-$(CONFIG_X86_SMP) += smp_$(BITS).o smpboot_$(BITS).o tsc_sync.o obj-$(CONFIG_X86_32_SMP) += smpcommon_32.o obj-$(CONFIG_X86_64_SMP) += smp_64.o smpboot_64.o tsc_sync.o From 3bc9a77e84096148d5ada29c986d6e71a20eaeda Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Mon, 4 Feb 2008 16:47:55 +0100 Subject: [PATCH 04/78] x86: rename module scx200_32 to scx200 The module scx200 was renamed to scx200_32 by the merge of the 32 and 64 bit x86 arch trees. Keep the _32 suffix on the .c file as it is 32 bit specific and fix the module name in the Makefile. Signed-off-by: Sam Ravnborg Cc: "H. 
Peter Anvin" Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index f08063581804..21dc1a061bf1 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -75,7 +75,8 @@ ifdef CONFIG_INPUT_PCSPKR obj-y += pcspeaker.o endif -obj-$(CONFIG_SCx200) += scx200_32.o +obj-$(CONFIG_SCx200) += scx200.o +scx200-y += scx200_32.o ### # 64 bit specific files From 4cf31841762954ad2868156ccba94d798a16630f Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Mon, 4 Feb 2008 16:47:55 +0100 Subject: [PATCH 05/78] x86: mach-rdc321x Kconfig fix The mach-rdc321x uses the leds-gpio driver and explicitly selects it. This driver also depends on the leds class module, so select it as well. Signed-off-by: Florian Fainelli Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7109037bdf7c..77198f49b383 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -309,6 +309,7 @@ config X86_RDC321X select M486 select X86_REBOOTFIXUPS select GENERIC_GPIO + select LEDS_CLASS select LEDS_GPIO help This option is needed for RDC R-321x system-on-chip, also known From b50516fc20f756cf4d18a89f6f9977d60151ccba Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Feb 2008 16:47:55 +0100 Subject: [PATCH 06/78] x86: CPA remove bogus NX clear In split_large_page we clear the NX bit for the new split ptes, but we need to preserve the original setting of it for the split ptes. 
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index e297bd65e513..877b5cca2cb8 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -225,7 +225,6 @@ static int split_large_page(pte_t *kpte, unsigned long address) paravirt_alloc_pt(&init_mm, page_to_pfn(base)); #endif - pgprot_val(ref_prot) &= ~_PAGE_NX; for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT, ref_prot)); From 6118f76fb7408bad7631345cc41a5f0efc49ce3e Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 4 Feb 2008 16:47:56 +0100 Subject: [PATCH 07/78] x86: print out node_data addr and bootmap_start addr print out node_data addr and bootmap_start addr. helpful for debugging early crashes on high-end NUMA systems. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/numa_64.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index a920d09b9194..5a02bf4c91ec 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -202,6 +202,8 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, if (node_data[nodeid] == NULL) return; nodedata_phys = __pa(node_data[nodeid]); + printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys, + nodedata_phys + pgdat_size - 1); memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; @@ -225,12 +227,15 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, return; } bootmap_start = __pa(bootmap); - Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages); bootmap_size = init_bootmem_node(NODE_DATA(nodeid), bootmap_start >> PAGE_SHIFT, start_pfn, end_pfn); + printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n", + bootmap_start, bootmap_start + bootmap_size - 1, + 
bootmap_pages); + free_bootmem_with_active_regions(nodeid, end); reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size); From cf89ec924da5b76cbff293a1b378f312c7161411 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Mon, 4 Feb 2008 16:47:56 +0100 Subject: [PATCH 08/78] x86: reduce ifdef sections in fault.c Signed-off-by: Harvey Harrison Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/fault.c | 31 +++++++++---------------------- 1 file changed, 9 insertions(+), 22 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index e4440d0abf81..3fff490254a9 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -508,6 +508,10 @@ static int vmalloc_fault(unsigned long address) pmd_t *pmd, *pmd_ref; pte_t *pte, *pte_ref; + /* Make sure we are in vmalloc area */ + if (!(address >= VMALLOC_START && address < VMALLOC_END)) + return -1; + /* Copy kernel mappings over when needed. This can also happen within a race in page table update. In the later case just flush. */ @@ -603,6 +607,9 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) */ #ifdef CONFIG_X86_32 if (unlikely(address >= TASK_SIZE)) { +#else + if (unlikely(address >= TASK_SIZE64)) { +#endif if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && vmalloc_fault(address) >= 0) return; @@ -618,6 +625,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) goto bad_area_nosemaphore; } + +#ifdef CONFIG_X86_32 /* It's safe to allow irq's after cr2 has been saved and the vmalloc fault has been handled. */ if (regs->flags & (X86_EFLAGS_IF|VM_MASK)) @@ -630,28 +639,6 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) if (in_atomic() || !mm) goto bad_area_nosemaphore; #else /* CONFIG_X86_64 */ - if (unlikely(address >= TASK_SIZE64)) { - /* - * Don't check for the module range here: its PML4 - * is always initialized because it's shared with the main - * kernel text. 
Only vmalloc may need PML4 syncups. - */ - if (!(error_code & (PF_RSVD|PF_USER|PF_PROT)) && - ((address >= VMALLOC_START && address < VMALLOC_END))) { - if (vmalloc_fault(address) >= 0) - return; - } - - /* Can handle a stale RO->RW TLB */ - if (spurious_fault(address, error_code)) - return; - - /* - * Don't take the mm semaphore here. If we fixup a prefetch - * fault we could otherwise deadlock. - */ - goto bad_area_nosemaphore; - } if (likely(regs->flags & X86_EFLAGS_IF)) local_irq_enable(); From 1622ac23bd3568c3ae8bb391dd3adb51887d7141 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Mon, 4 Feb 2008 16:47:56 +0100 Subject: [PATCH 09/78] x86: define OBJCOPYFLAGS explicitly for each target. Do this rather than defining a global version and overriding it in almost all cases in order to make subsequent patches simpler. Signed-off-by: Ian Campbell Acked-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- arch/x86/Makefile | 1 - arch/x86/boot/Makefile | 2 +- arch/x86/boot/compressed/Makefile | 1 + 3 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 8978e98bed5b..364865b1b08d 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -92,7 +92,6 @@ KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) LDFLAGS := -m elf_$(UTS_MACHINE) -OBJCOPYFLAGS := -O binary -R .note -R .comment -S # Speed up the build KBUILD_CFLAGS += -pipe diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 349b81a39c40..254a58398a67 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -80,6 +80,7 @@ $(obj)/zImage $(obj)/bzImage: $(obj)/setup.bin \ $(call if_changed,image) @echo 'Kernel: $@ is ready' ' (#'`cat .version`')' +OBJCOPYFLAGS_vmlinux.bin := -O binary -R .note -R .comment -S $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE $(call if_changed,objcopy) @@ -90,7 +91,6 @@ $(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE $(call if_changed,ld) OBJCOPYFLAGS_setup.bin := -O binary - 
$(obj)/setup.bin: $(obj)/setup.elf FORCE $(call if_changed,objcopy) diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index fe24ceabd909..d2b9f3bb87c0 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -22,6 +22,7 @@ $(obj)/vmlinux: $(src)/vmlinux_$(BITS).lds $(obj)/head_$(BITS).o $(obj)/misc.o $ $(call if_changed,ld) @: +OBJCOPYFLAGS_vmlinux.bin := -O binary -R .note -R .comment -S $(obj)/vmlinux.bin: vmlinux FORCE $(call if_changed,objcopy) From a34746bc43eb63e545abf5eb002d96483a54ee32 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 4 Feb 2008 16:47:56 +0100 Subject: [PATCH 10/78] x86: add _ASM_EXTABLE macro to <asm/asm.h> Instead of open-coding the __ex_table information at each callsite, construct a common macro that can work regardless of CPU size. Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/asm-x86/asm.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/asm-x86/asm.h b/include/asm-x86/asm.h index 1a6980a60fc6..90dec0c23646 100644 --- a/include/asm-x86/asm.h +++ b/include/asm-x86/asm.h @@ -29,4 +29,11 @@ #endif /* CONFIG_X86_32 */ +/* Exception table entry */ +# define _ASM_EXTABLE(from,to) \ + " .section __ex_table,\"a\"\n" \ + _ASM_ALIGN "\n" \ + _ASM_PTR #from "," #to "\n" \ + " .previous\n" + #endif /* _ASM_X86_ASM_H */ From 92909098a3b27147c4b80f9c387ccd63676aa807 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 4 Feb 2008 16:47:56 +0100 Subject: [PATCH 11/78] x86: use _ASM_EXTABLE macro in arch/x86/kernel/test_nx.c Use the _ASM_EXTABLE macro from <asm/asm.h>, instead of open-coding __ex_table entries in arch/x86/kernel/test_nx.c. Signed-off-by: H. 
Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/test_nx.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c index ae0ef2e304c7..36c100c323aa 100644 --- a/arch/x86/kernel/test_nx.c +++ b/arch/x86/kernel/test_nx.c @@ -12,6 +12,7 @@ #include #include #include +#include extern int rodata_test_data; @@ -89,16 +90,7 @@ static noinline int test_address(void *address) "2: mov %[zero], %[rslt]\n" " ret\n" ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 8\n" -#ifdef CONFIG_X86_32 - " .long 0b\n" - " .long 2b\n" -#else - " .quad 0b\n" - " .quad 2b\n" -#endif - ".previous\n" + _ASM_EXTABLE(0b,2b) : [rslt] "=r" (result) : [fake_code] "r" (address), [zero] "r" (0UL), "0" (result) ); From e7a40d268ec2afab7e0596667cabd2ae53fec8d8 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 4 Feb 2008 16:47:57 +0100 Subject: [PATCH 12/78] x86: use _ASM_EXTABLE macro in arch/x86/lib/mmx_32.c Use the _ASM_EXTABLE macro from , instead of open-coding __ex_table entires in arch/x86/lib/mmx_32.c. Signed-off-by: H. 
Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/lib/mmx_32.c | 31 +++++++------------------------ 1 file changed, 7 insertions(+), 24 deletions(-) diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c index 28084d2e8dd4..cc9b4a4450f3 100644 --- a/arch/x86/lib/mmx_32.c +++ b/arch/x86/lib/mmx_32.c @@ -4,6 +4,7 @@ #include #include +#include #include @@ -50,10 +51,7 @@ void *_mmx_memcpy(void *to, const void *from, size_t len) "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ " jmp 2b\n" ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 1b, 3b\n" - ".previous" + _ASM_EXTABLE(1b,3b) : : "r" (from) ); @@ -81,10 +79,7 @@ void *_mmx_memcpy(void *to, const void *from, size_t len) "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ " jmp 2b\n" ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 1b, 3b\n" - ".previous" + _ASM_EXTABLE(1b,3b) : : "r" (from), "r" (to) : "memory"); from+=64; to+=64; @@ -181,10 +176,7 @@ static void fast_copy_page(void *to, void *from) "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ " jmp 2b\n" ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 1b, 3b\n" - ".previous" + _ASM_EXTABLE(1b,3b) : : "r" (from) ); for(i=0; i<(4096-320)/64; i++) @@ -211,10 +203,7 @@ static void fast_copy_page(void *to, void *from) "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ " jmp 2b\n" ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 1b, 3b\n" - ".previous" + _ASM_EXTABLE(1b,3b) : : "r" (from), "r" (to) : "memory"); from+=64; to+=64; @@ -311,10 +300,7 @@ static void fast_copy_page(void *to, void *from) "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ " jmp 2b\n" ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 1b, 3b\n" - ".previous" + _ASM_EXTABLE(1b,3b) : : "r" (from) ); for(i=0; i<4096/64; i++) @@ -341,10 +327,7 @@ static void fast_copy_page(void *to, void *from) "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ " jmp 2b\n" ".previous\n" - 
".section __ex_table,\"a\"\n" - " .align 4\n" - " .long 1b, 3b\n" - ".previous" + _ASM_EXTABLE(1b,3b) : : "r" (from), "r" (to) : "memory"); from+=64; to+=64; From 287774414568010855642518513f085491644061 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 4 Feb 2008 16:47:57 +0100 Subject: [PATCH 13/78] x86: use _ASM_EXTABLE macro in arch/x86/lib/usercopy_32.c Use the _ASM_EXTABLE macro from , instead of open-coding __ex_table entires in arch/x86/lib/usercopy_32.c. Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/lib/usercopy_32.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 9c4ffd5bedb2..e849b9998b0e 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -48,10 +48,7 @@ do { \ "3: movl %5,%0\n" \ " jmp 2b\n" \ ".previous\n" \ - ".section __ex_table,\"a\"\n" \ - " .align 4\n" \ - " .long 0b,3b\n" \ - ".previous" \ + _ASM_EXTABLE(0b,3b) \ : "=d"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1), \ "=&D" (__d2) \ : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \ @@ -132,11 +129,8 @@ do { \ "3: lea 0(%2,%0,4),%0\n" \ " jmp 2b\n" \ ".previous\n" \ - ".section __ex_table,\"a\"\n" \ - " .align 4\n" \ - " .long 0b,3b\n" \ - " .long 1b,2b\n" \ - ".previous" \ + _ASM_EXTABLE(0b,3b) \ + _ASM_EXTABLE(1b,2b) \ : "=&c"(size), "=&D" (__d0) \ : "r"(size & 3), "0"(size / 4), "1"(addr), "a"(0)); \ } while (0) From 8da804f2b23913ef362c6a578bf482e5ccc93d1a Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 4 Feb 2008 16:47:57 +0100 Subject: [PATCH 14/78] x86: use _ASM_EXTABLE macro in arch/x86/lib/usercopy_64.c Use the _ASM_EXTABLE macro from , instead of open-coding __ex_table entires in arch/x86/lib/usercopy_64.c. Signed-off-by: H. 
Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/lib/usercopy_64.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 893d43f838cc..0c89d1bb0287 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -31,10 +31,7 @@ do { \ "3: movq %5,%0\n" \ " jmp 2b\n" \ ".previous\n" \ - ".section __ex_table,\"a\"\n" \ - " .align 8\n" \ - " .quad 0b,3b\n" \ - ".previous" \ + _ASM_EXTABLE(0b,3b) \ : "=r"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1), \ "=&D" (__d2) \ : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \ @@ -87,11 +84,8 @@ unsigned long __clear_user(void __user *addr, unsigned long size) "3: lea 0(%[size1],%[size8],8),%[size8]\n" " jmp 2b\n" ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 8\n" - " .quad 0b,3b\n" - " .quad 1b,2b\n" - ".previous" + _ASM_EXTABLE(0b,3b) + _ASM_EXTABLE(1b,2b) : [size8] "=c"(size), [dst] "=&D" (__d0) : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr), [zero] "r" (0UL), [eight] "r" (8UL)); From f832ff18e886ada0ff30a1edeab082ce218d107e Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 4 Feb 2008 16:47:58 +0100 Subject: [PATCH 15/78] x86: use _ASM_EXTABLE macro in arch/x86/mm/init_32.c Use the _ASM_EXTABLE macro from , instead of open-coding __ex_table entires in arch/x86/mm/init_32.c. Signed-off-by: H. 
Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/init_32.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index f2f36f8dae52..d1bc04006d16 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -31,6 +31,7 @@ #include #include +#include #include #include #include @@ -718,10 +719,7 @@ static noinline int do_test_wp_bit(void) "1: movb %1, %0 \n" " xorl %2, %2 \n" "2: \n" - ".section __ex_table, \"a\"\n" - " .align 4 \n" - " .long 1b, 2b \n" - ".previous \n" + _ASM_EXTABLE(1b,2b) :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)), "=q" (tmp_reg), "=r" (flag) From 2532ec6d178abc55681d049097d3dc577eaa266c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 4 Feb 2008 16:47:58 +0100 Subject: [PATCH 16/78] x86: use _ASM_EXTABLE macro in include/asm-x86/futex.h Use the _ASM_EXTABLE macro from <asm/asm.h>, instead of open-coding __ex_table entries in include/asm-x86/futex.h. Signed-off-by: H. 
Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/asm-x86/futex.h | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/include/asm-x86/futex.h b/include/asm-x86/futex.h index 9d919264923a..cd9f894dd2d7 100644 --- a/include/asm-x86/futex.h +++ b/include/asm-x86/futex.h @@ -17,11 +17,8 @@ "2: .section .fixup,\"ax\"\n \ 3: mov %3, %1\n \ jmp 2b\n \ - .previous\n \ - .section __ex_table,\"a\"\n \ - .align 8\n" \ - _ASM_PTR "1b,3b\n \ - .previous" \ + .previous\n" \ + _ASM_EXTABLE(1b,3b) \ : "=r" (oldval), "=r" (ret), "+m" (*uaddr) \ : "i" (-EFAULT), "0" (oparg), "1" (0)) @@ -35,11 +32,9 @@ 3: .section .fixup,\"ax\"\n \ 4: mov %5, %1\n \ jmp 3b\n \ - .previous\n \ - .section __ex_table,\"a\"\n \ - .align 8\n" \ - _ASM_PTR "1b,4b,2b,4b\n \ - .previous" \ + .previous\n" \ + _ASM_EXTABLE(1b,4b) \ + _ASM_EXTABLE(2b,4b) \ : "=&a" (oldval), "=&r" (ret), "+m" (*uaddr), \ "=&r" (tem) \ : "r" (oparg), "i" (-EFAULT), "1" (0)) @@ -111,18 +106,12 @@ futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) return -EFAULT; __asm__ __volatile__( - "1: lock; cmpxchgl %3, %1 \n" "2: .section .fixup, \"ax\" \n" "3: mov %2, %0 \n" " jmp 2b \n" " .previous \n" - - " .section __ex_table, \"a\" \n" - " .align 8 \n" - _ASM_PTR " 1b,3b \n" - " .previous \n" - + _ASM_EXTABLE(1b,3b) : "=a" (oldval), "+m" (*uaddr) : "i" (-EFAULT), "r" (newval), "0" (oldval) : "memory" From 92c37fa3256dd8ace1cc37674146abd286e3b8b0 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 4 Feb 2008 16:47:58 +0100 Subject: [PATCH 17/78] x86: use _ASM_EXTABLE macro in include/asm-x86/i387.h Use the _ASM_EXTABLE macro from , instead of open-coding __ex_table entires in include/asm-x86/i387.h. Signed-off-by: H. 
Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/asm-x86/i387.h | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/include/asm-x86/i387.h b/include/asm-x86/i387.h index ba8105ca822b..6b1895ccd6b7 100644 --- a/include/asm-x86/i387.h +++ b/include/asm-x86/i387.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -41,10 +42,7 @@ static inline void tolerant_fwait(void) { asm volatile("1: fwait\n" "2:\n" - " .section __ex_table,\"a\"\n" - " .align 8\n" - " .quad 1b,2b\n" - " .previous\n"); + _ASM_EXTABLE(1b,2b)); } static inline int restore_fpu_checking(struct i387_fxsave_struct *fx) @@ -57,10 +55,7 @@ static inline int restore_fpu_checking(struct i387_fxsave_struct *fx) "3: movl $-1,%[err]\n" " jmp 2b\n" ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 8\n" - " .quad 1b,3b\n" - ".previous" + _ASM_EXTABLE(1b,3b) : [err] "=r" (err) #if 0 /* See comment in __save_init_fpu() below. */ : [fx] "r" (fx), "m" (*fx), "0" (0)); @@ -99,10 +94,7 @@ static inline int save_i387_checking(struct i387_fxsave_struct __user *fx) "3: movl $-1,%[err]\n" " jmp 2b\n" ".previous\n" - ".section __ex_table,\"a\"\n" - " .align 8\n" - " .quad 1b,3b\n" - ".previous" + _ASM_EXTABLE(1b,3b) : [err] "=r" (err), "=m" (*fx) #if 0 /* See comment in __fxsave_clear() below. */ : [fx] "r" (fx), "0" (0)); From 7d24a827087e0cf6834a3d8f20c4b5fc4cebd7fc Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 4 Feb 2008 16:47:58 +0100 Subject: [PATCH 18/78] x86: use _ASM_EXTABLE macro in include/asm-x86/msr.h Use the _ASM_EXTABLE macro from , instead of open-coding __ex_table entires in include/asm-x86/msr.h. Signed-off-by: H. 
Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/asm-x86/msr.h | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/include/asm-x86/msr.h b/include/asm-x86/msr.h index 204a8a30fecf..3ca29ebebbb1 100644 --- a/include/asm-x86/msr.h +++ b/include/asm-x86/msr.h @@ -57,10 +57,7 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr, ".section .fixup,\"ax\"\n\t" "3: mov %3,%0 ; jmp 1b\n\t" ".previous\n\t" - ".section __ex_table,\"a\"\n" - _ASM_ALIGN "\n\t" - _ASM_PTR " 2b,3b\n\t" - ".previous" + _ASM_EXTABLE(2b,3b) : "=r" (*err), EAX_EDX_RET(val, low, high) : "c" (msr), "i" (-EFAULT)); return EAX_EDX_VAL(val, low, high); @@ -81,10 +78,7 @@ static inline int native_write_msr_safe(unsigned int msr, ".section .fixup,\"ax\"\n\t" "3: mov %4,%0 ; jmp 1b\n\t" ".previous\n\t" - ".section __ex_table,\"a\"\n" - _ASM_ALIGN "\n\t" - _ASM_PTR " 2b,3b\n\t" - ".previous" + _ASM_EXTABLE(2b,3b) : "=a" (err) : "c" (msr), "0" (low), "d" (high), "i" (-EFAULT)); From 88976ee187dce4c8de56e25955631de9765d96d1 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 4 Feb 2008 16:47:58 +0100 Subject: [PATCH 19/78] x86: use _ASM_EXTABLE macro in include/asm-x86/system.h Use the _ASM_EXTABLE macro from , instead of open-coding __ex_table entires in include/asm-x86/system.h. Signed-off-by: H. 
Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/asm-x86/system.h | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/include/asm-x86/system.h b/include/asm-x86/system.h index ee32ef9367f4..428348e990bf 100644 --- a/include/asm-x86/system.h +++ b/include/asm-x86/system.h @@ -130,10 +130,7 @@ extern void load_gs_index(unsigned); "movl %k1, %%" #seg "\n\t" \ "jmp 2b\n" \ ".previous\n" \ - ".section __ex_table,\"a\"\n\t" \ - _ASM_ALIGN "\n\t" \ - _ASM_PTR " 1b,3b\n" \ - ".previous" \ + _ASM_EXTABLE(1b,3b) \ : :"r" (value), "r" (0)) @@ -214,12 +211,10 @@ static inline unsigned long native_read_cr4_safe(void) /* This could fault if %cr4 does not exist. In x86_64, a cr4 always * exists, so it will never fail. */ #ifdef CONFIG_X86_32 - asm volatile("1: mov %%cr4, %0 \n" - "2: \n" - ".section __ex_table,\"a\" \n" - ".long 1b,2b \n" - ".previous \n" - : "=r" (val), "=m" (__force_order) : "0" (0)); + asm volatile("1: mov %%cr4, %0\n" + "2:\n" + _ASM_EXTABLE(1b,2b) + : "=r" (val), "=m" (__force_order) : "0" (0)); #else val = native_read_cr4(); #endif From 14e6d17d683c02c114fccdde3a867033e8781416 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 4 Feb 2008 16:47:59 +0100 Subject: [PATCH 20/78] x86: use _ASM_EXTABLE macro in include/asm-x86/uaccess_32.h Use the _ASM_EXTABLE macro from , instead of open-coding __ex_table entires in include/asm-x86/uaccess_32.h. Signed-off-by: H. 
Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/asm-x86/uaccess_32.h | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/include/asm-x86/uaccess_32.h b/include/asm-x86/uaccess_32.h index d2a4f7be9c2c..fcc570ec4fee 100644 --- a/include/asm-x86/uaccess_32.h +++ b/include/asm-x86/uaccess_32.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #define VERIFY_READ 0 @@ -287,11 +288,8 @@ extern void __put_user_8(void); "4: movl %3,%0\n" \ " jmp 3b\n" \ ".previous\n" \ - ".section __ex_table,\"a\"\n" \ - " .align 4\n" \ - " .long 1b,4b\n" \ - " .long 2b,4b\n" \ - ".previous" \ + _ASM_EXTABLE(1b,4b) \ + _ASM_EXTABLE(2b,4b) \ : "=r"(err) \ : "A" (x), "r" (addr), "i"(-EFAULT), "0"(err)) @@ -338,10 +336,7 @@ struct __large_struct { unsigned long buf[100]; }; "3: movl %3,%0\n" \ " jmp 2b\n" \ ".previous\n" \ - ".section __ex_table,\"a\"\n" \ - " .align 4\n" \ - " .long 1b,3b\n" \ - ".previous" \ + _ASM_EXTABLE(1b,3b) \ : "=r"(err) \ : ltype (x), "m"(__m(addr)), "i"(errret), "0"(err)) @@ -378,10 +373,7 @@ do { \ " xor"itype" %"rtype"1,%"rtype"1\n" \ " jmp 2b\n" \ ".previous\n" \ - ".section __ex_table,\"a\"\n" \ - " .align 4\n" \ - " .long 1b,3b\n" \ - ".previous" \ + _ASM_EXTABLE(1b,3b) \ : "=r"(err), ltype (x) \ : "m"(__m(addr)), "i"(errret), "0"(err)) From 71713eeed0c90bb05c509388609223555575f558 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 4 Feb 2008 16:47:59 +0100 Subject: [PATCH 21/78] x86: use _ASM_EXTABLE macro in include/asm-x86/uaccess_64.h Use the _ASM_EXTABLE macro from , instead of open-coding __ex_table entires in include/asm-x86/uaccess_64.h. Signed-off-by: H. 
Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/asm-x86/uaccess_64.h | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/include/asm-x86/uaccess_64.h b/include/asm-x86/uaccess_64.h index 31d794702719..b87eb4ba8f9d 100644 --- a/include/asm-x86/uaccess_64.h +++ b/include/asm-x86/uaccess_64.h @@ -181,10 +181,7 @@ struct __large_struct { unsigned long buf[100]; }; "3: mov %3,%0\n" \ " jmp 2b\n" \ ".previous\n" \ - ".section __ex_table,\"a\"\n" \ - " .align 8\n" \ - " .quad 1b,3b\n" \ - ".previous" \ + _ASM_EXTABLE(1b,3b) \ : "=r"(err) \ : ltype (x), "m"(__m(addr)), "i"(errno), "0"(err)) @@ -226,10 +223,7 @@ do { \ " xor"itype" %"rtype"1,%"rtype"1\n" \ " jmp 2b\n" \ ".previous\n" \ - ".section __ex_table,\"a\"\n" \ - " .align 8\n" \ - " .quad 1b,3b\n" \ - ".previous" \ + _ASM_EXTABLE(1b,3b) \ : "=r"(err), ltype (x) \ : "m"(__m(addr)), "i"(errno), "0"(err)) From 2347d933b158932cf2b8aeebae3e5cc16b200bd1 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 4 Feb 2008 16:47:59 +0100 Subject: [PATCH 22/78] x86: cpuid: allow querying %ecx-sensitive CPUID levels After /dev/*/cpuid was introduced, Intel changed the semantics of the CPUID instruction to be sensitive to %ecx as well as %eax. This patch allows querying of %ecx-sensitive levels by placing the %ecx value in the upper 32 bits of the file position (the lower 32 bits were always used for the %eax value). Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpuid.c | 45 ++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index a63432d800f9..c10ebc4b8e4b 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -1,6 +1,6 @@ /* ----------------------------------------------------------------------- * - * - * Copyright 2000 H. Peter Anvin - All Rights Reserved + * + * Copyright 2000-2008 H. 
Peter Anvin - All Rights Reserved * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -17,6 +17,10 @@ * and then read in chunks of 16 bytes. A larger size means multiple * reads of consecutive levels. * + * The lower 32 bits of the file position is used as the incoming %eax, + * and the upper 32 bits of the file position as the incoming %ecx, + * the latter intended for "counting" eax levels like eax=4. + * * This driver uses /dev/cpu/%d/cpuid where %d is the minor number, and on * an SMP box will direct the access to CPU %d. */ @@ -43,27 +47,16 @@ static struct class *cpuid_class; -struct cpuid_command { - u32 reg; - u32 *data; +struct cpuid_regs { + u32 eax, ebx, ecx, edx; }; static void cpuid_smp_cpuid(void *cmd_block) { - struct cpuid_command *cmd = cmd_block; + struct cpuid_regs *cmd = (struct cpuid_regs *)cmd_block; - cpuid(cmd->reg, &cmd->data[0], &cmd->data[1], &cmd->data[2], - &cmd->data[3]); -} - -static inline void do_cpuid(int cpu, u32 reg, u32 * data) -{ - struct cpuid_command cmd; - - cmd.reg = reg; - cmd.data = data; - - smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1); + cpuid_count(cmd->eax, cmd->ecx, + &cmd->eax, &cmd->ebx, &cmd->ecx, &cmd->edx); } static loff_t cpuid_seek(struct file *file, loff_t offset, int orig) @@ -93,19 +86,21 @@ static ssize_t cpuid_read(struct file *file, char __user *buf, size_t count, loff_t * ppos) { char __user *tmp = buf; - u32 data[4]; - u32 reg = *ppos; + struct cpuid_regs cmd; int cpu = iminor(file->f_path.dentry->d_inode); + u64 pos = *ppos; if (count % 16) return -EINVAL; /* Invalid chunk size */ for (; count; count -= 16) { - do_cpuid(cpu, reg, data); - if (copy_to_user(tmp, &data, 16)) + cmd.eax = pos; + cmd.ecx = pos >> 32; + smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1); + if (copy_to_user(tmp, &cmd, 16)) return -EFAULT; tmp += 16; - *ppos = reg++; + *ppos = ++pos; } return tmp - buf; @@ 
-193,7 +188,7 @@ static int __init cpuid_init(void) } for_each_online_cpu(i) { err = cpuid_device_create(i); - if (err != 0) + if (err != 0) goto out_class; } register_hotcpu_notifier(&cpuid_class_cpu_notifier); @@ -208,7 +203,7 @@ out_class: } class_destroy(cpuid_class); out_chrdev: - unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); + unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); out: return err; } From 2b06ac867176d5d24757bda7e13f6255d6b96d7b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 4 Feb 2008 16:47:59 +0100 Subject: [PATCH 23/78] x86: cpuid, msr: use inode mutex instead of big kernel lock Instead of grabbing the BKL on seek, use the inode mutex in the style of generic_file_llseek(). Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpuid.c | 7 +++---- arch/x86/kernel/msr.c | 14 +++++++++----- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index c10ebc4b8e4b..288e7a6598ac 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -62,9 +62,9 @@ static void cpuid_smp_cpuid(void *cmd_block) static loff_t cpuid_seek(struct file *file, loff_t offset, int orig) { loff_t ret; + struct inode *inode = file->f_mapping->host; - lock_kernel(); - + mutex_lock(&inode->i_mutex); switch (orig) { case 0: file->f_pos = offset; @@ -77,8 +77,7 @@ static loff_t cpuid_seek(struct file *file, loff_t offset, int orig) default: ret = -EINVAL; } - - unlock_kernel(); + mutex_unlock(&inode->i_mutex); return ret; } diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index bd82850e6519..af51ea8400b2 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -1,6 +1,6 @@ /* ----------------------------------------------------------------------- * - * - * Copyright 2000 H. Peter Anvin - All Rights Reserved + * + * Copyright 2000-2008 H. 
Peter Anvin - All Rights Reserved * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -45,9 +45,10 @@ static struct class *msr_class; static loff_t msr_seek(struct file *file, loff_t offset, int orig) { - loff_t ret = -EINVAL; + loff_t ret; + struct inode *inode = file->f_mapping->host; - lock_kernel(); + mutex_lock(&inode->i_mutex); switch (orig) { case 0: file->f_pos = offset; @@ -56,8 +57,11 @@ static loff_t msr_seek(struct file *file, loff_t offset, int orig) case 1: file->f_pos += offset; ret = file->f_pos; + break; + default: + ret = -EINVAL; } - unlock_kernel(); + mutex_unlock(&inode->i_mutex); return ret; } From 84fb144b928744cea8eb39bb4fbc794fcb749175 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 4 Feb 2008 16:48:00 +0100 Subject: [PATCH 24/78] x86: reintroduce volatile keyword in prototype to clflush() The volatile keyword was removed from the clflush() prototype in commit e34907ae180f4fe6c28bb4516c679c2f81b0c9ed; the comment there states: x86: remove volatile keyword from clflush. the p parameter is an explicit memory reference, and is enough to prevent gcc to being nasty here. The volatile seems completely not needed. This reflects incorrect understanding of the function of the volatile keyword there. The purpose of the volatile keyword is informing gcc that it is safe to pass a volatile pointer to this function. Signed-off-by: H. 
Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/asm-x86/system.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/asm-x86/system.h b/include/asm-x86/system.h index 428348e990bf..e9c15c97dfe2 100644 --- a/include/asm-x86/system.h +++ b/include/asm-x86/system.h @@ -271,9 +271,9 @@ static inline void native_wbinvd(void) #endif /* __KERNEL__ */ -static inline void clflush(void *__p) +static inline void clflush(volatile void *__p) { - asm volatile("clflush %0" : "+m" (*(char __force *)__p)); + asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p)); } #define nop() __asm__ __volatile__ ("nop") From fa1408e4df53ec1e61f59c030b3488a1ef0c635d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 4 Feb 2008 16:48:00 +0100 Subject: [PATCH 25/78] x86: unify CPU feature string names Move the CPU feature string names to a separate file (common to 32 and 64 bits); additionally, make includable by host code in preparation for including the CPU feature strings in the boot code. Signed-off-by: H. 
Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/Makefile | 1 + arch/x86/kernel/cpu/feature_names.c | 83 +++++++++++++++++++++++++++++ arch/x86/kernel/cpu/proc.c | 74 ------------------------- arch/x86/kernel/setup_64.c | 76 -------------------------- include/asm-x86/cpufeature.h | 12 +++-- 5 files changed, 93 insertions(+), 153 deletions(-) create mode 100644 arch/x86/kernel/cpu/feature_names.c diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index cfdb2f3bd763..a0c4d7c5dbd7 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -3,6 +3,7 @@ # obj-y := intel_cacheinfo.o addon_cpuid_features.o +obj-y += feature_names.o obj-$(CONFIG_X86_32) += common.o proc.o bugs.o obj-$(CONFIG_X86_32) += amd.o diff --git a/arch/x86/kernel/cpu/feature_names.c b/arch/x86/kernel/cpu/feature_names.c new file mode 100644 index 000000000000..ee975ac6bbcb --- /dev/null +++ b/arch/x86/kernel/cpu/feature_names.c @@ -0,0 +1,83 @@ +/* + * Strings for the various x86 capability flags. + * + * This file must not contain any executable code. + */ + +#include "asm/cpufeature.h" + +/* + * These flag bits must match the definitions in . + * NULL means this bit is undefined or reserved; either way it doesn't + * have meaning as far as Linux is concerned. Note that it's important + * to realize there is a difference between this table and CPUID -- if + * applications want to get the raw CPUID data, they should access + * /dev/cpu//cpuid instead. 
+ */ +const char * const x86_cap_flags[NCAPINTS*32] = { + /* Intel-defined */ + "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", + "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", + "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", + "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe", + + /* AMD-defined */ + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, + NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL, + NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm", + "3dnowext", "3dnow", + + /* Transmeta-defined */ + "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + /* Other (Linux-defined) */ + "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr", + NULL, NULL, NULL, NULL, + "constant_tsc", "up", NULL, "arch_perfmon", + "pebs", "bts", NULL, NULL, + "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + /* Intel-defined (#2) */ + "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", + "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, + NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt", + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + /* VIA/Cyrix/Centaur-defined */ + NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en", + "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + /* AMD-defined (#2) */ + "lahf_lm", "cmp_legacy", "svm", "extapic", + "cr8_legacy", "abm", "sse4a", "misalignsse", + "3dnowprefetch", "osvw", "ibs", "sse5", + "skinit", "wdt", NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + /* Auxiliary (Linux-defined) */ + "ida", NULL, NULL, NULL, NULL, NULL, 
NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, +}; + +const char *const x86_power_flags[32] = { + "ts", /* temperature sensor */ + "fid", /* frequency id control */ + "vid", /* voltage id control */ + "ttp", /* thermal trip */ + "tm", + "stc", + "100mhzsteps", + "hwpstate", + "", /* tsc invariant mapped to constant_tsc */ + /* nothing */ +}; diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 028213260148..af11d31dce0a 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -10,80 +10,6 @@ */ static int show_cpuinfo(struct seq_file *m, void *v) { - /* - * These flag bits must match the definitions in . - * NULL means this bit is undefined or reserved; either way it doesn't - * have meaning as far as Linux is concerned. Note that it's important - * to realize there is a difference between this table and CPUID -- if - * applications want to get the raw CPUID data, they should access - * /dev/cpu//cpuid instead. 
- */ - static const char * const x86_cap_flags[] = { - /* Intel-defined */ - "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", - "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", - "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", - "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe", - - /* AMD-defined */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, - NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL, - NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm", - "3dnowext", "3dnow", - - /* Transmeta-defined */ - "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Other (Linux-defined) */ - "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr", - NULL, NULL, NULL, NULL, - "constant_tsc", "up", NULL, "arch_perfmon", - "pebs", "bts", NULL, "sync_rdtsc", - "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Intel-defined (#2) */ - "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", - "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, - NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt", - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* VIA/Cyrix/Centaur-defined */ - NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en", - "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* AMD-defined (#2) */ - "lahf_lm", "cmp_legacy", "svm", "extapic", - "cr8_legacy", "abm", "sse4a", "misalignsse", - "3dnowprefetch", "osvw", "ibs", "sse5", - "skinit", "wdt", NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Auxiliary (Linux-defined) */ - "ida", NULL, NULL, NULL, NULL, 
NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - }; - static const char * const x86_power_flags[] = { - "ts", /* temperature sensor */ - "fid", /* frequency id control */ - "vid", /* voltage id control */ - "ttp", /* thermal trip */ - "tm", - "stc", - "100mhzsteps", - "hwpstate", - "", /* constant_tsc - moved to flags */ - /* nothing */ - }; struct cpuinfo_x86 *c = v; int i, n = 0; int fpu_exception; diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 18df70c534b9..c8939dfddfba 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -1068,82 +1068,6 @@ static int show_cpuinfo(struct seq_file *m, void *v) struct cpuinfo_x86 *c = v; int cpu = 0, i; - /* - * These flag bits must match the definitions in . - * NULL means this bit is undefined or reserved; either way it doesn't - * have meaning as far as Linux is concerned. Note that it's important - * to realize there is a difference between this table and CPUID -- if - * applications want to get the raw CPUID data, they should access - * /dev/cpu//cpuid instead. 
- */ - static const char *const x86_cap_flags[] = { - /* Intel-defined */ - "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", - "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", - "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", - "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe", - - /* AMD-defined */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, "nx", NULL, "mmxext", NULL, - NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm", - "3dnowext", "3dnow", - - /* Transmeta-defined */ - "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Other (Linux-defined) */ - "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr", - NULL, NULL, NULL, NULL, - "constant_tsc", "up", NULL, "arch_perfmon", - "pebs", "bts", NULL, "sync_rdtsc", - "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Intel-defined (#2) */ - "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", - "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, - NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt", - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* VIA/Cyrix/Centaur-defined */ - NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en", - "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* AMD-defined (#2) */ - "lahf_lm", "cmp_legacy", "svm", "extapic", - "cr8_legacy", "abm", "sse4a", "misalignsse", - "3dnowprefetch", "osvw", "ibs", "sse5", - "skinit", "wdt", NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Auxiliary (Linux-defined) */ - "ida", NULL, NULL, NULL, NULL, 
NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - }; - static const char *const x86_power_flags[] = { - "ts", /* temperature sensor */ - "fid", /* frequency id control */ - "vid", /* voltage id control */ - "ttp", /* thermal trip */ - "tm", - "stc", - "100mhzsteps", - "hwpstate", - "", /* tsc invariant mapped to constant_tsc */ - /* nothing */ - }; - - #ifdef CONFIG_SMP cpu = c->cpu_index; #endif diff --git a/include/asm-x86/cpufeature.h b/include/asm-x86/cpufeature.h index 3fb7dfa7fc91..3adc9cf0f391 100644 --- a/include/asm-x86/cpufeature.h +++ b/include/asm-x86/cpufeature.h @@ -4,9 +4,6 @@ #ifndef _ASM_X86_CPUFEATURE_H #define _ASM_X86_CPUFEATURE_H -#ifndef __ASSEMBLY__ -#include -#endif #include #define NCAPINTS 8 /* N 32-bit words worth of info */ @@ -115,6 +112,13 @@ */ #define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */ +#if defined(__KERNEL__) && !defined(__ASSEMBLY__) + +#include + +extern const char * const x86_cap_flags[NCAPINTS*32]; +extern const char * const x86_power_flags[32]; + #define cpu_has(c, bit) \ (__builtin_constant_p(bit) && \ ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) || \ @@ -204,4 +208,6 @@ #endif /* CONFIG_X86_64 */ +#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */ + #endif /* _ASM_X86_CPUFEATURE_H */ From f0be6c6a697c2fe8e2efbe98cd157bdbcff969ae Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 4 Feb 2008 16:48:00 +0100 Subject: [PATCH 26/78] x86 setup: print missing CPU features in cleartext Instead of obscure numbers, print the list of missing CPU features in cleartext. To conserve space, use a host program (mkcpustr.c) to produce a compact list of mandatory features only. Signed-off-by: H. 
Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/boot/Makefile | 16 +++++++++---- arch/x86/boot/cpu.c | 26 +++++++++++++++++---- arch/x86/boot/mkcpustr.c | 49 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 82 insertions(+), 9 deletions(-) create mode 100644 arch/x86/boot/mkcpustr.c diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 254a58398a67..f88458e83ef0 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -26,7 +26,7 @@ SVGA_MODE := -DSVGA_MODE=NORMAL_VGA #RAMDISK := -DRAMDISK=512 targets := vmlinux.bin setup.bin setup.elf zImage bzImage -subdir- := compressed +subdir- := compressed setup-y += a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o setup-y += header.o main.o mca.o memory.o pm.o pmjump.o @@ -43,9 +43,17 @@ setup-y += video-vesa.o setup-y += video-bios.o targets += $(setup-y) -hostprogs-y := tools/build +hostprogs-y := mkcpustr tools/build -HOSTCFLAGS_build.o := $(LINUXINCLUDE) +HOST_EXTRACFLAGS += $(LINUXINCLUDE) + +$(obj)/cpu.o: $(obj)/cpustr.h + +quiet_cmd_cpustr = CPUSTR $@ + cmd_cpustr = $(obj)/mkcpustr > $@ +targets += cpustr.h +$(obj)/cpustr.h: $(obj)/mkcpustr FORCE + $(call if_changed,cpustr) # --------------------------------------------------------------------------- @@ -98,7 +106,7 @@ $(obj)/compressed/vmlinux: FORCE $(Q)$(MAKE) $(build)=$(obj)/compressed IMAGE_OFFSET=$(IMAGE_OFFSET) $@ # Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel -FDARGS = +FDARGS = # Set this if you want an initrd included with the zdisk/fdimage/isoimage kernel FDINITRD = diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c index 2a5c32da5852..00e19edd852c 100644 --- a/arch/x86/boot/cpu.c +++ b/arch/x86/boot/cpu.c @@ -1,7 +1,7 @@ /* -*- linux-c -*- ------------------------------------------------------- * * * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2007-2008 rPath, Inc. 
- All Rights Reserved * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -9,7 +9,7 @@ * ----------------------------------------------------------------------- */ /* - * arch/i386/boot/cpu.c + * arch/x86/boot/cpu.c * * Check for obligatory CPU features and abort if the features are not * present. @@ -19,6 +19,8 @@ #include "bitops.h" #include +#include "cpustr.h" + static char *cpu_name(int level) { static char buf[6]; @@ -35,6 +37,7 @@ int validate_cpu(void) { u32 *err_flags; int cpu_level, req_level; + const unsigned char *msg_strs; check_cpu(&cpu_level, &req_level, &err_flags); @@ -51,13 +54,26 @@ int validate_cpu(void) puts("This kernel requires the following features " "not present on the CPU:\n"); + msg_strs = (const unsigned char *)x86_cap_strs; + for (i = 0; i < NCAPINTS; i++) { u32 e = err_flags[i]; for (j = 0; j < 32; j++) { - if (e & 1) - printf("%d:%d ", i, j); - + int n = (i << 5)+j; + if (*msg_strs < n) { + /* Skip to the next string */ + do { + msg_strs++; + } while (*msg_strs); + msg_strs++; + } + if (e & 1) { + if (*msg_strs == n && msg_strs[1]) + printf("%s ", msg_strs+1); + else + printf("%d:%d ", i, j); + } e >>= 1; } } diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c new file mode 100644 index 000000000000..bbe76953bae9 --- /dev/null +++ b/arch/x86/boot/mkcpustr.c @@ -0,0 +1,49 @@ +/* ----------------------------------------------------------------------- * + * + * Copyright 2008 rPath, Inc. - All Rights Reserved + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2 or (at your + * option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * This is a host program to preprocess the CPU strings into a + * compact format suitable for the setup code. 
+ */ + +#include + +#include "../kernel/cpu/feature_names.c" + +#if NCAPFLAGS > 8 +# error "Need to adjust the boot code handling of CPUID strings" +#endif + +int main(void) +{ + int i; + const char *str; + + printf("static const char x86_cap_strs[] = \n"); + + for (i = 0; i < NCAPINTS*32; i++) { + str = x86_cap_flags[i]; + + if (i == NCAPINTS*32-1) { + /* The last entry must be unconditional; this + also consumes the compiler-added null character */ + if (!str) + str = ""; + printf("\t\"\\x%02x\"\"%s\"\n", i, str); + } else if (str) { + printf("#if REQUIRED_MASK%d & (1 << %d)\n" + "\t\"\\x%02x\"\"%s\\0\"\n" + "#endif\n", + i >> 5, i & 31, i, str); + } + } + printf("\t;\n"); + return 0; +} From e1adbcf10608c83de6a81a02ebce859611433b52 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 4 Feb 2008 16:48:00 +0100 Subject: [PATCH 27/78] asm-generic/tlb.h: remove Remove unused from ; per Christoph Lameter this should have been part of a previous patch reversal but apparently didn't get removed. Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/asm-generic/tlb.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 75f2bfab614f..f490e43a90b9 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -14,7 +14,6 @@ #define _ASM_GENERIC__TLB_H #include -#include #include #include From 9a6b344ea967efa0bb5ca4cb5405f840652b66c4 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Mon, 4 Feb 2008 16:48:01 +0100 Subject: [PATCH 28/78] x86: remove long dead cyrix mtrr code cyrix_arr_init was #if 0 all the way back to at least v2.6.12. This was the only place where arr3_protected was set to anything but zero. Eliminate this variable. 
Signed-off-by: Harvey Harrison Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/mtrr/cyrix.c | 107 ------------------------------- arch/x86/kernel/cpu/mtrr/main.c | 12 ---- 2 files changed, 119 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index 8e139c70f888..ff14c320040c 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -7,8 +7,6 @@ #include #include "mtrr.h" -int arr3_protected; - static void cyrix_get_arr(unsigned int reg, unsigned long *base, unsigned long *size, mtrr_type * type) @@ -99,8 +97,6 @@ cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg) case 4: return replace_reg; case 3: - if (arr3_protected) - break; case 2: case 1: case 0: @@ -115,8 +111,6 @@ cyrix_get_free_region(unsigned long base, unsigned long size, int replace_reg) } else { for (i = 0; i < 7; i++) { cyrix_get_arr(i, &lbase, &lsize, <ype); - if ((i == 3) && arr3_protected) - continue; if (lsize == 0) return i; } @@ -260,107 +254,6 @@ static void cyrix_set_all(void) post_set(); } -#if 0 -/* - * On Cyrix 6x86(MX) and M II the ARR3 is special: it has connection - * with the SMM (System Management Mode) mode. So we need the following: - * Check whether SMI_LOCK (CCR3 bit 0) is set - * if it is set, write a warning message: ARR3 cannot be changed! 
- * (it cannot be changed until the next processor reset) - * if it is reset, then we can change it, set all the needed bits: - * - disable access to SMM memory through ARR3 range (CCR1 bit 7 reset) - * - disable access to SMM memory (CCR1 bit 2 reset) - * - disable SMM mode (CCR1 bit 1 reset) - * - disable write protection of ARR3 (CCR6 bit 1 reset) - * - (maybe) disable ARR3 - * Just to be sure, we enable ARR usage by the processor (CCR5 bit 5 set) - */ -static void __init -cyrix_arr_init(void) -{ - struct set_mtrr_context ctxt; - unsigned char ccr[7]; - int ccrc[7] = { 0, 0, 0, 0, 0, 0, 0 }; -#ifdef CONFIG_SMP - int i; -#endif - - /* flush cache and enable MAPEN */ - set_mtrr_prepare_save(&ctxt); - set_mtrr_cache_disable(&ctxt); - - /* Save all CCRs locally */ - ccr[0] = getCx86(CX86_CCR0); - ccr[1] = getCx86(CX86_CCR1); - ccr[2] = getCx86(CX86_CCR2); - ccr[3] = ctxt.ccr3; - ccr[4] = getCx86(CX86_CCR4); - ccr[5] = getCx86(CX86_CCR5); - ccr[6] = getCx86(CX86_CCR6); - - if (ccr[3] & 1) { - ccrc[3] = 1; - arr3_protected = 1; - } else { - /* Disable SMM mode (bit 1), access to SMM memory (bit 2) and - * access to SMM memory through ARR3 (bit 7). - */ - if (ccr[1] & 0x80) { - ccr[1] &= 0x7f; - ccrc[1] |= 0x80; - } - if (ccr[1] & 0x04) { - ccr[1] &= 0xfb; - ccrc[1] |= 0x04; - } - if (ccr[1] & 0x02) { - ccr[1] &= 0xfd; - ccrc[1] |= 0x02; - } - arr3_protected = 0; - if (ccr[6] & 0x02) { - ccr[6] &= 0xfd; - ccrc[6] = 1; /* Disable write protection of ARR3 */ - setCx86(CX86_CCR6, ccr[6]); - } - /* Disable ARR3. This is safe now that we disabled SMM. */ - /* cyrix_set_arr_up (3, 0, 0, 0, FALSE); */ - } - /* If we changed CCR1 in memory, change it in the processor, too. 
*/ - if (ccrc[1]) - setCx86(CX86_CCR1, ccr[1]); - - /* Enable ARR usage by the processor */ - if (!(ccr[5] & 0x20)) { - ccr[5] |= 0x20; - ccrc[5] = 1; - setCx86(CX86_CCR5, ccr[5]); - } -#ifdef CONFIG_SMP - for (i = 0; i < 7; i++) - ccr_state[i] = ccr[i]; - for (i = 0; i < 8; i++) - cyrix_get_arr(i, - &arr_state[i].base, &arr_state[i].size, - &arr_state[i].type); -#endif - - set_mtrr_done(&ctxt); /* flush cache and disable MAPEN */ - - if (ccrc[5]) - printk(KERN_INFO "mtrr: ARR usage was not enabled, enabled manually\n"); - if (ccrc[3]) - printk(KERN_INFO "mtrr: ARR3 cannot be changed\n"); -/* - if ( ccrc[1] & 0x80) printk ("mtrr: SMM memory access through ARR3 disabled\n"); - if ( ccrc[1] & 0x04) printk ("mtrr: SMM memory access disabled\n"); - if ( ccrc[1] & 0x02) printk ("mtrr: SMM mode disabled\n"); -*/ - if (ccrc[6]) - printk(KERN_INFO "mtrr: ARR3 was write protected, unprotected\n"); -} -#endif - static struct mtrr_ops cyrix_mtrr_ops = { .vendor = X86_VENDOR_CYRIX, // .init = cyrix_arr_init, diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 715919582657..822d8f90c1eb 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -59,12 +59,6 @@ struct mtrr_ops * mtrr_if = NULL; static void set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type); -#ifndef CONFIG_X86_64 -extern int arr3_protected; -#else -#define arr3_protected 0 -#endif - void set_mtrr_ops(struct mtrr_ops * ops) { if (ops->vendor && ops->vendor < X86_VENDOR_NUM) @@ -513,12 +507,6 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size) printk(KERN_WARNING "mtrr: register: %d too big\n", reg); goto out; } - if (is_cpu(CYRIX) && !use_intel()) { - if ((reg == 3) && arr3_protected) { - printk(KERN_WARNING "mtrr: ARR3 cannot be changed\n"); - goto out; - } - } mtrr_if->get(reg, &lbase, &lsize, <ype); if (lsize < 1) { printk(KERN_WARNING "mtrr: MTRR %d not used\n", reg); From 
9b4239346136f1432e52d14ea88f4b2662876f4a Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Mon, 4 Feb 2008 16:48:01 +0100 Subject: [PATCH 29/78] x86: sparse errors from string_32.h include/asm/string_32.h:216:26: warning: cast truncates bits from constant value (cccccccc becomes cc) include/asm/string_32.h:219:27: warning: cast truncates bits from constant value (cccccccc becomes cccc) include/asm/string_32.h:222:27: warning: cast truncates bits from constant value (cccccccc becomes cccc) include/asm/string_32.h:223:30: warning: cast truncates bits from constant value (cccccccc becomes cc) Signed-off-by: Harvey Harrison Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/asm-x86/string_32.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/asm-x86/string_32.h b/include/asm-x86/string_32.h index 55bfa308f900..c5d13a86dea7 100644 --- a/include/asm-x86/string_32.h +++ b/include/asm-x86/string_32.h @@ -213,14 +213,14 @@ static __always_inline void * __constant_c_and_count_memset(void * s, unsigned l case 0: return s; case 1: - *(unsigned char *)s = pattern; + *(unsigned char *)s = pattern & 0xff; return s; case 2: - *(unsigned short *)s = pattern; + *(unsigned short *)s = pattern & 0xffff; return s; case 3: - *(unsigned short *)s = pattern; - *(2+(unsigned char *)s) = pattern; + *(unsigned short *)s = pattern & 0xffff; + *(2+(unsigned char *)s) = pattern & 0xff; return s; case 4: *(unsigned long *)s = pattern; From 94a8a7acbe4d9aa83d53597516cc71101ebd2f6d Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Mon, 4 Feb 2008 16:48:01 +0100 Subject: [PATCH 30/78] x86: remove misleading comments in trampoline_*.S Both trampolines actually *do* set up stack. (Is the "we jump into compressed/head.S" comment still true?) 
Signed-off-by: Pavel Machek Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/trampoline_32.S | 7 +------ arch/x86/kernel/trampoline_64.S | 3 --- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S index 9bcc1c6aca3d..64580679861e 100644 --- a/arch/x86/kernel/trampoline_32.S +++ b/arch/x86/kernel/trampoline_32.S @@ -11,12 +11,7 @@ * trampoline page to make our stack and everything else * is a mystery. * - * In fact we don't actually need a stack so we don't - * set one up. - * - * We jump into the boot/compressed/head.S code. So you'd - * better be running a compressed kernel image or you - * won't get very far. + * We jump into arch/x86/kernel/head_32.S. * * On entry to trampoline_data, the processor is in real mode * with 16-bit addressing and 16-bit data. CS has some value diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S index e30b67c6a9f5..4aedd0bcee4c 100644 --- a/arch/x86/kernel/trampoline_64.S +++ b/arch/x86/kernel/trampoline_64.S @@ -10,9 +10,6 @@ * trampoline page to make our stack and everything else * is a mystery. * - * In fact we don't actually need a stack so we don't - * set one up. - * * On entry to trampoline_data, the processor is in real mode * with 16-bit addressing and 16-bit data. CS has some value * and IP is zero. 
Thus, data addresses need to be absolute From c66315e0a785e95884b23887c1aa479dc0b32beb Mon Sep 17 00:00:00 2001 From: Rob Landley Date: Mon, 4 Feb 2008 16:48:02 +0100 Subject: [PATCH 31/78] documentation: add Documentation/x86-64/00-INDEX Signed-off-by: Rob Landley Cc: Vojtech Pavlik Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- Documentation/x86_64/00-INDEX | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 Documentation/x86_64/00-INDEX diff --git a/Documentation/x86_64/00-INDEX b/Documentation/x86_64/00-INDEX new file mode 100644 index 000000000000..92fc20ab5f0e --- /dev/null +++ b/Documentation/x86_64/00-INDEX @@ -0,0 +1,16 @@ +00-INDEX + - This file +boot-options.txt + - AMD64-specific boot options. +cpu-hotplug-spec + - Firmware support for CPU hotplug under Linux/x86-64 +fake-numa-for-cpusets + - Using numa=fake and CPUSets for Resource Management +kernel-stacks + - Context-specific per-processor interrupt stacks. +machinecheck + - Configurable sysfs parameters for the x86-64 machine check code. +mm.txt + - Memory layout of x86-64 (4 level page tables, 46 bits physical). +uefi.txt + - Booting Linux via Unified Extensible Firmware Interface. From e618c9579c745742c422b7c3de1f802aa67e6110 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 4 Feb 2008 16:48:02 +0100 Subject: [PATCH 32/78] x86: unify PAE/non-PAE pgd_ctor The constructors for PAE and non-PAE pgd_ctors are more or less identical, and can be made into the same function. 
Signed-off-by: Jeremy Fitzhardinge Cc: William Irwin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/pgtable_32.c | 56 +++++++++++++++------------------------- 1 file changed, 21 insertions(+), 35 deletions(-) diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index cb3aa470249b..f34e33d18443 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -219,50 +219,39 @@ static inline void pgd_list_del(pgd_t *pgd) list_del(&page->lru); } +#define UNSHARED_PTRS_PER_PGD \ + (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD) - -#if (PTRS_PER_PMD == 1) -/* Non-PAE pgd constructor */ -static void pgd_ctor(void *pgd) +static void pgd_ctor(void *p) { + pgd_t *pgd = p; unsigned long flags; - /* !PAE, no pagetable sharing */ + /* Clear usermode parts of PGD */ memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); spin_lock_irqsave(&pgd_lock, flags); - /* must happen under lock */ - clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, - KERNEL_PGD_PTRS); - paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT, - __pa(swapper_pg_dir) >> PAGE_SHIFT, - USER_PTRS_PER_PGD, - KERNEL_PGD_PTRS); - pgd_list_add(pgd); - spin_unlock_irqrestore(&pgd_lock, flags); -} -#else /* PTRS_PER_PMD > 1 */ -/* PAE pgd constructor */ -static void pgd_ctor(void *pgd) -{ - /* PAE, kernel PMD may be shared */ - - if (SHARED_KERNEL_PMD) { - clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, + /* If the pgd points to a shared pagetable level (either the + ptes in non-PAE, or shared PMD in PAE), then just copy the + references from swapper_pg_dir. 
*/ + if (PAGETABLE_LEVELS == 2 || + (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) { + clone_pgd_range(pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, KERNEL_PGD_PTRS); - } else { - unsigned long flags; - - memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); - spin_lock_irqsave(&pgd_lock, flags); - pgd_list_add(pgd); - spin_unlock_irqrestore(&pgd_lock, flags); + paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT, + __pa(swapper_pg_dir) >> PAGE_SHIFT, + USER_PTRS_PER_PGD, + KERNEL_PGD_PTRS); } + + /* list required to sync kernel mapping updates */ + if (!SHARED_KERNEL_PMD) + pgd_list_add(pgd); + + spin_unlock_irqrestore(&pgd_lock, flags); } -#endif /* PTRS_PER_PMD */ static void pgd_dtor(void *pgd) { @@ -276,9 +265,6 @@ static void pgd_dtor(void *pgd) spin_unlock_irqrestore(&pgd_lock, flags); } -#define UNSHARED_PTRS_PER_PGD \ - (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD) - #ifdef CONFIG_X86_PAE /* * Mop up any pmd pages which may still be attached to the pgd. From a67ad9c9f82342a9b320fdad204a490727ef4a18 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 4 Feb 2008 16:48:02 +0100 Subject: [PATCH 33/78] x86: revert "defer cr3 reload when doing pud_clear()" Revert "defer cr3 reload when doing pud_clear()" since I'm going to replace it. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/pgtable_32.c | 7 ------- include/asm-x86/pgtable-3level.h | 21 ++++++--------------- 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index f34e33d18443..c7db504be1ea 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -373,13 +373,6 @@ void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) { - /* This is called just after the pmd has been detached from - the pgd, which requires a full tlb flush to be recognized - by the CPU. 
Rather than incurring multiple tlb flushes - while the address space is being pulled down, make the tlb - gathering machinery do a full flush when we're done. */ - tlb->fullmm = 1; - paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); tlb_remove_page(tlb, virt_to_page(pmd)); } diff --git a/include/asm-x86/pgtable-3level.h b/include/asm-x86/pgtable-3level.h index a195c3e757b9..ed4c6f0e57ec 100644 --- a/include/asm-x86/pgtable-3level.h +++ b/include/asm-x86/pgtable-3level.h @@ -96,23 +96,14 @@ static inline void pud_clear(pud_t *pudp) set_pud(pudp, __pud(0)); /* - * In principle we need to do a cr3 reload here to make sure - * the processor recognizes the changed pgd. In practice, all - * the places where pud_clear() gets called are followed by - * full tlb flushes anyway, so we can defer the cost here. + * Pentium-II erratum A13: in PAE mode we explicitly have to flush + * the TLB via cr3 if the top-level pgd is changed... * - * Specifically: - * - * mm/memory.c:free_pmd_range() - immediately after the - * pud_clear() it does a pmd_free_tlb(). We change the - * mmu_gather structure to do a full tlb flush (which has the - * effect of reloading cr3) when the pagetable free is - * complete. - * - * arch/x86/mm/hugetlbpage.c:huge_pmd_unshare() - the call to - * this is followed by a flush_tlb_range, which on x86 does a - * full tlb flush. + * XXX I don't think we need to worry about this here, since + * when clearing the pud, the calling code needs to flush the + * tlb anyway. But do it now for safety's sake. - jsgf */ + write_cr3(read_cr3()); } #define pud_page(pud) \ From edd6bcd8209c31b91e1fbc112a756475091c483d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 4 Feb 2008 16:48:02 +0100 Subject: [PATCH 34/78] x86: pud_clear: only reload cr3 if necessary Rather than unconditionally reloading cr3, only do so if the pud we're updating is within the active pgd. This eliminates TLB flushes most of the time. 
The performance-critical uses of pud_clear are during execve and exit, but in those cases cr3 is referring to some other pagetable. The only other use of pud_clear is during a large (1Gbyte+) munmap, and those are sufficiently rare that a couple of cr3 reloads won't hurt. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/asm-x86/pgtable-3level.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/include/asm-x86/pgtable-3level.h b/include/asm-x86/pgtable-3level.h index ed4c6f0e57ec..ad71960bca3a 100644 --- a/include/asm-x86/pgtable-3level.h +++ b/include/asm-x86/pgtable-3level.h @@ -93,17 +93,20 @@ static inline void native_pmd_clear(pmd_t *pmd) static inline void pud_clear(pud_t *pudp) { + unsigned long pgd; + set_pud(pudp, __pud(0)); /* * Pentium-II erratum A13: in PAE mode we explicitly have to flush * the TLB via cr3 if the top-level pgd is changed... * - * XXX I don't think we need to worry about this here, since - * when clearing the pud, the calling code needs to flush the - * tlb anyway. But do it now for safety's sake. - jsgf + * Make sure the pud entry we're updating is within the + * current pgd to avoid unnecessary TLB flushes. */ - write_cr3(read_cr3()); + pgd = read_cr3(); + if (__pa(pudp) >= pgd && __pa(pudp) < (pgd + sizeof(pgd_t)*PTRS_PER_PGD)) + write_cr3(pgd); } #define pud_page(pud) \ From f5430f93257d336346a9018c915e879ce43f5f89 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 4 Feb 2008 16:48:02 +0100 Subject: [PATCH 35/78] x86: update reference for PAE tlb flushing Remove bogus reference to "Pentium-II erratum A13" and point to the actual canonical source of information about what requirements x86 processors have for PAE pagetable updates. 
Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/asm-x86/pgalloc_32.h | 6 ++++-- include/asm-x86/pgtable-3level.h | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/include/asm-x86/pgalloc_32.h b/include/asm-x86/pgalloc_32.h index 7641e7b5d931..6c21ef951dab 100644 --- a/include/asm-x86/pgalloc_32.h +++ b/include/asm-x86/pgalloc_32.h @@ -80,8 +80,10 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT)); /* - * Pentium-II erratum A13: in PAE mode we explicitly have to flush - * the TLB via cr3 if the top-level pgd is changed... + * According to Intel App note "TLBs, Paging-Structure Caches, + * and Their Invalidation", April 2007, document 317080-001, + * section 8.1: in PAE mode we explicitly have to flush the + * TLB via cr3 if the top-level pgd is changed... */ if (mm == current->active_mm) write_cr3(read_cr3()); diff --git a/include/asm-x86/pgtable-3level.h b/include/asm-x86/pgtable-3level.h index ad71960bca3a..1d763eec740f 100644 --- a/include/asm-x86/pgtable-3level.h +++ b/include/asm-x86/pgtable-3level.h @@ -98,8 +98,10 @@ static inline void pud_clear(pud_t *pudp) set_pud(pudp, __pud(0)); /* - * Pentium-II erratum A13: in PAE mode we explicitly have to flush - * the TLB via cr3 if the top-level pgd is changed... + * According to Intel App note "TLBs, Paging-Structure Caches, + * and Their Invalidation", April 2007, document 317080-001, + * section 8.1: in PAE mode we explicitly have to flush the + * TLB via cr3 if the top-level pgd is changed... * * Make sure the pud entry we're updating is within the * current pgd to avoid unnecessary TLB flushes. 
From fa0c864d998c9c97d11db097d5736028d5c80985 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 4 Feb 2008 16:48:03 +0100 Subject: [PATCH 36/78] x86: cleanup - eliminate numbers in LDT allocation code This patch eliminates numbers in LDT allocation code trying to make it clear to understand from where these numbers come. No code changed: text data bss dec hex filename 1896 0 0 1896 768 ldt.o.before 1896 0 0 1896 768 ldt.o.after md5: 6cbec8705008ddb4b704aade60bceda3 ldt.o.before.asm 6cbec8705008ddb4b704aade60bceda3 ldt.o.after.asm Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/ldt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 8a7660c8394a..0224c3637c73 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -35,7 +35,8 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) if (mincount <= pc->size) return 0; oldsize = pc->size; - mincount = (mincount + 511) & (~511); + mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) & + (~(PAGE_SIZE / LDT_ENTRY_SIZE - 1)); if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE) newldt = vmalloc(mincount * LDT_ENTRY_SIZE); else From c7e844f0415252c7e1a2153a97e7a0c511d61ada Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 4 Feb 2008 16:48:03 +0100 Subject: [PATCH 37/78] x86: move NUMAQ io handling into arch/x86/pci/numa.c numa.c is the only user of the {in,out}*_quad functions. And it has only a few call sites. Change them to open code the magic NUMAQ port access. 
Signed-off-by: Andi Kleen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/pci/numa.c | 52 ++++++++++++++++++++++---- include/asm-x86/mach-numaq/mach_apic.h | 2 + 2 files changed, 46 insertions(+), 8 deletions(-) diff --git a/arch/x86/pci/numa.c b/arch/x86/pci/numa.c index f5f165f69e0c..55270c26237c 100644 --- a/arch/x86/pci/numa.c +++ b/arch/x86/pci/numa.c @@ -5,36 +5,62 @@ #include #include #include +#include #include "pci.h" +#define XQUAD_PORTIO_BASE 0xfe400000 +#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */ + #define BUS2QUAD(global) (mp_bus_id_to_node[global]) #define BUS2LOCAL(global) (mp_bus_id_to_local[global]) #define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local]) +extern void *xquad_portio; /* Where the IO area was mapped */ +#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port) + #define PCI_CONF1_MQ_ADDRESS(bus, devfn, reg) \ (0x80000000 | (BUS2LOCAL(bus) << 16) | (devfn << 8) | (reg & ~3)) +static void write_cf8(unsigned bus, unsigned devfn, unsigned reg) +{ + unsigned val = PCI_CONF1_MQ_ADDRESS(bus, devfn, reg); + if (xquad_portio) + writel(val, XQUAD_PORT_ADDR(0xcf8, BUS2QUAD(bus))); + else + outl(val, 0xCF8); +} + static int pci_conf1_mq_read(unsigned int seg, unsigned int bus, unsigned int devfn, int reg, int len, u32 *value) { unsigned long flags; + void *adr __iomem = XQUAD_PORT_ADDR(0xcfc, BUS2QUAD(bus)); if (!value || (bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255)) return -EINVAL; spin_lock_irqsave(&pci_config_lock, flags); - outl_quad(PCI_CONF1_MQ_ADDRESS(bus, devfn, reg), 0xCF8, BUS2QUAD(bus)); + write_cf8(bus, devfn, reg); switch (len) { case 1: - *value = inb_quad(0xCFC + (reg & 3), BUS2QUAD(bus)); + if (xquad_portio) + *value = readb(adr + (reg & 3)); + else + *value = inb(0xCFC + (reg & 3)); break; case 2: - *value = inw_quad(0xCFC + (reg & 2), BUS2QUAD(bus)); + if (xquad_portio) + *value = readw(adr + (reg & 2)); + else + *value = inw(0xCFC + (reg & 
2)); break; case 4: - *value = inl_quad(0xCFC, BUS2QUAD(bus)); + if (xquad_portio) + *value = readl(adr); + else + *value = inl(0xCFC); break; } @@ -47,23 +73,33 @@ static int pci_conf1_mq_write(unsigned int seg, unsigned int bus, unsigned int devfn, int reg, int len, u32 value) { unsigned long flags; + void *adr __iomem = XQUAD_PORT_ADDR(0xcfc, BUS2QUAD(bus)); if ((bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255)) return -EINVAL; spin_lock_irqsave(&pci_config_lock, flags); - outl_quad(PCI_CONF1_MQ_ADDRESS(bus, devfn, reg), 0xCF8, BUS2QUAD(bus)); + write_cf8(bus, devfn, reg); switch (len) { case 1: - outb_quad((u8)value, 0xCFC + (reg & 3), BUS2QUAD(bus)); + if (xquad_portio) + writeb(value, adr + (reg & 3)); + else + outb((u8)value, 0xCFC + (reg & 3)); break; case 2: - outw_quad((u16)value, 0xCFC + (reg & 2), BUS2QUAD(bus)); + if (xquad_portio) + writew(value, adr + (reg & 2)); + else + outw((u16)value, 0xCFC + (reg & 2)); break; case 4: - outl_quad((u32)value, 0xCFC, BUS2QUAD(bus)); + if (xquad_portio) + writel(value, adr + reg); + else + outl((u32)value, 0xCFC); break; } diff --git a/include/asm-x86/mach-numaq/mach_apic.h b/include/asm-x86/mach-numaq/mach_apic.h index 17e183bd39c1..3b637fac890b 100644 --- a/include/asm-x86/mach-numaq/mach_apic.h +++ b/include/asm-x86/mach-numaq/mach_apic.h @@ -109,6 +109,8 @@ static inline int mpc_apic_id(struct mpc_config_processor *m, return logical_apicid; } +extern void *xquad_portio; + static inline void setup_portio_remap(void) { int num_quads = num_online_nodes(); From 1fba38703d0ce8a5ff0fad9df3eccc6b55cf2cfb Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 4 Feb 2008 16:48:03 +0100 Subject: [PATCH 38/78] x86: remove special NUMAQ support in io_32.h Now that the only user does it on its own remove the NUMAQ support macros in io_32.h The next step would be to convert the preprocessor mess to actually readable standard inlines. 
Signed-off-by: Andi Kleen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/asm-x86/io_32.h | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/include/asm-x86/io_32.h b/include/asm-x86/io_32.h index 586d7aa54ceb..58d2c45cd0b1 100644 --- a/include/asm-x86/io_32.h +++ b/include/asm-x86/io_32.h @@ -275,29 +275,6 @@ static inline void slow_down_io(void) { #endif -#ifdef CONFIG_X86_NUMAQ -extern void *xquad_portio; /* Where the IO area was mapped */ -#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port) -#define __BUILDIO(bwl,bw,type) \ -static inline void out##bwl##_quad(unsigned type value, int port, int quad) { \ - if (xquad_portio) \ - write##bwl(value, XQUAD_PORT_ADDR(port, quad)); \ - else \ - out##bwl##_local(value, port); \ -} \ -static inline void out##bwl(unsigned type value, int port) { \ - out##bwl##_quad(value, port, 0); \ -} \ -static inline unsigned type in##bwl##_quad(int port, int quad) { \ - if (xquad_portio) \ - return read##bwl(XQUAD_PORT_ADDR(port, quad)); \ - else \ - return in##bwl##_local(port); \ -} \ -static inline unsigned type in##bwl(int port) { \ - return in##bwl##_quad(port, 0); \ -} -#else #define __BUILDIO(bwl,bw,type) \ static inline void out##bwl(unsigned type value, int port) { \ out##bwl##_local(value, port); \ @@ -305,8 +282,6 @@ static inline void out##bwl(unsigned type value, int port) { \ static inline unsigned type in##bwl(int port) { \ return in##bwl##_local(port); \ } -#endif - #define BUILDIO(bwl,bw,type) \ static inline void out##bwl##_local(unsigned type value, int port) { \ From 599db4fe23d3869af98e2addef5628faef550f60 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Mon, 4 Feb 2008 16:48:03 +0100 Subject: [PATCH 39/78] x86: remove final FASTCALL() uses A few snuck back in to x86. 
Signed-off-by: Harvey Harrison Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/asm-x86/highmem.h | 4 ++-- include/asm-x86/hw_irq_32.h | 2 +- include/asm-x86/system.h | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/asm-x86/highmem.h b/include/asm-x86/highmem.h index 13cdcd66fff2..c25cfcaab589 100644 --- a/include/asm-x86/highmem.h +++ b/include/asm-x86/highmem.h @@ -63,8 +63,8 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void * FASTCALL(kmap_high(struct page *page)); -extern void FASTCALL(kunmap_high(struct page *page)); +extern void *kmap_high(struct page *page); +extern void kunmap_high(struct page *page); void *kmap(struct page *page); void kunmap(struct page *page); diff --git a/include/asm-x86/hw_irq_32.h b/include/asm-x86/hw_irq_32.h index 6d65fbb6358b..ea88054e03f3 100644 --- a/include/asm-x86/hw_irq_32.h +++ b/include/asm-x86/hw_irq_32.h @@ -47,7 +47,7 @@ void enable_8259A_irq(unsigned int irq); int i8259A_irq_pending(unsigned int irq); void make_8259A_irq(unsigned int irq); void init_8259A(int aeoi); -void FASTCALL(send_IPI_self(int vector)); +void send_IPI_self(int vector); void init_VISWS_APIC_irqs(void); void setup_IO_APIC(void); void disable_IO_APIC(void); diff --git a/include/asm-x86/system.h b/include/asm-x86/system.h index e9c15c97dfe2..9cff02ffe6c2 100644 --- a/include/asm-x86/system.h +++ b/include/asm-x86/system.h @@ -20,8 +20,8 @@ #ifdef CONFIG_X86_32 struct task_struct; /* one of the stranger aspects of C forward declarations */ -extern struct task_struct *FASTCALL(__switch_to(struct task_struct *prev, - struct task_struct *next)); +struct task_struct *__switch_to(struct task_struct *prev, + struct task_struct *next); /* * Saving eflags is important. 
It switches not only IOPL between tasks, From 73bdb73f6666228289af4be55a77e2ed978061a7 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Mon, 4 Feb 2008 16:48:04 +0100 Subject: [PATCH 40/78] x86: add include to cpu/intel.c Fixes sparse warning: arch/x86/kernel/cpu/intel.c:48:15: warning: symbol 'ppro_with_ram_bug' was not declared. Should it be static? Signed-off-by: Harvey Harrison Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/intel.c | 1 + include/asm-x86/bugs.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index d1c372b018db..fae31ce747bd 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "cpu.h" diff --git a/include/asm-x86/bugs.h b/include/asm-x86/bugs.h index 3fcc30dc0731..021cbdd5f258 100644 --- a/include/asm-x86/bugs.h +++ b/include/asm-x86/bugs.h @@ -2,6 +2,6 @@ #define _ASM_X86_BUGS_H extern void check_bugs(void); -extern int ppro_with_ram_bug(void); +int ppro_with_ram_bug(void); #endif /* _ASM_X86_BUGS_H */ From e04f99c987a82f075fcc2bceda351d7610802a88 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Mon, 4 Feb 2008 16:48:04 +0100 Subject: [PATCH 41/78] x86: add function prototype to vm86.h Global functions should include their prototypes. 
Signed-off-by: Harvey Harrison Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/asm-x86/vm86.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/asm-x86/vm86.h b/include/asm-x86/vm86.h index a5edf517b992..c92fe4af52e8 100644 --- a/include/asm-x86/vm86.h +++ b/include/asm-x86/vm86.h @@ -195,6 +195,7 @@ struct kernel_vm86_struct { void handle_vm86_fault(struct kernel_vm86_regs *, long); int handle_vm86_trap(struct kernel_vm86_regs *, long, int); +struct pt_regs *save_v86_state(struct kernel_vm86_regs *); struct task_struct; void release_vm86_irqs(struct task_struct *); From 7bb308a1eae2a3b869c498017aed15a699d80799 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Mon, 4 Feb 2008 16:48:04 +0100 Subject: [PATCH 42/78] x86: small sparse fix in process_32.c arch/x86/kernel/process_32.c:254:43: warning: Using plain integer as NULL pointer Signed-off-by: Harvey Harrison Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/process_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 968371ab223a..dabdbeff1f77 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -251,7 +251,7 @@ void cpu_idle_wait(void) * because it has nothing to do. * Give all the remaining CPUS a kick. */ - smp_call_function_mask(map, do_nothing, 0, 0); + smp_call_function_mask(map, do_nothing, NULL, 0); } while (!cpus_empty(map)); set_cpus_allowed(current, tmp); From b6d549a2967881af4f02d02062acbfeb807d44b4 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Mon, 4 Feb 2008 16:48:04 +0100 Subject: [PATCH 43/78] x86: add cpu init function definitions to cpu.h cpu.h was already included everywhere needed. Fixes the following sparse warnings: arch/x86/kernel/cpu/amd.c:343:12: warning: symbol 'amd_init_cpu' was not declared. Should it be static? arch/x86/kernel/cpu/cyrix.c:444:12: warning: symbol 'cyrix_init_cpu' was not declared.
Should it be static? arch/x86/kernel/cpu/cyrix.c:456:12: warning: symbol 'nsc_init_cpu' was not declared. Should it be static? arch/x86/kernel/cpu/centaur.c:467:12: warning: symbol 'centaur_init_cpu' was not declared. Should it be static? arch/x86/kernel/cpu/transmeta.c:112:12: warning: symbol 'transmeta_init_cpu' was not declared. Should it be static? arch/x86/kernel/cpu/intel.c:296:12: warning: symbol 'intel_cpu_init' was not declared. Should it be static? arch/x86/kernel/cpu/nexgen.c:56:12: warning: symbol 'nexgen_init_cpu' was not declared. Should it be static? arch/x86/kernel/cpu/umc.c:22:12: warning: symbol 'umc_init_cpu' was not declared. Should it be static? Signed-off-by: Harvey Harrison Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/common.c | 10 ---------- arch/x86/kernel/cpu/cpu.h | 9 +++++++++ 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b7b2142b58e7..d9313d9adced 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -623,16 +623,6 @@ cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; * They will insert themselves into the cpu_devs structure. * Then, when cpu_init() is called, we can just iterate over that array. 
*/ - -extern int intel_cpu_init(void); -extern int cyrix_init_cpu(void); -extern int nsc_init_cpu(void); -extern int amd_init_cpu(void); -extern int centaur_init_cpu(void); -extern int transmeta_init_cpu(void); -extern int nexgen_init_cpu(void); -extern int umc_init_cpu(void); - void __init early_cpu_init(void) { intel_cpu_init(); diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index ad6527a5beb1..e0b38c33d842 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -27,3 +27,12 @@ extern void display_cacheinfo(struct cpuinfo_x86 *c); extern void early_init_intel(struct cpuinfo_x86 *c); extern void early_init_amd(struct cpuinfo_x86 *c); +/* Specific CPU type init functions */ +int intel_cpu_init(void); +int amd_init_cpu(void); +int cyrix_init_cpu(void); +int nsc_init_cpu(void); +int centaur_init_cpu(void); +int transmeta_init_cpu(void); +int nexgen_init_cpu(void); +int umc_init_cpu(void); From 16c02ed74361433a4fc5d8bd5f67abbac6e1c5ca Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Mon, 4 Feb 2008 16:48:05 +0100 Subject: [PATCH 44/78] x86: add cpu mtrr init function definitions to mtrr.h mtrr.h was included everywhere needed. Fixes the following sparse warnings. Also, the return types in the extern definitions were incorrect. arch/x86/kernel/cpu/mtrr/amd.c:113:12: warning: symbol 'amd_init_mtrr' was not declared. Should it be static? arch/x86/kernel/cpu/mtrr/cyrix.c:268:12: warning: symbol 'cyrix_init_mtrr' was not declared. Should it be static? arch/x86/kernel/cpu/mtrr/centaur.c:218:12: warning: symbol 'centaur_init_mtrr' was not declared. Should it be static? 
Signed-off-by: Harvey Harrison Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/mtrr/main.c | 4 ---- arch/x86/kernel/cpu/mtrr/mtrr.h | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 822d8f90c1eb..1e27b69a7a0e 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -554,10 +554,6 @@ EXPORT_SYMBOL(mtrr_del); * These should be called implicitly, but we can't yet until all the initcall * stuff is done... */ -extern void amd_init_mtrr(void); -extern void cyrix_init_mtrr(void); -extern void centaur_init_mtrr(void); - static void __init init_ifs(void) { #ifndef CONFIG_X86_64 diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index fb74a2c20814..2cc77eb6fea3 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -97,3 +97,7 @@ void mtrr_state_warn(void); const char *mtrr_attrib_to_str(int x); void mtrr_wrmsr(unsigned, unsigned, unsigned); +/* CPU specific mtrr init functions */ +int amd_init_mtrr(void); +int cyrix_init_mtrr(void); +int centaur_init_mtrr(void); From cc0f21bbc12dc9f05b2e7f2469128f8717b2f4d3 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 4 Feb 2008 16:48:05 +0100 Subject: [PATCH 45/78] x86: teach the static_protection function about high mappings Right now, enforcing that the high mapping of the kernel text doesn't get the NX bit is done deep in the guts of CPA, rather than in the static_protection() function that enforces all other per-arch sanity checks. This patch moves this sanity check into the central static_protection() function instead, and makes it apply ONLY to the kernel text, not to all other areas in the high mapping. 
Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/pageattr.c | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 877b5cca2cb8..bf5e33f6a322 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -106,6 +106,22 @@ static void cpa_flush_range(unsigned long start, int numpages) } } +#define HIGH_MAP_START __START_KERNEL_map +#define HIGH_MAP_END (__START_KERNEL_map + KERNEL_TEXT_SIZE) + + +/* + * Converts a virtual address to a X86-64 highmap address + */ +static unsigned long virt_to_highmap(void *address) +{ +#ifdef CONFIG_X86_64 + return __pa((unsigned long)address) + HIGH_MAP_START - phys_base; +#else + return (unsigned long)address; +#endif +} + /* * Certain areas of memory on x86 require very specific protection flags, * for example the BIOS area or kernel text. Callers don't always get this @@ -129,12 +145,24 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address) */ if (within(address, (unsigned long)_text, (unsigned long)_etext)) pgprot_val(forbidden) |= _PAGE_NX; + /* + * Do the same for the x86-64 high kernel mapping + */ + if (within(address, virt_to_highmap(_text), virt_to_highmap(_etext))) + pgprot_val(forbidden) |= _PAGE_NX; + #ifdef CONFIG_DEBUG_RODATA /* The .rodata section needs to be read-only */ if (within(address, (unsigned long)__start_rodata, (unsigned long)__end_rodata)) pgprot_val(forbidden) |= _PAGE_RW; + /* + * Do the same for the x86-64 high kernel mapping + */ + if (within(address, virt_to_highmap(__start_rodata), + virt_to_highmap(__end_rodata))) + pgprot_val(forbidden) |= _PAGE_RW; #endif prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); @@ -304,8 +332,6 @@ repeat: * Modules and drivers should use the set_memory_* APIs instead. 
*/ -#define HIGH_MAP_START __START_KERNEL_map -#define HIGH_MAP_END (__START_KERNEL_map + KERNEL_TEXT_SIZE) static int change_page_attr_addr(unsigned long address, pgprot_t mask_set, @@ -338,10 +364,11 @@ change_page_attr_addr(unsigned long address, pgprot_t mask_set, /* * Calc the high mapping address. See __phys_addr() * for the non obvious details. + * + * Note that NX and other required permissions are + * checked in static_protections(). */ address = phys_addr + HIGH_MAP_START - phys_base; - /* Make sure the kernel mappings stay executable */ - pgprot_val(mask_clr) |= _PAGE_NX; /* * Our high aliases are imprecise, because we check From 626c2c9d065da0cbd9997e112501487958fde690 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 4 Feb 2008 16:48:05 +0100 Subject: [PATCH 46/78] x86: use the pfn from the page when changing its attributes When changing the attributes of a pte, we should use the PFN from the existing PTE rather than going through hoops calculating what we think it might have been; this is both fragile and totally unneeded. It also makes it more hairy to call any of these functions on non-direct maps for no good reason whatsoever. With this change, __change_page_attr() no longer takes a pfn as argument, which simplifies all the callers.
Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/pageattr.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index bf5e33f6a322..6c55fbdbd7e8 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -277,17 +277,12 @@ out_unlock: } static int -__change_page_attr(unsigned long address, unsigned long pfn, - pgprot_t mask_set, pgprot_t mask_clr) +__change_page_attr(unsigned long address, pgprot_t mask_set, pgprot_t mask_clr) { struct page *kpte_page; int level, err = 0; pte_t *kpte; -#ifdef CONFIG_X86_32 - BUG_ON(pfn > max_low_pfn); -#endif - repeat: kpte = lookup_address(address, &level); if (!kpte) @@ -298,17 +293,25 @@ repeat: BUG_ON(PageCompound(kpte_page)); if (level == PG_LEVEL_4K) { - pgprot_t new_prot = pte_pgprot(*kpte); pte_t new_pte, old_pte = *kpte; + pgprot_t new_prot = pte_pgprot(old_pte); + + if(!pte_val(old_pte)) { + WARN_ON_ONCE(1); + return -EINVAL; + } pgprot_val(new_prot) &= ~pgprot_val(mask_clr); pgprot_val(new_prot) |= pgprot_val(mask_set); new_prot = static_protections(new_prot, address); - new_pte = pfn_pte(pfn, canon_pgprot(new_prot)); - BUG_ON(pte_pfn(new_pte) != pte_pfn(old_pte)); - + /* + * We need to keep the pfn from the existing PTE, + * after all we're only going to change it's attributes + * not the memory it points to + */ + new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot)); set_pte_atomic(kpte, new_pte); } else { err = split_large_page(kpte, address); @@ -337,11 +340,11 @@ static int change_page_attr_addr(unsigned long address, pgprot_t mask_set, pgprot_t mask_clr) { - unsigned long phys_addr = __pa(address); - unsigned long pfn = phys_addr >> PAGE_SHIFT; int err; #ifdef CONFIG_X86_64 + unsigned long phys_addr = __pa(address); + /* * If we are inside the high mapped kernel range, then we * fixup the low mapping first. 
__va() returns the virtual @@ -351,7 +354,7 @@ change_page_attr_addr(unsigned long address, pgprot_t mask_set, address = (unsigned long) __va(phys_addr); #endif - err = __change_page_attr(address, pfn, mask_set, mask_clr); + err = __change_page_attr(address, mask_set, mask_clr); if (err) return err; @@ -375,7 +378,7 @@ change_page_attr_addr(unsigned long address, pgprot_t mask_set, * everything between 0 and KERNEL_TEXT_SIZE, so do * not propagate lookup failures back to users: */ - __change_page_attr(address, pfn, mask_set, mask_clr); + __change_page_attr(address, mask_set, mask_clr); } #endif return err; From 63c1dcf4bc9a26b1d8baa9a8c7cc1b2e1e694011 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Feb 2008 16:48:05 +0100 Subject: [PATCH 47/78] x86: CPA use the existing pfn in split as well When splitting large pages, we get the pfn from the existing entry instead of calculating it ourselves. This removes the last remaining range restriction of the cpa code. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 6c55fbdbd7e8..a629cea5e465 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -221,8 +221,7 @@ static int split_large_page(pte_t *kpte, unsigned long address) { pgprot_t ref_prot = pte_pgprot(pte_clrhuge(*kpte)); gfp_t gfp_flags = GFP_KERNEL; - unsigned long flags; - unsigned long addr; + unsigned long flags, addr, pfn; pte_t *pbase, *tmp; struct page *base; unsigned int i, level; @@ -253,8 +252,12 @@ static int split_large_page(pte_t *kpte, unsigned long address) paravirt_alloc_pt(&init_mm, page_to_pfn(base)); #endif - for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) - set_pte(&pbase[i], pfn_pte(addr >> PAGE_SHIFT, ref_prot)); + /* + * Get the target pfn from the original entry: + */ + pfn = pte_pfn(*kpte); + for (i = 0; i < PTRS_PER_PTE; i++, pfn++) +
set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); /* * Install the new, split up pagetable. Important detail here: From e66aadbe6cb90813b3bbf07e3bc2a6aedcef7cd1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Feb 2008 16:48:05 +0100 Subject: [PATCH 48/78] x86: simplify __ioremap Remove tons of castings which make the code hard to read. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 1a88d1572a77..2c3fa7189503 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -114,9 +114,8 @@ static int ioremap_change_attr(unsigned long paddr, unsigned long size, static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size, enum ioremap_mode mode) { - void __iomem *addr; + unsigned long pfn, offset, last_addr, vaddr; struct vm_struct *area; - unsigned long pfn, offset, last_addr; pgprot_t prot; /* Don't allow wraparound or zero size */ @@ -164,19 +163,18 @@ static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size, if (!area) return NULL; area->phys_addr = phys_addr; - addr = (void __iomem *) area->addr; - if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size, - phys_addr, prot)) { - remove_vm_area((void *)(PAGE_MASK & (unsigned long) addr)); + vaddr = (unsigned long) area->addr; + if (ioremap_page_range(vaddr, vaddr + size, phys_addr, prot)) { + remove_vm_area((void *)(vaddr & PAGE_MASK)); return NULL; } if (ioremap_change_attr(phys_addr, size, mode) < 0) { - vunmap(addr); + vunmap(area->addr); return NULL; } - return (void __iomem *) (offset + (char __iomem *)addr); + return (void __iomem *) (vaddr + offset); } /** From 75ab43bfce51085ffd627c470f48ae49ba6e6da3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Feb 2008 16:48:05 +0100 Subject: [PATCH 49/78] x86: ioremap remove the range check of cpa Now that cpa works on non-direct 
mappings as well, we can safely remove the range check in ioremap_change_attr(). Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 2c3fa7189503..4e21231a5ce2 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -70,25 +70,12 @@ int page_is_ram(unsigned long pagenr) * Fix up the linear direct mapping of the kernel to avoid cache attribute * conflicts. */ -static int ioremap_change_attr(unsigned long paddr, unsigned long size, +static int ioremap_change_attr(unsigned long vaddr, unsigned long size, enum ioremap_mode mode) { - unsigned long vaddr = (unsigned long)__va(paddr); unsigned long nrpages = size >> PAGE_SHIFT; - unsigned int level; int err; - /* No change for pages after the last mapping */ - if ((paddr + size - 1) >= (max_pfn_mapped << PAGE_SHIFT)) - return 0; - - /* - * If there is no identity map for this address, - * change_page_attr_addr is unnecessary - */ - if (!lookup_address(vaddr, &level)) - return 0; - switch (mode) { case IOR_MODE_UNCACHED: default: @@ -169,7 +156,7 @@ static void __iomem *__ioremap(unsigned long phys_addr, unsigned long size, return NULL; } - if (ioremap_change_attr(phys_addr, size, mode) < 0) { + if (ioremap_change_attr(vaddr, size, mode) < 0) { vunmap(area->addr); return NULL; } From f56d005d30342a45d8af2b75ecccc82200f09600 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Feb 2008 16:48:05 +0100 Subject: [PATCH 50/78] x86: no CPA on iounmap When an ioremap is unmapped, do not change the page attributes. There might be another mapping of the same physical address. PAT might detect a conflicting mapping attribute for no good reason. The mapping is removed anyway. 
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 4e21231a5ce2..ee6648fe6b15 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -240,9 +240,6 @@ void iounmap(volatile void __iomem *addr) return; } - /* Reset the direct mapping. Can block */ - ioremap_change_attr(p->phys_addr, p->size, IOR_MODE_CACHED); - /* Finally remove it */ o = remove_vm_area((void *)addr); BUG_ON(p != o || o == NULL); From 1c083eb2cbdd917149f6acaa55efca129d05c2a9 Mon Sep 17 00:00:00 2001 From: "Huang, Ying" Date: Mon, 4 Feb 2008 16:48:06 +0100 Subject: [PATCH 51/78] x86: fix EFI mapping The patch updates EFI runtime memory mapping code, by making EFI areas explicitly executable. Signed-off-by: Huang Ying Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/efi.c | 57 +++++++++++++++++++++------------------- arch/x86/kernel/efi_64.c | 22 ++++++++-------- include/asm-x86/efi.h | 4 +-- 3 files changed, 43 insertions(+), 40 deletions(-) diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index 1411324a625c..32dd62b36ff7 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -379,11 +379,9 @@ void __init efi_init(void) #endif } -#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) static void __init runtime_code_page_mkexec(void) { efi_memory_desc_t *md; - unsigned long end; void *p; if (!(__supported_pte_mask & _PAGE_NX)) @@ -392,18 +390,13 @@ static void __init runtime_code_page_mkexec(void) /* Make EFI runtime service code area executable */ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { md = p; - end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT); - if (md->type == EFI_RUNTIME_SERVICES_CODE && - (end >> PAGE_SHIFT) <= max_pfn_mapped) { - set_memory_x(md->virt_addr, md->num_pages); - set_memory_uc(md->virt_addr, md->num_pages); - } + + if (md->type != EFI_RUNTIME_SERVICES_CODE) 
+ continue; + + set_memory_x(md->virt_addr, md->num_pages << EFI_PAGE_SHIFT); } - __flush_tlb_all(); } -#else -static inline void __init runtime_code_page_mkexec(void) { } -#endif /* * This function will switch the EFI runtime services to virtual mode. @@ -417,30 +410,40 @@ void __init efi_enter_virtual_mode(void) { efi_memory_desc_t *md; efi_status_t status; - unsigned long end; - void *p; + unsigned long size; + u64 end, systab; + void *p, *va; efi.systab = NULL; for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { md = p; if (!(md->attribute & EFI_MEMORY_RUNTIME)) continue; - end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT); - if ((md->attribute & EFI_MEMORY_WB) && - ((end >> PAGE_SHIFT) <= max_pfn_mapped)) - md->virt_addr = (unsigned long)__va(md->phys_addr); + + size = md->num_pages << EFI_PAGE_SHIFT; + end = md->phys_addr + size; + + if ((end >> PAGE_SHIFT) <= max_pfn_mapped) + va = __va(md->phys_addr); else - md->virt_addr = (unsigned long) - efi_ioremap(md->phys_addr, - md->num_pages << EFI_PAGE_SHIFT); - if (!md->virt_addr) + va = efi_ioremap(md->phys_addr, size); + + if (md->attribute & EFI_MEMORY_WB) + set_memory_uc(md->virt_addr, size); + + md->virt_addr = (u64) (unsigned long) va; + + if (!va) { printk(KERN_ERR PFX "ioremap of 0x%llX failed!\n", (unsigned long long)md->phys_addr); - if ((md->phys_addr <= (unsigned long)efi_phys.systab) && - ((unsigned long)efi_phys.systab < end)) - efi.systab = (efi_system_table_t *)(unsigned long) - (md->virt_addr - md->phys_addr + - (unsigned long)efi_phys.systab); + continue; + } + + systab = (u64) (unsigned long) efi_phys.systab; + if (md->phys_addr <= systab && systab < end) { + systab += md->virt_addr - md->phys_addr; + efi.systab = (efi_system_table_t *) (unsigned long) systab; + } } BUG_ON(!efi.systab); diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/kernel/efi_64.c index 674f2379480f..09d5c2330934 100644 --- a/arch/x86/kernel/efi_64.c +++ b/arch/x86/kernel/efi_64.c @@ -54,10 +54,10 @@ 
static void __init early_mapping_set_exec(unsigned long start, else set_pte(kpte, __pte((pte_val(*kpte) | _PAGE_NX) & \ __supported_pte_mask)); - if (level == 4) - start = (start + PMD_SIZE) & PMD_MASK; - else + if (level == PG_LEVEL_4K) start = (start + PAGE_SIZE) & PAGE_MASK; + else + start = (start + PMD_SIZE) & PMD_MASK; } } @@ -109,23 +109,23 @@ void __init efi_reserve_bootmem(void) memmap.nr_map * memmap.desc_size); } -void __iomem * __init efi_ioremap(unsigned long offset, - unsigned long size) +void __iomem * __init efi_ioremap(unsigned long phys_addr, unsigned long size) { static unsigned pages_mapped; - unsigned long last_addr; unsigned i, pages; - last_addr = offset + size - 1; - offset &= PAGE_MASK; - pages = (PAGE_ALIGN(last_addr) - offset) >> PAGE_SHIFT; + /* phys_addr and size must be page aligned */ + if ((phys_addr & ~PAGE_MASK) || (size & ~PAGE_MASK)) + return NULL; + + pages = size >> PAGE_SHIFT; if (pages_mapped + pages > MAX_EFI_IO_PAGES) return NULL; for (i = 0; i < pages; i++) { __set_fixmap(FIX_EFI_IO_MAP_FIRST_PAGE - pages_mapped, - offset, PAGE_KERNEL_EXEC_NOCACHE); - offset += PAGE_SIZE; + phys_addr, PAGE_KERNEL); + phys_addr += PAGE_SIZE; pages_mapped++; } diff --git a/include/asm-x86/efi.h b/include/asm-x86/efi.h index 9c68a1f098d8..ea9734b74aca 100644 --- a/include/asm-x86/efi.h +++ b/include/asm-x86/efi.h @@ -33,7 +33,7 @@ extern unsigned long asmlinkage efi_call_phys(void *, ...); #define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \ efi_call_virt(f, a1, a2, a3, a4, a5, a6) -#define efi_ioremap(addr, size) ioremap(addr, size) +#define efi_ioremap(addr, size) ioremap_cache(addr, size) #else /* !CONFIG_X86_32 */ @@ -86,7 +86,7 @@ extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3, efi_call6((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \ (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6)) -extern void *efi_ioremap(unsigned long offset, unsigned long size); +extern void *efi_ioremap(unsigned long addr, unsigned long size); 
#endif /* CONFIG_X86_32 */ From 331e406588dc90331753e6562e5e3757bb907eb8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Feb 2008 16:48:06 +0100 Subject: [PATCH 52/78] x86: CPA return early when requested feature is not available Mask out the unsupported bits (e.g. NX). If the clr/set masks are empty after masking, return without changing anything. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index a629cea5e465..f60b93dc2e57 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -405,8 +405,18 @@ static int __change_page_attr_set_clr(unsigned long addr, int numpages, static int change_page_attr_set_clr(unsigned long addr, int numpages, pgprot_t mask_set, pgprot_t mask_clr) { - int ret = __change_page_attr_set_clr(addr, numpages, mask_set, - mask_clr); + int ret; + + /* + * Check, if we are requested to change a not supported + * feature: + */ + mask_set = canon_pgprot(mask_set); + mask_clr = canon_pgprot(mask_clr); + if (!pgprot_val(mask_set) && !pgprot_val(mask_clr)) + return 0; + + ret = __change_page_attr_set_clr(addr, numpages, mask_set, mask_clr); /* * On success we use clflush, when the CPU supports it to From 9bf5a47572fe4ea4e5ed2691e4313ea0bb68a74e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 4 Feb 2008 16:48:06 +0100 Subject: [PATCH 53/78] x86: cpa, add the PAT bit defines Signed-off-by: Andi Kleen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/asm-x86/pgtable.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h index cd2524f07452..44c0a4f1b1eb 100644 --- a/include/asm-x86/pgtable.h +++ b/include/asm-x86/pgtable.h @@ -13,10 +13,12 @@ #define _PAGE_BIT_DIRTY 6 #define _PAGE_BIT_FILE 6 #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ +#define _PAGE_BIT_PAT 7 /* on 4KB pages */
#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ #define _PAGE_BIT_UNUSED1 9 /* available for programmer */ #define _PAGE_BIT_UNUSED2 10 #define _PAGE_BIT_UNUSED3 11 +#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ /* @@ -36,6 +38,8 @@ #define _PAGE_UNUSED1 (_AC(1, L)<<_PAGE_BIT_UNUSED1) #define _PAGE_UNUSED2 (_AC(1, L)<<_PAGE_BIT_UNUSED2) #define _PAGE_UNUSED3 (_AC(1, L)<<_PAGE_BIT_UNUSED3) +#define _PAGE_PAT (_AC(1, L)<<_PAGE_BIT_PAT) +#define _PAGE_PAT_LARGE (_AC(1, L)<<_PAGE_BIT_PAT_LARGE) #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) #define _PAGE_NX (_AC(1, ULL) << _PAGE_BIT_NX) From 6bb8383bebc02dae08a17f561401f58005f75c03 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 4 Feb 2008 16:48:06 +0100 Subject: [PATCH 54/78] x86: cpa, only flush the cache if the caching attributes have changed We only need to flush the caches in cpa() if the caching attributes have changed. Otherwise only flush the TLBs. This checks the PAT bits too although they are currently not used by the kernel. Signed-off-by: Andi Kleen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/pageattr.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index f60b93dc2e57..456ad0ab9c7e 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -52,21 +52,23 @@ void clflush_cache_range(void *vaddr, unsigned int size) static void __cpa_flush_all(void *arg) { + unsigned long cache = (unsigned long)arg; + /* * Flush all to work around Errata in early athlons regarding * large page flushing.
*/ __flush_tlb_all(); - if (boot_cpu_data.x86_model >= 4) + if (cache && boot_cpu_data.x86_model >= 4) wbinvd(); } -static void cpa_flush_all(void) +static void cpa_flush_all(unsigned long cache) { BUG_ON(irqs_disabled()); - on_each_cpu(__cpa_flush_all, NULL, 1, 1); + on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1); } static void __cpa_flush_range(void *arg) @@ -79,7 +81,7 @@ static void __cpa_flush_range(void *arg) __flush_tlb_all(); } -static void cpa_flush_range(unsigned long start, int numpages) +static void cpa_flush_range(unsigned long start, int numpages, int cache) { unsigned int i, level; unsigned long addr; @@ -89,6 +91,9 @@ static void cpa_flush_range(unsigned long start, int numpages) on_each_cpu(__cpa_flush_range, NULL, 1, 1); + if (!cache) + return; + /* * We only need to flush on one CPU, * clflush is a MESI-coherent instruction that @@ -402,10 +407,16 @@ static int __change_page_attr_set_clr(unsigned long addr, int numpages, return 0; } +static inline int cache_attr(pgprot_t attr) +{ + return pgprot_val(attr) & + (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD); +} + static int change_page_attr_set_clr(unsigned long addr, int numpages, pgprot_t mask_set, pgprot_t mask_clr) { - int ret; + int ret, cache; /* * Check, if we are requested to change a not supported @@ -418,6 +429,12 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, ret = __change_page_attr_set_clr(addr, numpages, mask_set, mask_clr); + /* + * No need to flush, when we did not set any of the caching + * attributes: + */ + cache = cache_attr(mask_set); + /* * On success we use clflush, when the CPU supports it to * avoid the wbindv. 
If the CPU does not support it and in the @@ -425,9 +442,9 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, * wbindv): */ if (!ret && cpu_has_clflush) - cpa_flush_range(addr, numpages); + cpa_flush_range(addr, numpages, cache); else - cpa_flush_all(); + cpa_flush_all(cache); return ret; } From 72e458dfa63b3db7a46f66b0eb19e9ff4e17fc0e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Feb 2008 16:48:07 +0100 Subject: [PATCH 55/78] x86: introduce struct cpa_data The number of arguments which need to be transported is increasing and we want to add flush optimizations and large page preserving. Create struct cpa data and pass a pointer instead of increasing the number of arguments further. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 75 +++++++++++++++++++++--------------------- 1 file changed, 38 insertions(+), 37 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 456ad0ab9c7e..d1c08308ecbb 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -16,6 +16,13 @@ #include #include +struct cpa_data { + unsigned long vaddr; + int numpages; + pgprot_t mask_set; + pgprot_t mask_clr; +}; + static inline int within(unsigned long addr, unsigned long start, unsigned long end) { @@ -284,8 +291,7 @@ out_unlock: return 0; } -static int -__change_page_attr(unsigned long address, pgprot_t mask_set, pgprot_t mask_clr) +static int __change_page_attr(unsigned long address, struct cpa_data *cpa) { struct page *kpte_page; int level, err = 0; @@ -305,12 +311,15 @@ repeat: pgprot_t new_prot = pte_pgprot(old_pte); if(!pte_val(old_pte)) { - WARN_ON_ONCE(1); + printk(KERN_WARNING "CPA: called for zero pte. 
" + "vaddr = %lx cpa->vaddr = %lx\n", address, + cpa->vaddr); + WARN_ON(1); return -EINVAL; } - pgprot_val(new_prot) &= ~pgprot_val(mask_clr); - pgprot_val(new_prot) |= pgprot_val(mask_set); + pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); + pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); new_prot = static_protections(new_prot, address); @@ -343,12 +352,10 @@ repeat: * Modules and drivers should use the set_memory_* APIs instead. */ - -static int -change_page_attr_addr(unsigned long address, pgprot_t mask_set, - pgprot_t mask_clr) +static int change_page_attr_addr(struct cpa_data *cpa) { int err; + unsigned long address = cpa->vaddr; #ifdef CONFIG_X86_64 unsigned long phys_addr = __pa(address); @@ -362,7 +369,7 @@ change_page_attr_addr(unsigned long address, pgprot_t mask_set, address = (unsigned long) __va(phys_addr); #endif - err = __change_page_attr(address, mask_set, mask_clr); + err = __change_page_attr(address, cpa); if (err) return err; @@ -386,20 +393,19 @@ change_page_attr_addr(unsigned long address, pgprot_t mask_set, * everything between 0 and KERNEL_TEXT_SIZE, so do * not propagate lookup failures back to users: */ - __change_page_attr(address, mask_set, mask_clr); + __change_page_attr(address, cpa); } #endif return err; } -static int __change_page_attr_set_clr(unsigned long addr, int numpages, - pgprot_t mask_set, pgprot_t mask_clr) +static int __change_page_attr_set_clr(struct cpa_data *cpa) { unsigned int i; int ret; - for (i = 0; i < numpages ; i++, addr += PAGE_SIZE) { - ret = change_page_attr_addr(addr, mask_set, mask_clr); + for (i = 0; i < cpa->numpages ; i++, cpa->vaddr += PAGE_SIZE) { + ret = change_page_attr_addr(cpa); if (ret) return ret; } @@ -416,6 +422,7 @@ static inline int cache_attr(pgprot_t attr) static int change_page_attr_set_clr(unsigned long addr, int numpages, pgprot_t mask_set, pgprot_t mask_clr) { + struct cpa_data cpa; int ret, cache; /* @@ -427,7 +434,12 @@ static int change_page_attr_set_clr(unsigned long addr, int 
numpages, if (!pgprot_val(mask_set) && !pgprot_val(mask_clr)) return 0; - ret = __change_page_attr_set_clr(addr, numpages, mask_set, mask_clr); + cpa.vaddr = addr; + cpa.numpages = numpages; + cpa.mask_set = mask_set; + cpa.mask_clr = mask_clr; + + ret = __change_page_attr_set_clr(&cpa); /* * No need to flush, when we did not set any of the caching @@ -548,37 +560,26 @@ int set_pages_rw(struct page *page, int numpages) return set_memory_rw(addr, numpages); } - -#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_CPA_DEBUG) -static inline int __change_page_attr_set(unsigned long addr, int numpages, - pgprot_t mask) -{ - return __change_page_attr_set_clr(addr, numpages, mask, __pgprot(0)); -} - -static inline int __change_page_attr_clear(unsigned long addr, int numpages, - pgprot_t mask) -{ - return __change_page_attr_set_clr(addr, numpages, __pgprot(0), mask); -} -#endif - #ifdef CONFIG_DEBUG_PAGEALLOC static int __set_pages_p(struct page *page, int numpages) { - unsigned long addr = (unsigned long)page_address(page); + struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), + .numpages = numpages, + .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), + .mask_clr = __pgprot(0)}; - return __change_page_attr_set(addr, numpages, - __pgprot(_PAGE_PRESENT | _PAGE_RW)); + return __change_page_attr_set_clr(&cpa); } static int __set_pages_np(struct page *page, int numpages) { - unsigned long addr = (unsigned long)page_address(page); + struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), + .numpages = numpages, + .mask_set = __pgprot(0), + .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)}; - return __change_page_attr_clear(addr, numpages, - __pgprot(_PAGE_PRESENT)); + return __change_page_attr_set_clr(&cpa); } void kernel_map_pages(struct page *page, int numpages, int enable) From f4ae5da0e8e92caa168e7c2a7c4a6c4064b082c2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Feb 2008 16:48:07 +0100 Subject: [PATCH 56/78] x86: cpa, check if we 
changed anything and tlb flushing is necessary Flush tlbs only when there was a real change. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index d1c08308ecbb..79a9f1b42ddd 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -21,6 +21,7 @@ struct cpa_data { int numpages; pgprot_t mask_set; pgprot_t mask_clr; + int flushtlb; }; static inline int @@ -329,11 +330,19 @@ repeat: * not the memory it points to */ new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot)); - set_pte_atomic(kpte, new_pte); + + /* + * Do we really change anything ? + */ + if (pte_val(old_pte) != pte_val(new_pte)) { + set_pte_atomic(kpte, new_pte); + cpa->flushtlb = 1; + } } else { err = split_large_page(kpte, address); if (!err) goto repeat; + cpa->flushtlb = 1; } return err; } @@ -438,9 +447,16 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, cpa.numpages = numpages; cpa.mask_set = mask_set; cpa.mask_clr = mask_clr; + cpa.flushtlb = 0; ret = __change_page_attr_set_clr(&cpa); + /* + * Check whether we really changed something: + */ + if (!cpa.flushtlb) + return ret; + /* * No need to flush, when we did not set any of the caching * attributes: From 65e074dffa198978ab0c9976a19b954fbe1183e2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Feb 2008 16:48:07 +0100 Subject: [PATCH 57/78] x86: cpa, preserve large pages if possible When CPA is called on a range which fits into a large page mapping, avoid splitting the page when: 1) There is no change of attributes 2) The range to change is a complete large mapping Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 142 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 130 insertions(+), 12 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index
79a9f1b42ddd..40b7ac58e671 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -18,12 +18,17 @@ struct cpa_data { unsigned long vaddr; - int numpages; pgprot_t mask_set; pgprot_t mask_clr; + int numpages; int flushtlb; }; +enum { + CPA_NO_SPLIT = 0, + CPA_SPLIT, +}; + static inline int within(unsigned long addr, unsigned long start, unsigned long end) { @@ -230,6 +235,86 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) #endif } +static int try_preserve_large_page(pte_t *kpte, unsigned long address, + struct cpa_data *cpa) +{ + unsigned long nextpage_addr, numpages, pmask, psize, flags; + pte_t new_pte, old_pte, *tmp; + pgprot_t old_prot, new_prot; + int level, res = CPA_SPLIT; + + spin_lock_irqsave(&pgd_lock, flags); + /* + * Check for races, another CPU might have split this page + * up already: + */ + tmp = lookup_address(address, &level); + if (tmp != kpte) + goto out_unlock; + + switch (level) { + case PG_LEVEL_2M: + psize = LARGE_PAGE_SIZE; + pmask = LARGE_PAGE_MASK; + break; + case PG_LEVEL_1G: + default: + res = -EINVAL; + goto out_unlock; + } + + /* + * Calculate the number of pages, which fit into this large + * page starting at address: + */ + nextpage_addr = (address + psize) & pmask; + numpages = (nextpage_addr - address) >> PAGE_SHIFT; + if (numpages < cpa->numpages) + cpa->numpages = numpages; + + /* + * We are safe now. Check whether the new pgprot is the same: + */ + old_pte = *kpte; + old_prot = new_prot = pte_pgprot(old_pte); + + pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); + pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); + new_prot = static_protections(new_prot, address); + + /* + * If there are no changes, return. maxpages has been updated + * above: + */ + if (pgprot_val(new_prot) == pgprot_val(old_prot)) { + res = CPA_NO_SPLIT; + goto out_unlock; + } + + /* + * We need to change the attributes. Check, whether we can + * change the large page in one go. 
We request a split, when + * the address is not aligned and the number of pages is + * smaller than the number of pages in the large page. Note + * that we limited the number of possible pages already to + * the number of pages in the large page. + */ + if (address == (nextpage_addr - psize) && cpa->numpages == numpages) { + /* + * The address is aligned and the number of pages + * covers the full page. + */ + new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot)); + __set_pmd_pte(kpte, address, new_pte); + cpa->flushtlb = 1; + res = CPA_NO_SPLIT; + } + +out_unlock: + spin_unlock_irqrestore(&pgd_lock, flags); + return res; +} + static int split_large_page(pte_t *kpte, unsigned long address) { pgprot_t ref_prot = pte_pgprot(pte_clrhuge(*kpte)); @@ -295,7 +380,7 @@ out_unlock: static int __change_page_attr(unsigned long address, struct cpa_data *cpa) { struct page *kpte_page; - int level, err = 0; + int level, res; pte_t *kpte; repeat: @@ -338,13 +423,34 @@ repeat: set_pte_atomic(kpte, new_pte); cpa->flushtlb = 1; } - } else { - err = split_large_page(kpte, address); - if (!err) - goto repeat; - cpa->flushtlb = 1; + cpa->numpages = 1; + return 0; } - return err; + + /* + * Check, whether we can keep the large page intact + * and just change the pte: + */ + res = try_preserve_large_page(kpte, address, cpa); + if (res < 0) + return res; + + /* + * When the range fits into the existing large page, + * return. 
cp->numpages and cpa->tlbflush have been updated in + * try_large_page: + */ + if (res == CPA_NO_SPLIT) + return 0; + + /* + * We have to split the large page: + */ + res = split_large_page(kpte, address); + if (res) + return res; + cpa->flushtlb = 1; + goto repeat; } /** @@ -410,15 +516,27 @@ static int change_page_attr_addr(struct cpa_data *cpa) static int __change_page_attr_set_clr(struct cpa_data *cpa) { - unsigned int i; - int ret; + int ret, numpages = cpa->numpages; - for (i = 0; i < cpa->numpages ; i++, cpa->vaddr += PAGE_SIZE) { + while (numpages) { + /* + * Store the remaining nr of pages for the large page + * preservation check. + */ + cpa->numpages = numpages; ret = change_page_attr_addr(cpa); if (ret) return ret; - } + /* + * Adjust the number of pages with the result of the + * CPA operation. Either a large page has been + * preserved or a single page update happened. + */ + BUG_ON(cpa->numpages > numpages); + numpages -= cpa->numpages; + cpa->vaddr += cpa->numpages * PAGE_SIZE; + } return 0; } From 34508f66b69ff1708192654f631eb8f1d4c52005 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 4 Feb 2008 16:48:07 +0100 Subject: [PATCH 58/78] x86: AMD Athlon X2 hard hang fix An Athlon 64 X2 test system showed hard hangs shortly after marking the kernel text read-only, if we tried to preserve largepages and changed the PSE entry from RW to RO. The pagetable code itself is correct, it's the CPU that locked up hard (and not even the NMI watchdog could punch through that hard hang). So be conservative and always do splitups - like we did in the past. 
Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 40b7ac58e671..3810f7a83b1d 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -243,6 +243,17 @@ static int try_preserve_large_page(pte_t *kpte, unsigned long address, pgprot_t old_prot, new_prot; int level, res = CPA_SPLIT; + /* + * An Athlon 64 X2 showed hard hangs if we tried to preserve + * largepages and changed the PSE entry from RW to RO. + * + * As AMD CPUs have a long series of erratas in this area, + * (and none of the known ones seem to explain this hang), + * disable this code until the hang can be debugged: + */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return res; + spin_lock_irqsave(&pgd_lock, flags); /* * Check for races, another CPU might have split this page From 9a14aefc1d28c6037122965ee8c10d92a970ade0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Feb 2008 16:48:07 +0100 Subject: [PATCH 59/78] x86: cpa, fix lookup_address lookup_address() returns a wrong level and a wrong pointer to a non existing pte, when pmd or pud entries are marked !present. This happens for example due to boot time mapping of GART into the low memory space. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 3810f7a83b1d..7d21cd658ed3 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -188,6 +188,14 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address) return prot; } +/* + * Lookup the page table entry for a virtual address. Return a pointer + * to the entry and the level of the mapping. + * + * Note: We return pud and pmd either when the entry is marked large + * or when the present bit is not set. 
Otherwise we would return a + * pointer to a nonexisting mapping. + */ pte_t *lookup_address(unsigned long address, int *level) { pgd_t *pgd = pgd_offset_k(address); @@ -206,7 +214,7 @@ pte_t *lookup_address(unsigned long address, int *level) return NULL; *level = PG_LEVEL_2M; - if (pmd_large(*pmd)) + if (pmd_large(*pmd) || !pmd_present(*pmd)) return (pte_t *)pmd; *level = PG_LEVEL_4K; From 31422c51e0dc72532d82e80895932d430c3ed307 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 4 Feb 2008 16:48:08 +0100 Subject: [PATCH 60/78] x86: rename LARGE_PAGE_SIZE to PMD_PAGE_SIZE Fix up all users. Signed-off-by: Andi Kleen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/boot/compressed/head_64.S | 8 ++++---- arch/x86/kernel/head_64.S | 4 ++-- arch/x86/kernel/pci-gart_64.c | 2 +- arch/x86/mm/init_64.c | 6 +++--- arch/x86/mm/pageattr.c | 6 +++--- include/asm-x86/page.h | 4 ++-- 6 files changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 1ccb38a7f0d2..e8657b98c902 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -80,8 +80,8 @@ startup_32: #ifdef CONFIG_RELOCATABLE movl %ebp, %ebx - addl $(LARGE_PAGE_SIZE -1), %ebx - andl $LARGE_PAGE_MASK, %ebx + addl $(PMD_PAGE_SIZE -1), %ebx + andl $PMD_PAGE_MASK, %ebx #else movl $CONFIG_PHYSICAL_START, %ebx #endif @@ -220,8 +220,8 @@ ENTRY(startup_64) /* Start with the delta to where the kernel will run at. 
*/ #ifdef CONFIG_RELOCATABLE leaq startup_32(%rip) /* - $startup_32 */, %rbp - addq $(LARGE_PAGE_SIZE - 1), %rbp - andq $LARGE_PAGE_MASK, %rbp + addq $(PMD_PAGE_SIZE - 1), %rbp + andq $PMD_PAGE_MASK, %rbp movq %rbp, %rbx #else movq $CONFIG_PHYSICAL_START, %rbp diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 1d5a7a361200..4f283ad215ec 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -63,7 +63,7 @@ startup_64: /* Is the address not 2M aligned? */ movq %rbp, %rax - andl $~LARGE_PAGE_MASK, %eax + andl $~PMD_PAGE_MASK, %eax testl %eax, %eax jnz bad_address @@ -88,7 +88,7 @@ startup_64: /* Add an Identity mapping if I am above 1G */ leaq _text(%rip), %rdi - andq $LARGE_PAGE_MASK, %rdi + andq $PMD_PAGE_MASK, %rdi movq %rdi, %rax shrq $PUD_SHIFT, %rax diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 4d5cc7181982..ae1d3d8b384d 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -501,7 +501,7 @@ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) } a = aper + iommu_size; - iommu_size -= round_up(a, LARGE_PAGE_SIZE) - a; + iommu_size -= round_up(a, PMD_PAGE_SIZE) - a; if (iommu_size < 64*1024*1024) { printk(KERN_WARNING diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index eabcaed76c28..b7a7992c28b6 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -444,10 +444,10 @@ void __init clear_kernel_mapping(unsigned long address, unsigned long size) { unsigned long end = address + size; - BUG_ON(address & ~LARGE_PAGE_MASK); - BUG_ON(size & ~LARGE_PAGE_MASK); + BUG_ON(address & ~PMD_PAGE_MASK); + BUG_ON(size & ~PMD_PAGE_MASK); - for (; address < end; address += LARGE_PAGE_SIZE) { + for (; address < end; address += PMD_PAGE_SIZE) { pgd_t *pgd = pgd_offset_k(address); pud_t *pud; pmd_t *pmd; diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 7d21cd658ed3..74446ea23ffb 100644 --- a/arch/x86/mm/pageattr.c +++ 
b/arch/x86/mm/pageattr.c @@ -273,8 +273,8 @@ static int try_preserve_large_page(pte_t *kpte, unsigned long address, switch (level) { case PG_LEVEL_2M: - psize = LARGE_PAGE_SIZE; - pmask = LARGE_PAGE_MASK; + psize = PMD_PAGE_SIZE; + pmask = PMD_PAGE_MASK; break; case PG_LEVEL_1G: default: @@ -363,7 +363,7 @@ static int split_large_page(pte_t *kpte, unsigned long address) } address = __pa(address); - addr = address & LARGE_PAGE_MASK; + addr = address & PMD_PAGE_MASK; pbase = (pte_t *)page_address(base); #ifdef CONFIG_X86_32 paravirt_alloc_pt(&init_mm, page_to_pfn(base)); diff --git a/include/asm-x86/page.h b/include/asm-x86/page.h index c8b30efeed85..1cb7c51bc296 100644 --- a/include/asm-x86/page.h +++ b/include/asm-x86/page.h @@ -13,8 +13,8 @@ #define PHYSICAL_PAGE_MASK (PAGE_MASK & __PHYSICAL_MASK) #define PTE_MASK (_AT(long, PHYSICAL_PAGE_MASK)) -#define LARGE_PAGE_SIZE (_AC(1,UL) << PMD_SHIFT) -#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1)) +#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT) +#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1)) #define HPAGE_SHIFT PMD_SHIFT #define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT) From 07cf89c05f2bbafa002401ac4e09ac31678513e4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Feb 2008 16:48:08 +0100 Subject: [PATCH 61/78] x86: CPA fix pagetable split Move the readout of the large entry into the spinlock section to prevent an unlikely but possible race. Mark the pmd/pud entry present after the split. We preserved the non present bit in the new split mapping. Remove the stale gfp_flags double initialization. 
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 74446ea23ffb..72880993af89 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -336,7 +336,7 @@ out_unlock: static int split_large_page(pte_t *kpte, unsigned long address) { - pgprot_t ref_prot = pte_pgprot(pte_clrhuge(*kpte)); + pgprot_t ref_prot; gfp_t gfp_flags = GFP_KERNEL; unsigned long flags, addr, pfn; pte_t *pbase, *tmp; @@ -344,7 +344,6 @@ static int split_large_page(pte_t *kpte, unsigned long address) unsigned int i, level; #ifdef CONFIG_DEBUG_PAGEALLOC - gfp_flags = __GFP_HIGH | __GFP_NOFAIL | __GFP_NOWARN; gfp_flags = GFP_ATOMIC | __GFP_NOWARN; #endif base = alloc_pages(gfp_flags, 0); @@ -368,6 +367,7 @@ static int split_large_page(pte_t *kpte, unsigned long address) #ifdef CONFIG_X86_32 paravirt_alloc_pt(&init_mm, page_to_pfn(base)); #endif + ref_prot = pte_pgprot(pte_clrhuge(*kpte)); /* * Get the target pfn from the original entry: @@ -377,13 +377,17 @@ static int split_large_page(pte_t *kpte, unsigned long address) set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); /* - * Install the new, split up pagetable. Important detail here: + * Install the new, split up pagetable. Important details here: * * On Intel the NX bit of all levels must be cleared to make a * page executable. See section 4.13.2 of Intel 64 and IA-32 * Architectures Software Developer's Manual). + * + * Mark the entry present. The current mapping might be + * set to not present, which we preserved above. 
*/ ref_prot = pte_pgprot(pte_mkexec(pte_clrhuge(*kpte))); + pgprot_val(ref_prot) |= _PAGE_PRESENT; __set_pmd_pte(kpte, address, mk_pte(base, ref_prot)); base = NULL; From 64f351d197d9ae8ad9624998afa8ee18e696ca44 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Feb 2008 16:48:08 +0100 Subject: [PATCH 62/78] x86: cpa selftest, skip non present entries pud and pmd entries in the RAM area might be marked as non present. Do not try to modify them in the selftest. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr-test.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index 7573e786d2f2..398f3a578dde 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -137,7 +137,8 @@ static __init int exercise_pageattr(void) for (k = 0; k < len[i]; k++) { pte = lookup_address(addr[i] + k*PAGE_SIZE, &level); - if (!pte || pgprot_val(pte_pgprot(*pte)) == 0) { + if (!pte || pgprot_val(pte_pgprot(*pte)) == 0 || + !(pte_val(*pte) & _PAGE_PRESENT)) { addr[i] = 0; break; } From 28d6ee41a6ff8139e442af2dc55928bfbb475586 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 4 Feb 2008 16:48:08 +0100 Subject: [PATCH 63/78] x86: switch pci-gart over to using set_memory_np() instead of clear_kernel_mapping() pci-gart needs to unmap the IOMMU aperture to prevent cache corruptions. Switch this over to using set_memory_np() instead of clear_kernel_mapping(). Signed-off-by: Andi Kleen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/pci-gart_64.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index ae1d3d8b384d..845cbecd68e9 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -731,7 +731,8 @@ void __init gart_iommu_init(void) * the backing memory. The GART address is only used by PCI * devices. 
*/ - clear_kernel_mapping((unsigned long)__va(iommu_bus_base), iommu_size); + set_memory_np((unsigned long)__va(iommu_bus_base), + iommu_size >> PAGE_SHIFT); /* * Try to workaround a bug (thanks to BenH) From bde1965ce8c63e17cc284e1af616c85aba483f11 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 4 Feb 2008 16:48:08 +0100 Subject: [PATCH 64/78] x86: remove now unused clear_kernel_mapping Signed-off-by: Andi Kleen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/init_64.c | 43 ------------------------------------ include/asm-x86/pgtable_64.h | 1 - 2 files changed, 44 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index b7a7992c28b6..5855449ce7aa 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -434,49 +434,6 @@ void __init paging_init(void) } #endif -/* - * Unmap a kernel mapping if it exists. This is useful to avoid - * prefetches from the CPU leading to inconsistent cache lines. - * address and size must be aligned to 2MB boundaries. - * Does nothing when the mapping doesn't exist. - */ -void __init clear_kernel_mapping(unsigned long address, unsigned long size) -{ - unsigned long end = address + size; - - BUG_ON(address & ~PMD_PAGE_MASK); - BUG_ON(size & ~PMD_PAGE_MASK); - - for (; address < end; address += PMD_PAGE_SIZE) { - pgd_t *pgd = pgd_offset_k(address); - pud_t *pud; - pmd_t *pmd; - - if (pgd_none(*pgd)) - continue; - - pud = pud_offset(pgd, address); - if (pud_none(*pud)) - continue; - - pmd = pmd_offset(pud, address); - if (!pmd || pmd_none(*pmd)) - continue; - - if (!(pmd_val(*pmd) & _PAGE_PSE)) { - /* - * Could handle this, but it should not happen - * currently: - */ - printk(KERN_ERR "clear_kernel_mapping: " - "mapping has been split. 
will leak memory\n"); - pmd_ERROR(*pmd); - } - set_pmd(pmd, __pmd(0)); - } - __flush_tlb_all(); -} - /* * Memory hotplug specific functions */ diff --git a/include/asm-x86/pgtable_64.h b/include/asm-x86/pgtable_64.h index 6e615a103c2f..5c86cff3ee79 100644 --- a/include/asm-x86/pgtable_64.h +++ b/include/asm-x86/pgtable_64.h @@ -21,7 +21,6 @@ extern pgd_t init_level4_pgt[]; #define swapper_pg_dir init_level4_pgt extern void paging_init(void); -extern void clear_kernel_mapping(unsigned long addr, unsigned long size); #endif /* !__ASSEMBLY__ */ From 6ce9fc17d913ae51f8434d2826f306347820b07d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 4 Feb 2008 16:48:08 +0100 Subject: [PATCH 65/78] x86: remove cpa warning this race is legit and can happen on SMP systems. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/pageattr.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 72880993af89..0b029c97174e 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -356,10 +356,8 @@ static int split_large_page(pte_t *kpte, unsigned long address) * up for us already: */ tmp = lookup_address(address, &level); - if (tmp != kpte) { - WARN_ON_ONCE(1); + if (tmp != kpte) goto out_unlock; - } address = __pa(address); addr = address & PMD_PAGE_MASK; From 7bfb72e847c201fe32271fb13f75d060671d8890 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Feb 2008 16:48:08 +0100 Subject: [PATCH 66/78] x86: fix page-present check in cpa_flush_range pte_present() might return true for PROT_NONE mappings. Explicitly check the present bit. 
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 0b029c97174e..9be684e61dcb 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -119,7 +119,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) /* * Only flush present addresses: */ - if (pte && pte_present(*pte)) + if (pte && (pte_val(*pte) & _PAGE_PRESENT)) clflush_cache_range((void *) addr, PAGE_SIZE); } } From d4f71f7969ee2c16e2969185280c13d4f51a9172 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 4 Feb 2008 16:48:09 +0100 Subject: [PATCH 67/78] x86: switch direct mapping setup over to set_pte Use set_pte() for setting up the 2MB pages in the direct mapping. Signed-off-by: Andi Kleen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/init_64.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 5855449ce7aa..3a98d6f724ab 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -273,7 +273,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) int i = pmd_index(address); for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) { - unsigned long entry; pmd_t *pmd = pmd_page + pmd_index(address); if (address >= end) { @@ -287,9 +286,8 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end) if (pmd_val(*pmd)) continue; - entry = __PAGE_KERNEL_LARGE|_PAGE_GLOBAL|address; - entry &= __supported_pte_mask; - set_pmd(pmd, __pmd(entry)); + set_pte((pte_t *)pmd, + pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); } } From 019c3e7c5e93475002edfc0da6c59508247553b1 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 4 Feb 2008 16:48:09 +0100 Subject: [PATCH 68/78] x86: add feature macros for the gbpages cpuid bit Signed-off-by: Andi Kleen Signed-off-by: Ingo Molnar Signed-off-by: Thomas 
Gleixner --- include/asm-x86/cpufeature.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/asm-x86/cpufeature.h b/include/asm-x86/cpufeature.h index 3adc9cf0f391..065e92966c7c 100644 --- a/include/asm-x86/cpufeature.h +++ b/include/asm-x86/cpufeature.h @@ -46,6 +46,7 @@ #define X86_FEATURE_MP (1*32+19) /* MP Capable. */ #define X86_FEATURE_NX (1*32+20) /* Execute Disable */ #define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */ +#define X86_FEATURE_GBPAGES (1*32+26) /* GB pages */ #define X86_FEATURE_RDTSCP (1*32+27) /* RDTSCP */ #define X86_FEATURE_LM (1*32+29) /* Long Mode (x86-64) */ #define X86_FEATURE_3DNOWEXT (1*32+30) /* AMD 3DNow! extensions */ @@ -179,6 +180,7 @@ extern const char * const x86_power_flags[32]; #define cpu_has_pebs boot_cpu_has(X86_FEATURE_PEBS) #define cpu_has_clflush boot_cpu_has(X86_FEATURE_CLFLSH) #define cpu_has_bts boot_cpu_has(X86_FEATURE_BTS) +#define cpu_has_gbpages boot_cpu_has(X86_FEATURE_GBPAGES) #if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64) # define cpu_has_invlpg 1 From fbff3c21aff29ffdfa46b50946696689d3e70a48 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 4 Feb 2008 16:48:09 +0100 Subject: [PATCH 69/78] x86: add PUD_PAGE_SIZE a PUD entry covers 1GB of virtual memory. 
Signed-off-by: Andi Kleen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/asm-x86/page_64.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/asm-x86/page_64.h b/include/asm-x86/page_64.h index c1ac42d8707f..dcf0c0746075 100644 --- a/include/asm-x86/page_64.h +++ b/include/asm-x86/page_64.h @@ -23,6 +23,9 @@ #define MCE_STACK 5 #define N_EXCEPTION_STACKS 5 /* hw limit: 7 */ +#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) +#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) + #define __PAGE_OFFSET _AC(0xffff810000000000, UL) #define __PHYSICAL_START CONFIG_PHYSICAL_START From 61e19a347ad4bcdda615ef77ef9c3e656e254f3d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 4 Feb 2008 16:48:09 +0100 Subject: [PATCH 70/78] x86: add pgtable accessor functions for gbpages Signed-off-by: Andi Kleen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- include/asm-x86/pgtable_32.h | 2 ++ include/asm-x86/pgtable_64.h | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/include/asm-x86/pgtable_32.h b/include/asm-x86/pgtable_32.h index 21e70fbf1dae..935630d17304 100644 --- a/include/asm-x86/pgtable_32.h +++ b/include/asm-x86/pgtable_32.h @@ -148,6 +148,8 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) */ #define pgd_offset_k(address) pgd_offset(&init_mm, address) +static inline int pud_large(pud_t pud) { return 0; } + /* * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] * diff --git a/include/asm-x86/pgtable_64.h b/include/asm-x86/pgtable_64.h index 5c86cff3ee79..bd4740a60f29 100644 --- a/include/asm-x86/pgtable_64.h +++ b/include/asm-x86/pgtable_64.h @@ -198,6 +198,12 @@ static inline unsigned long pmd_bad(pmd_t pmd) #define pud_offset(pgd, address) ((pud_t *) pgd_page_vaddr(*(pgd)) + pud_index(address)) #define pud_present(pud) (pud_val(pud) & _PAGE_PRESENT) +static inline int pud_large(pud_t pte) +{ + return (pud_val(pte) & (_PAGE_PSE|_PAGE_PRESENT)) == + (_PAGE_PSE|_PAGE_PRESENT); +} + /* PMD 
- Level 2 access */ #define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val(pmd) & PTE_MASK)) #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) From c2f71ee2140b2a506735ff9fcb7e3b1dfaab8f2b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 4 Feb 2008 16:48:09 +0100 Subject: [PATCH 71/78] x86: add gbpages support to lookup_address [ tglx@linutronix.de: fix bootup crash on sparse mappings. ] Signed-off-by: Andi Kleen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/pageattr.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 9be684e61dcb..143fbafc948a 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -209,6 +209,11 @@ pte_t *lookup_address(unsigned long address, int *level) pud = pud_offset(pgd, address); if (pud_none(*pud)) return NULL; + + *level = PG_LEVEL_1G; + if (pud_large(*pud) || !pud_present(*pud)) + return (pte_t *)pud; + pmd = pmd_offset(pud, address); if (pmd_none(*pmd)) return NULL; From b5360222273cb3e57a119c18eef42f59da4da87b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 4 Feb 2008 16:48:09 +0100 Subject: [PATCH 72/78] x86: support gbpages in pagetable dump Signed-off-by: Andi Kleen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/fault.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 3fff490254a9..ad8b9733d6b3 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -240,7 +240,8 @@ void dump_pagetable(unsigned long address) pud = pud_offset(pgd, address); if (bad_address(pud)) goto bad; printk("PUD %lx ", pud_val(*pud)); - if (!pud_present(*pud)) goto ret; + if (!pud_present(*pud) || pud_large(*pud)) + goto ret; pmd = pmd_offset(pud, address); if (bad_address(pmd)) goto bad; From f07333fd149eb6826da26a89c3aff90324f270b0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 4 Feb 2008 16:48:09 +0100 Subject: [PATCH 73/78] x86: 
implement gbpages support in change_page_attr() A gbpage is mapped by a PUD entry, so try_preserve_large_page() sizes PG_LEVEL_1G mappings with PUD_PAGE_SIZE/PUD_PAGE_MASK; split_large_page() splits such an entry into PTRS_PER_PTE PMD-sized (PSE) entries. Signed-off-by: Andi Kleen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/pageattr.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 143fbafc948a..42ca3d8effad 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -281,7 +281,12 @@ static int try_preserve_large_page(pte_t *kpte, unsigned long address, psize = PMD_PAGE_SIZE; pmask = PMD_PAGE_MASK; break; +#ifdef CONFIG_X86_64 case PG_LEVEL_1G: + psize = PUD_PAGE_SIZE; + pmask = PUD_PAGE_MASK; + break; +#endif default: res = -EINVAL; goto out_unlock; @@ -343,7 +348,7 @@ static int split_large_page(pte_t *kpte, unsigned long address) { pgprot_t ref_prot; gfp_t gfp_flags = GFP_KERNEL; - unsigned long flags, addr, pfn; + unsigned long flags, addr, pfn, pfninc = 1; pte_t *pbase, *tmp; struct page *base; unsigned int i, level; @@ -372,11 +377,19 @@ static int split_large_page(pte_t *kpte, unsigned long address) #endif ref_prot = pte_pgprot(pte_clrhuge(*kpte)); +#ifdef CONFIG_X86_64 + if (level == PG_LEVEL_1G) { + pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT; + pgprot_val(ref_prot) |= _PAGE_PSE; + addr &= PUD_PAGE_MASK; + } +#endif + /* * Get the target pfn from the original entry: */ pfn = pte_pfn(*kpte); - for (i = 0; i < PTRS_PER_PTE; i++, pfn++) + for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); /* From 9df84993cb3d71669894654ab257f01f6e4ed48e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 4 Feb 2008 16:48:09 +0100 Subject: [PATCH 74/78] x86: cpa, cleanups Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/pageattr.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 42ca3d8effad..029fb07b3f03 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -16,6 +16,9 @@ #include 
#include +/* + * The current flushing context - we pass it instead of 5 arguments: + */ struct cpa_data { unsigned long vaddr; pgprot_t mask_set; @@ -206,6 +209,7 @@ pte_t *lookup_address(unsigned long address, int *level) if (pgd_none(*pgd)) return NULL; + pud = pud_offset(pgd, address); if (pud_none(*pud)) return NULL; @@ -223,9 +227,13 @@ pte_t *lookup_address(unsigned long address, int *level) return (pte_t *)pmd; *level = PG_LEVEL_4K; + return pte_offset_kernel(pmd, address); } +/* + * Set the new pmd in all the pgds we know about: + */ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) { /* change init_mm */ @@ -248,8 +256,9 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) #endif } -static int try_preserve_large_page(pte_t *kpte, unsigned long address, - struct cpa_data *cpa) +static int +try_preserve_large_page(pte_t *kpte, unsigned long address, + struct cpa_data *cpa) { unsigned long nextpage_addr, numpages, pmask, psize, flags; pte_t new_pte, old_pte, *tmp; @@ -341,17 +350,18 @@ static int try_preserve_large_page(pte_t *kpte, unsigned long address, out_unlock: spin_unlock_irqrestore(&pgd_lock, flags); + return res; } static int split_large_page(pte_t *kpte, unsigned long address) { - pgprot_t ref_prot; - gfp_t gfp_flags = GFP_KERNEL; unsigned long flags, addr, pfn, pfninc = 1; - pte_t *pbase, *tmp; - struct page *base; + gfp_t gfp_flags = GFP_KERNEL; unsigned int i, level; + pte_t *pbase, *tmp; + pgprot_t ref_prot; + struct page *base; #ifdef CONFIG_DEBUG_PAGEALLOC gfp_flags = GFP_ATOMIC | __GFP_NOWARN; @@ -505,7 +515,6 @@ repeat: * * Modules and drivers should use the set_memory_* APIs instead. 
*/ - static int change_page_attr_addr(struct cpa_data *cpa) { int err; From beaff6333b4a21e8f3b7f9a7c3c8f8716b2334bc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 4 Feb 2008 16:48:09 +0100 Subject: [PATCH 75/78] x86: cpa, eliminate CPA_ enum Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/pageattr.c | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 029fb07b3f03..fb2eedba76ad 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -27,11 +27,6 @@ struct cpa_data { int flushtlb; }; -enum { - CPA_NO_SPLIT = 0, - CPA_SPLIT, -}; - static inline int within(unsigned long addr, unsigned long start, unsigned long end) { @@ -263,7 +258,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, unsigned long nextpage_addr, numpages, pmask, psize, flags; pte_t new_pte, old_pte, *tmp; pgprot_t old_prot, new_prot; - int level, res = CPA_SPLIT; + int level, do_split = 1; /* * An Athlon 64 X2 showed hard hangs if we tried to preserve @@ -274,7 +269,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * disable this code until the hang can be debugged: */ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) - return res; + return 1; spin_lock_irqsave(&pgd_lock, flags); /* @@ -297,7 +292,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, break; #endif default: - res = -EINVAL; + do_split = -EINVAL; goto out_unlock; } @@ -325,7 +320,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * above: */ if (pgprot_val(new_prot) == pgprot_val(old_prot)) { - res = CPA_NO_SPLIT; + do_split = 0; goto out_unlock; } @@ -345,13 +340,13 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot)); __set_pmd_pte(kpte, address, new_pte); cpa->flushtlb = 1; - res = CPA_NO_SPLIT; + do_split = 0; } out_unlock: spin_unlock_irqrestore(&pgd_lock, flags); - 
return res; + return do_split; } static int split_large_page(pte_t *kpte, unsigned long address) @@ -429,7 +424,7 @@ out_unlock: static int __change_page_attr(unsigned long address, struct cpa_data *cpa) { struct page *kpte_page; - int level, res; + int level, do_split; pte_t *kpte; repeat: @@ -480,25 +475,26 @@ repeat: * Check, whether we can keep the large page intact * and just change the pte: */ - res = try_preserve_large_page(kpte, address, cpa); - if (res < 0) - return res; + do_split = try_preserve_large_page(kpte, address, cpa); + if (do_split < 0) + return do_split; /* * When the range fits into the existing large page, * return. cp->numpages and cpa->tlbflush have been updated in * try_large_page: */ - if (res == CPA_NO_SPLIT) + if (do_split == 0) return 0; /* * We have to split the large page: */ - res = split_large_page(kpte, address); - if (res) - return res; + do_split = split_large_page(kpte, address); + if (do_split) + return do_split; cpa->flushtlb = 1; + goto repeat; } From 87f7f8fe328388a1430a4c27cbe684f3925fd8a5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 4 Feb 2008 16:48:10 +0100 Subject: [PATCH 76/78] x86: cpa, clean up code flow Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/pageattr.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index fb2eedba76ad..4f033505127e 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -423,8 +423,8 @@ out_unlock: static int __change_page_attr(unsigned long address, struct cpa_data *cpa) { + int level, do_split, err; struct page *kpte_page; - int level, do_split; pte_t *kpte; repeat: @@ -476,26 +476,24 @@ repeat: * and just change the pte: */ do_split = try_preserve_large_page(kpte, address, cpa); - if (do_split < 0) - return do_split; - /* * When the range fits into the existing large page, * return. 
cp->numpages and cpa->tlbflush have been updated in * try_large_page: */ - if (do_split == 0) - return 0; + if (do_split <= 0) + return do_split; /* * We have to split the large page: */ - do_split = split_large_page(kpte, address); - if (do_split) - return do_split; - cpa->flushtlb = 1; + err = split_large_page(kpte, address); + if (!err) { + cpa->flushtlb = 1; + goto repeat; + } - goto repeat; + return err; } /** From 7b610eec7a06ede64f71459e7f412dfd96f4cc5e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Feb 2008 16:48:10 +0100 Subject: [PATCH 77/78] x86: cpa, micro-optimization Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 4f033505127e..bb55a78dcd62 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -237,6 +237,7 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) if (!SHARED_KERNEL_PMD) { struct page *page; + address = __pa(address); list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; pud_t *pud; @@ -351,7 +352,7 @@ out_unlock: static int split_large_page(pte_t *kpte, unsigned long address) { - unsigned long flags, addr, pfn, pfninc = 1; + unsigned long flags, pfn, pfninc = 1; gfp_t gfp_flags = GFP_KERNEL; unsigned int i, level; pte_t *pbase, *tmp; @@ -374,8 +375,6 @@ static int split_large_page(pte_t *kpte, unsigned long address) if (tmp != kpte) goto out_unlock; - address = __pa(address); - addr = address & PMD_PAGE_MASK; pbase = (pte_t *)page_address(base); #ifdef CONFIG_X86_32 paravirt_alloc_pt(&init_mm, page_to_pfn(base)); @@ -386,7 +385,6 @@ static int split_large_page(pte_t *kpte, unsigned long address) if (level == PG_LEVEL_1G) { pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT; pgprot_val(ref_prot) |= _PAGE_PSE; - addr &= PUD_PAGE_MASK; } #endif From 795d45b22c079946332bf3825afefe5a981a97b6 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 
4 Feb 2008 16:48:10 +0100 Subject: [PATCH 78/78] x86: fix RTC lockdep warning: potential hardirq recursion After disabling both CONFIG_DEBUG_LOCKING_API_SELFTESTS and netconsole (using current mainline) I get a login prompt, and also... [ 5.181668] SELinux: policy loaded with handle_unknown=deny [ 5.183315] type=1403 audit(1202100038.157:3): policy loaded auid=4294967295 ses=4294967295 [ 5.822073] SELinux: initialized (dev usbfs, type usbfs), uses genfs_contexts [ 7.819146] ------------[ cut here ]------------ [ 7.819146] WARNING: at kernel/lockdep.c:2033 trace_hardirqs_on+0x9b/0x10d() [ 7.819146] Modules linked in: generic ext3 jbd ide_disk ide_core [ 7.819146] Pid: 399, comm: hwclock Not tainted 2.6.24 #4 [ 7.819146] [] warn_on_slowpath+0x41/0x51 [ 7.819146] [] ? lock_release_holdtime+0x50/0x56 [ 7.819146] [] ? check_usage_forwards+0x19/0x3b [ 7.819146] [] ? __lock_acquire+0xac3/0xb0b [ 7.819146] [] ? native_sched_clock+0x8b/0x9f [ 7.819146] [] ? lock_release_holdtime+0x50/0x56 [ 7.819146] [] ? _spin_unlock_irq+0x22/0x42 [ 7.819146] [] trace_hardirqs_on+0x9b/0x10d [ 7.819146] [] _spin_unlock_irq+0x22/0x42 [ 7.819146] [] hpet_rtc_interrupt+0xdf/0x290 [ 7.819146] [] handle_IRQ_event+0x1a/0x46 [ 7.819146] [] handle_edge_irq+0xbe/0xff [ 7.819146] [] do_IRQ+0x6d/0x84 [ 7.819146] [] common_interrupt+0x2e/0x34 [ 7.819146] [] ? ktime_get_ts+0x8/0x3f [ 7.819146] [] ? lock_release+0x167/0x16f [ 7.819146] [] ? core_sys_select+0x2c/0x327 [ 7.819146] [] core_sys_select+0x74/0x327 [ 7.819146] [] ? native_sched_clock+0x8b/0x9f [ 7.819146] [] ? lock_release_holdtime+0x50/0x56 [ 7.819146] [] ? _spin_unlock_irq+0x22/0x42 [ 7.819146] [] ? trace_hardirqs_on+0xe6/0x10d [ 7.819146] [] ? _spin_unlock_irq+0x2d/0x42 [ 7.819146] [] ? rtc_do_ioctl+0x11b/0x677 [ 7.819146] [] ? inode_has_perm+0x5e/0x68 [ 7.819146] [] ? lock_release_holdtime+0x50/0x56 [ 7.819146] [] ? native_sched_clock+0x8b/0x9f [ 7.819146] [] ? file_has_perm+0x83/0x8c [ 7.819146] [] ? rtc_ioctl+0xf/0x11 [ 7.819146] [] ? 
do_ioctl+0x55/0x67 [ 7.819146] [] sys_select+0x93/0x163 [ 7.819146] [] ? sysenter_past_esp+0x9a/0xa5 [ 7.819146] [] sysenter_past_esp+0x5f/0xa5 [ 7.819146] ======================= [ 7.819146] ---[ end trace 96540ca301ffb84c ]--- [ 7.819210] rtc: lost 6 interrupts [ 7.870668] type=1400 audit(1202128840.794:4): avc: denied { audit_write } for pid=399 comm="hwclock" capability=29 scontext=system_u:system_r:hwclock_t:s0 tcontext=system_u:system_r:hwclock_t:s0 tclass=capability [ 9.538866] input: PC Speaker as /class/input/input5 Because hpet_rtc_interrupt()'s call to get_rtc_time() ends up resolving to include/asm-generic/rtc.h's (hilariously inlined) get_rtc_time(), which does spin_unlock_irq() from hard IRQ context. The obvious patch fixes it. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/asm-generic/rtc.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/include/asm-generic/rtc.h b/include/asm-generic/rtc.h index d3238f1f70a6..dd1bed860e64 100644 --- a/include/asm-generic/rtc.h +++ b/include/asm-generic/rtc.h @@ -35,10 +35,11 @@ static inline unsigned char rtc_is_updating(void) { unsigned char uip; + unsigned long flags; - spin_lock_irq(&rtc_lock); + spin_lock_irqsave(&rtc_lock, flags); uip = (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP); - spin_unlock_irq(&rtc_lock); + spin_unlock_irqrestore(&rtc_lock, flags); return uip; } @@ -46,6 +47,8 @@ static inline unsigned int get_rtc_time(struct rtc_time *time) { unsigned long uip_watchdog = jiffies; unsigned char ctrl; + unsigned long flags; + #ifdef CONFIG_MACH_DECSTATION unsigned int real_year; #endif @@ -72,7 +75,7 @@ static inline unsigned int get_rtc_time(struct rtc_time *time) * RTC has RTC_DAY_OF_WEEK, we ignore it, as it is only updated * by the RTC when initially set to a non-zero value. 
*/ - spin_lock_irq(&rtc_lock); + spin_lock_irqsave(&rtc_lock, flags); time->tm_sec = CMOS_READ(RTC_SECONDS); time->tm_min = CMOS_READ(RTC_MINUTES); time->tm_hour = CMOS_READ(RTC_HOURS); @@ -83,7 +86,7 @@ static inline unsigned int get_rtc_time(struct rtc_time *time) real_year = CMOS_READ(RTC_DEC_YEAR); #endif ctrl = CMOS_READ(RTC_CONTROL); - spin_unlock_irq(&rtc_lock); + spin_unlock_irqrestore(&rtc_lock, flags); if (!(ctrl & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {