From 8234b22e1c3287307c53655b16478cf8f5071555 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 27 Dec 2010 12:08:45 +0200 Subject: [PATCH 01/55] KVM: MMU: Don't flush shadow when enabling dirty tracking Instead, drop large mappings, which were the reason we dropped shadow. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/mmu.c | 12 ++++++++---- virt/kvm/kvm_main.c | 7 +------ 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index f02b8edc3d44..ccacf0b1b540 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3538,14 +3538,18 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) if (!test_bit(slot, sp->slot_bitmap)) continue; - if (sp->role.level != PT_PAGE_TABLE_LEVEL) - continue; - pt = sp->spt; - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + if (sp->role.level != PT_PAGE_TABLE_LEVEL + && is_large_pte(pt[i])) { + drop_spte(kvm, &pt[i], + shadow_trap_nonpresent_pte); + --kvm->stat.lpages; + } /* avoid RMW */ if (is_writable_pte(pt[i])) update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK); + } } kvm_flush_remote_tlbs(kvm); } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index f29abeb6a912..a3a8f5fb369e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -621,7 +621,7 @@ int __kvm_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, int user_alloc) { - int r, flush_shadow = 0; + int r; gfn_t base_gfn; unsigned long npages; unsigned long i; @@ -741,8 +741,6 @@ skip_lpage: if (kvm_create_dirty_bitmap(&new) < 0) goto out_free; /* destroy any largepage mappings for dirty tracking */ - if (old.npages) - flush_shadow = 1; } #else /* not defined CONFIG_S390 */ new.user_alloc = user_alloc; @@ -813,9 +811,6 @@ skip_lpage: kvm_free_physmem_slot(&old, &new); kfree(old_memslots); - if (flush_shadow) - kvm_arch_flush_shadow(kvm); - return 0; out_free: From 775077a0634a4a2149c85a63ec319b1e426d0564 Mon Sep 17 00:00:00 2001 From: Sedat Dilek Date: Mon, 3 Jan 2011 00:01:29 +0100 Subject: [PATCH 02/55] KVM guest: Fix section mismatch derived from kvm_guest_cpu_online() WARNING: arch/x86/built-in.o(.text+0x1bb74): Section mismatch in reference from the function kvm_guest_cpu_online() to the function .cpuinit.text:kvm_guest_cpu_init() The function kvm_guest_cpu_online() references the function __cpuinit kvm_guest_cpu_init(). This is often because kvm_guest_cpu_online lacks a __cpuinit annotation or the annotation of kvm_guest_cpu_init is wrong. This patch fixes the warning. Tested with linux-next (next-20101231) Signed-off-by: Sedat Dilek Acked-by: Rik van Riel Signed-off-by: Marcelo Tosatti --- arch/x86/kernel/kvm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 8dc44662394b..33c07b0b122e 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -493,7 +493,7 @@ static void __init kvm_smp_prepare_boot_cpu(void) native_smp_prepare_boot_cpu(); } -static void kvm_guest_cpu_online(void *dummy) +static void __cpuinit kvm_guest_cpu_online(void *dummy) { kvm_guest_cpu_init(); } From bc9c1933d9db227d991736382ddca21ed520c0ea Mon Sep 17 00:00:00 2001 From: Peter Tyser Date: Wed, 29 Dec 2010 13:51:25 -0600 Subject: [PATCH 03/55] KVM: PPC: Fix SPRG get/set for Book3S and BookE Previously SPRGs 4-7 were improperly read and written in kvm_arch_vcpu_ioctl_get_regs() and kvm_arch_vcpu_ioctl_set_regs(); Signed-off-by: Alexander Graf Signed-off-by: Peter Tyser Signed-off-by: Marcelo Tosatti --- arch/powerpc/kvm/book3s.c | 14 ++++++++------ arch/powerpc/kvm/booke.c | 14 ++++++++------ 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index badc983031b3..c961de40c676 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -1141,9 +1141,10 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) regs->sprg1 = vcpu->arch.shared->sprg1; regs->sprg2 = vcpu->arch.shared->sprg2; regs->sprg3 = vcpu->arch.shared->sprg3; - regs->sprg5 = vcpu->arch.sprg4; - regs->sprg6 = vcpu->arch.sprg5; - regs->sprg7 = vcpu->arch.sprg6; + regs->sprg4 = vcpu->arch.sprg4; + regs->sprg5 = vcpu->arch.sprg5; + regs->sprg6 = vcpu->arch.sprg6; + regs->sprg7 = vcpu->arch.sprg7; for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) regs->gpr[i] = kvmppc_get_gpr(vcpu, i); @@ -1167,9 +1168,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) vcpu->arch.shared->sprg1 = regs->sprg1; vcpu->arch.shared->sprg2 = regs->sprg2; vcpu->arch.shared->sprg3 = regs->sprg3; - vcpu->arch.sprg5 = regs->sprg4; - vcpu->arch.sprg6 = regs->sprg5; - vcpu->arch.sprg7 = regs->sprg6; + vcpu->arch.sprg4 = regs->sprg4; + vcpu->arch.sprg5 = regs->sprg5; + vcpu->arch.sprg6 = regs->sprg6; + vcpu->arch.sprg7 = regs->sprg7; for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) kvmppc_set_gpr(vcpu, i, regs->gpr[i]); diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 77575d08c818..ef76acb455c3 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -546,9 +546,10 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) regs->sprg1 = vcpu->arch.shared->sprg1; regs->sprg2 = vcpu->arch.shared->sprg2; regs->sprg3 = vcpu->arch.shared->sprg3; - regs->sprg5 = vcpu->arch.sprg4; - regs->sprg6 = vcpu->arch.sprg5; - regs->sprg7 = vcpu->arch.sprg6; + regs->sprg4 = vcpu->arch.sprg4; + regs->sprg5 = vcpu->arch.sprg5; + regs->sprg6 = vcpu->arch.sprg6; + regs->sprg7 = vcpu->arch.sprg7; for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) regs->gpr[i] = kvmppc_get_gpr(vcpu, i); @@ -572,9 +573,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) vcpu->arch.shared->sprg1 = regs->sprg1; vcpu->arch.shared->sprg2 = regs->sprg2; vcpu->arch.shared->sprg3 = regs->sprg3; - vcpu->arch.sprg5 = regs->sprg4; - vcpu->arch.sprg6 = regs->sprg5; - vcpu->arch.sprg7 = regs->sprg6; + vcpu->arch.sprg4 = regs->sprg4; + vcpu->arch.sprg5 = regs->sprg5; + vcpu->arch.sprg6 = regs->sprg6; + vcpu->arch.sprg7 = regs->sprg7; for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) kvmppc_set_gpr(vcpu, i, regs->gpr[i]); From d0ba64f9b4b3e41e7b91681fe04a334bc8bfc8f5 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 3 Jan 2011 14:28:51 +0200 Subject: [PATCH 04/55] KVM: VMX: Save and restore tr selector across mode switches When emulating real mode we play with tr hidden state, but leave tr.selector alone. That works well, except for save/restore, since loading TR writes it to the hidden state in vmx->rmode. Fix by also saving and restoring the tr selector; this makes things more consistent and allows migration to work during the early boot stages of Windows XP. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bf89ec2cfb82..a2e83a9729fc 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1683,6 +1683,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu) vmx->emulation_required = 1; vmx->rmode.vm86_active = 0; + vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector); vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base); vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit); vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); @@ -1756,6 +1757,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmx->emulation_required = 1; vmx->rmode.vm86_active = 1; + vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR); vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); From a9179499350d30446d8983d272a9157963604b0f Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 3 Jan 2011 14:28:52 +0200 Subject: [PATCH 05/55] KVM: VMX: Avoid leaking fake realmode state to userspace When emulating real mode, we fake some state: - tr.base points to a fake vm86 tss - segment registers are made to conform to vm86 restrictions change vmx_get_segment() not to expose this fake state to userspace; instead, return the original state. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 43 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a2e83a9729fc..87ad551b525b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2032,23 +2032,40 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) vmcs_writel(GUEST_CR4, hw_cr4); } -static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) -{ - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; - - return vmcs_readl(sf->base); -} - static void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { + struct vcpu_vmx *vmx = to_vmx(vcpu); struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + struct kvm_save_segment *save; u32 ar; + if (vmx->rmode.vm86_active + && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES + || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS + || seg == VCPU_SREG_GS) + && !emulate_invalid_guest_state) { + switch (seg) { + case VCPU_SREG_TR: save = &vmx->rmode.tr; break; + case VCPU_SREG_ES: save = &vmx->rmode.es; break; + case VCPU_SREG_DS: save = &vmx->rmode.ds; break; + case VCPU_SREG_FS: save = &vmx->rmode.fs; break; + case VCPU_SREG_GS: save = &vmx->rmode.gs; break; + default: BUG(); + } + var->selector = save->selector; + var->base = save->base; + var->limit = save->limit; + ar = save->ar; + if (seg == VCPU_SREG_TR + || var->selector == vmcs_read16(sf->selector)) + goto use_saved_rmode_seg; + } var->base = vmcs_readl(sf->base); var->limit = vmcs_read32(sf->limit); var->selector = vmcs_read16(sf->selector); ar = vmcs_read32(sf->ar_bytes); +use_saved_rmode_seg: if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state) ar = 0; var->type = ar & 15; @@ -2062,6 +2079,18 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, var->unusable = (ar >> 16) & 1; } +static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) +{ + struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; + struct kvm_segment s; + + if (to_vmx(vcpu)->rmode.vm86_active) { + vmx_get_segment(vcpu, &s, seg); + return s.base; + } + return vmcs_readl(sf->base); +} + static int vmx_get_cpl(struct kvm_vcpu *vcpu) { if (!is_protmode(vcpu)) From 3781c01c1574a2cd58fd5ba2bf58c710edff3da5 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 14 Jan 2011 16:45:02 +0100 Subject: [PATCH 06/55] KVM: SVM: Add support for perf-kvm This patch adds the necessary code to run perf-kvm on AMD machines. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 63fec1531e89..bfb49489566e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3647,13 +3647,21 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) local_irq_disable(); - stgi(); - vcpu->arch.cr2 = svm->vmcb->save.cr2; vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; + if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) + kvm_before_handle_nmi(&svm->vcpu); + + stgi(); + + /* Any pending NMI will happen here */ + + if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) + kvm_after_handle_nmi(&svm->vcpu); + sync_cr8_to_lapic(vcpu); svm->next_rip = 0; From 00c25bce02fde63eaa1227532cff9ac7b71de79e Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 4 Jan 2011 09:51:33 -0500 Subject: [PATCH 07/55] KVM: VMX: increase ple_gap default to 128 On some CPUs, a ple_gap of 41 is simply insufficient to ever trigger PLE exits, even with the minimalistic PLE test from kvm-unit-tests. http://git.kernel.org/?p=virt/kvm/kvm-unit-tests.git;a=commitdiff;h=eda71b28fa122203e316483b35f37aaacd42f545 For example, the Xeon X5670 CPU needs a ple_gap of at least 48 in order to get pause loop exits: # modprobe kvm_intel ple_gap=47 # taskset 1 /usr/local/bin/qemu-system-x86_64 \ -device testdev,chardev=log -chardev stdio,id=log \ -kernel x86/vmexit.flat -append ple-round-robin -smp 2 VNC server running on `::1:5900' enabling apic enabling apic ple-round-robin 58298446 # rmmod kvm_intel # modprobe kvm_intel ple_gap=48 # taskset 1 /usr/local/bin/qemu-system-x86_64 \ -device testdev,chardev=log -chardev stdio,id=log \ -kernel x86/vmexit.flat -append ple-round-robin -smp 2 VNC server running on `::1:5900' enabling apic enabling apic ple-round-robin 36616 Increase the ple_gap to 128 to be on the safe side. Signed-off-by: Rik van Riel Acked-by: Zhai, Edwin Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 87ad551b525b..e2c4e324f6fa 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -93,14 +93,14 @@ module_param(yield_on_hlt, bool, S_IRUGO); * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive * executions of PAUSE in a loop. Also indicate if ple enabled. - * According to test, this time is usually small than 41 cycles. + * According to test, this time is usually smaller than 128 cycles. * ple_window: upper bound on the amount of time a guest is allowed to execute * in a PAUSE loop. Tests indicate that most spinlocks are held for * less than 2^12 cycles * Time is measured based on a counter that runs at the same rate as the TSC, * refer SDM volume 3b section 21.6.13 & 22.1.3. */ -#define KVM_VMX_DEFAULT_PLE_GAP 41 +#define KVM_VMX_DEFAULT_PLE_GAP 128 #define KVM_VMX_DEFAULT_PLE_WINDOW 4096 static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP; module_param(ple_gap, int, S_IRUGO); From 1c696d0e1b7c10e1e8b34cb6c797329e3c33f262 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 6 Jan 2011 18:09:11 +0200 Subject: [PATCH 08/55] KVM: VMX: Simplify saving guest rcx in vmx_vcpu_run Change push top-of-stack pop guest-rcx pop dummy to pop guest-rcx which is the same thing, only simpler. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e2c4e324f6fa..da468c26639d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4066,7 +4066,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) "xchg %0, (%%"R"sp) \n\t" "mov %%"R"ax, %c[rax](%0) \n\t" "mov %%"R"bx, %c[rbx](%0) \n\t" - "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t" + "pop"Q" %c[rcx](%0) \n\t" "mov %%"R"dx, %c[rdx](%0) \n\t" "mov %%"R"si, %c[rsi](%0) \n\t" "mov %%"R"di, %c[rdi](%0) \n\t" @@ -4084,7 +4084,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) "mov %%cr2, %%"R"ax \n\t" "mov %%"R"ax, %c[cr2](%0) \n\t" - "pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t" + "pop %%"R"bp; pop %%"R"dx \n\t" "setbe %c[fail](%0) \n\t" : : "c"(vmx), "d"((unsigned long)HOST_RSP), [launched]"i"(offsetof(struct vcpu_vmx, launched)), From 40712faeb84dacfcb3925a88231daa08b3624d34 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 6 Jan 2011 18:09:12 +0200 Subject: [PATCH 09/55] KVM: VMX: Avoid atomic operation in vmx_vcpu_run Instead of exchanging the guest and host rcx, have separate storage for each. This allows us to avoid using the xchg instruction, which is is a little slower than normal operations. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index da468c26639d..ae4f02d47091 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4022,6 +4022,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) asm( /* Store host registers */ "push %%"R"dx; push %%"R"bp;" + "push %%"R"cx \n\t" /* placeholder for guest rcx */ "push %%"R"cx \n\t" "cmp %%"R"sp, %c[host_rsp](%0) \n\t" "je 1f \n\t" @@ -4063,7 +4064,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" ".Lkvm_vmx_return: " /* Save guest registers, load host registers, keep flags */ - "xchg %0, (%%"R"sp) \n\t" + "mov %0, %c[wordsize](%%"R"sp) \n\t" + "pop %0 \n\t" "mov %%"R"ax, %c[rax](%0) \n\t" "mov %%"R"bx, %c[rbx](%0) \n\t" "pop"Q" %c[rcx](%0) \n\t" @@ -4107,7 +4109,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), #endif - [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) + [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)), + [wordsize]"i"(sizeof(ulong)) : "cc", "memory" , R"ax", R"bx", R"di", R"si" #ifdef CONFIG_X86_64 From 63f42e023ea270fde65fa27f0ca766e13faa5608 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 12 Jan 2011 15:39:18 +0800 Subject: [PATCH 10/55] KVM: fix rcu usage warning in kvm_arch_vcpu_ioctl_set_sregs() Fix: [ 1001.499596] =================================================== [ 1001.499599] [ INFO: suspicious rcu_dereference_check() usage. ] [ 1001.499601] --------------------------------------------------- [ 1001.499604] include/linux/kvm_host.h:301 invoked rcu_dereference_check() without protection! ...... [ 1001.499636] Pid: 6035, comm: qemu-system-x86 Not tainted 2.6.37-rc6+ #62 [ 1001.499638] Call Trace: [ 1001.499644] [] lockdep_rcu_dereference+0x9d/0xa5 [ 1001.499653] [] gfn_to_memslot+0x8d/0xc8 [kvm] [ 1001.499661] [] gfn_to_hva+0x16/0x3f [kvm] [ 1001.499669] [] kvm_read_guest_page+0x1e/0x5e [kvm] [ 1001.499681] [] kvm_read_guest_page_mmu+0x53/0x5e [kvm] [ 1001.499699] [] load_pdptrs+0x3f/0x9c [kvm] [ 1001.499705] [] ? vmx_set_cr0+0x507/0x517 [kvm_intel] [ 1001.499717] [] kvm_arch_vcpu_ioctl_set_sregs+0x1f3/0x3c0 [kvm] [ 1001.499727] [] kvm_vcpu_ioctl+0x6a5/0xbc5 [kvm] Signed-off-by: Xiao Guangrong Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bcc0efce85bf..4be89e1c0147 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5574,7 +5574,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { int mmu_reset_needed = 0; - int pending_vec, max_bits; + int pending_vec, max_bits, idx; struct desc_ptr dt; dt.size = sregs->idt.limit; @@ -5603,10 +5603,13 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, kvm_x86_ops->set_cr4(vcpu, sregs->cr4); if (sregs->cr4 & X86_CR4_OSXSAVE) update_cpuid(vcpu); + + idx = srcu_read_lock(&vcpu->kvm->srcu); if (!is_long_mode(vcpu) && is_pae(vcpu)) { load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); mmu_reset_needed = 1; } + srcu_read_unlock(&vcpu->kvm->srcu, idx); if (mmu_reset_needed) kvm_mmu_reset_context(vcpu); From 9ca523183166343fde060e2198237bb345b8a77d Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sat, 15 Jan 2011 10:00:53 +0100 Subject: [PATCH 11/55] KVM: x86: Remove user space triggerable MCE error message This case is a pure user space error we do not need to record. Moreover, it can be misused to flood the kernel log. Remove it. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4be89e1c0147..5eccdba08bd0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2575,9 +2575,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, if (mce->status & MCI_STATUS_UC) { if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { - printk(KERN_DEBUG "kvm: set_mce: " - "injects mce exception while " - "previous one is in progress!\n"); kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return 0; } From d48ead8b0b48862a87138d04efb7580a1a25beb5 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 17 Jan 2011 21:21:08 +0100 Subject: [PATCH 12/55] KVM: fix build warning within __kvm_set_memory_region() on s390 Get rid of this warning: CC arch/s390/kvm/../../../virt/kvm/kvm_main.o arch/s390/kvm/../../../virt/kvm/kvm_main.c:596:12: warning: 'kvm_create_dirty_bitmap' defined but not used The only caller of the function is within a !CONFIG_S390 section, so add the same ifdef around kvm_create_dirty_bitmap() as well. Signed-off-by: Heiko Carstens Signed-off-by: Marcelo Tosatti --- virt/kvm/kvm_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index a3a8f5fb369e..fd67bcde9980 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -588,6 +588,7 @@ static int kvm_vm_release(struct inode *inode, struct file *filp) return 0; } +#ifndef CONFIG_S390 /* * Allocation size is twice as large as the actual dirty bitmap size. * This makes it possible to do double buffering: see x86's @@ -608,6 +609,7 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot) memslot->dirty_bitmap_head = memslot->dirty_bitmap; return 0; } +#endif /* !CONFIG_S390 */ /* * Allocate some memory and give it an address in the guest physical address From 6b7e2d0991489559a1df4500d77f7b76c4607ed0 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 12 Jan 2011 15:40:31 +0800 Subject: [PATCH 13/55] KVM: Add "exiting guest mode" state Currently we keep track of only two states: guest mode and host mode. This patch adds an "exiting guest mode" state that tells us that an IPI will happen soon, so unless we need to wait for the IPI, we can avoid it completely. Also 1: No need atomically to read/write ->mode in vcpu's thread 2: reorganize struct kvm_vcpu to make ->mode and ->requests in the same cache line explicitly Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/ia64/kvm/kvm-ia64.c | 2 ++ arch/x86/kvm/x86.c | 16 ++++++++++------ include/linux/kvm_host.h | 22 +++++++++++++++++----- virt/kvm/kvm_main.c | 7 ++++++- 4 files changed, 35 insertions(+), 12 deletions(-) diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 70d224d4264c..8213efe1998c 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -662,6 +662,7 @@ again: goto vcpu_run_fail; srcu_read_unlock(&vcpu->kvm->srcu, idx); + vcpu->mode = IN_GUEST_MODE; kvm_guest_enter(); /* @@ -683,6 +684,7 @@ again: */ barrier(); kvm_guest_exit(); + vcpu->mode = OUTSIDE_GUEST_MODE; preempt_enable(); idx = srcu_read_lock(&vcpu->kvm->srcu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5eccdba08bd0..a7f65aa6eef6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5210,14 +5210,18 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_load_guest_fpu(vcpu); kvm_load_guest_xcr0(vcpu); - atomic_set(&vcpu->guest_mode, 1); - smp_wmb(); + vcpu->mode = IN_GUEST_MODE; + + /* We should set ->mode before check ->requests, + * see the comment in make_all_cpus_request. + */ + smp_mb(); local_irq_disable(); - if (!atomic_read(&vcpu->guest_mode) || vcpu->requests + if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests || need_resched() || signal_pending(current)) { - atomic_set(&vcpu->guest_mode, 0); + vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); local_irq_enable(); preempt_enable(); @@ -5253,7 +5257,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); - atomic_set(&vcpu->guest_mode, 0); + vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); local_irq_enable(); @@ -6157,7 +6161,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) me = get_cpu(); if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) - if (atomic_xchg(&vcpu->guest_mode, 0)) + if (kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE) smp_send_reschedule(cpu); put_cpu(); } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b5021db21858..b99eacd988ab 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -98,19 +98,26 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); #endif +enum { + OUTSIDE_GUEST_MODE, + IN_GUEST_MODE, + EXITING_GUEST_MODE +}; + struct kvm_vcpu { struct kvm *kvm; #ifdef CONFIG_PREEMPT_NOTIFIERS struct preempt_notifier preempt_notifier; #endif + int cpu; int vcpu_id; - struct mutex mutex; - int cpu; - atomic_t guest_mode; - struct kvm_run *run; + int srcu_idx; + int mode; unsigned long requests; unsigned long guest_debug; - int srcu_idx; + + struct mutex mutex; + struct kvm_run *run; int fpu_active; int guest_fpu_loaded, guest_xcr0_loaded; @@ -140,6 +147,11 @@ struct kvm_vcpu { struct kvm_vcpu_arch arch; }; +static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu) +{ + return cmpxchg(&vcpu->mode, IN_GUEST_MODE, EXITING_GUEST_MODE); +} + /* * Some of the bitops functions do not support too long bitmaps. * This number must be determined not to exceed such limits. diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index fd67bcde9980..19209f849cf7 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -171,7 +171,12 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) if (kvm_make_check_request(req, vcpu)) continue; cpu = vcpu->cpu; - if (cpus != NULL && cpu != -1 && cpu != me) + + /* Set ->requests bit before we read ->mode */ + smp_mb(); + + if (cpus != NULL && cpu != -1 && cpu != me && + kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE) cpumask_set_cpu(cpu, cpus); } if (unlikely(cpus == NULL)) From 3cba41307a2b1344ab8c1b9f55202d1e9d7bf81b Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 12 Jan 2011 15:41:22 +0800 Subject: [PATCH 14/55] KVM: make make_all_cpus_request() lockless Now, we have 'vcpu->mode' to judge whether need to send ipi to other cpus, this way is very exact, so checking request bit is needless, then we can drop the spinlock let it's collateral Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 6 ------ virt/kvm/kvm_main.c | 9 +++------ 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b99eacd988ab..c8dee22b1945 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -224,7 +224,6 @@ struct kvm_memslots { struct kvm { spinlock_t mmu_lock; - raw_spinlock_t requests_lock; struct mutex slots_lock; struct mm_struct *mm; /* userspace tied to this vm */ struct kvm_memslots *memslots; @@ -731,11 +730,6 @@ static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu) set_bit(req, &vcpu->requests); } -static inline bool kvm_make_check_request(int req, struct kvm_vcpu *vcpu) -{ - return test_and_set_bit(req, &vcpu->requests); -} - static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) { if (test_bit(req, &vcpu->requests)) { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 19209f849cf7..4856a7dcbd7f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -165,11 +165,9 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) zalloc_cpumask_var(&cpus, GFP_ATOMIC); - raw_spin_lock(&kvm->requests_lock); - me = smp_processor_id(); + me = get_cpu(); kvm_for_each_vcpu(i, vcpu, kvm) { - if (kvm_make_check_request(req, vcpu)) - continue; + kvm_make_request(req, vcpu); cpu = vcpu->cpu; /* Set ->requests bit before we read ->mode */ @@ -185,7 +183,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) smp_call_function_many(cpus, ack_flush, NULL, 1); else called = false; - raw_spin_unlock(&kvm->requests_lock); + put_cpu(); free_cpumask_var(cpus); return called; } @@ -468,7 +466,6 @@ static struct kvm *kvm_create_vm(void) kvm->mm = current->mm; atomic_inc(&kvm->mm->mm_count); spin_lock_init(&kvm->mmu_lock); - raw_spin_lock_init(&kvm->requests_lock); kvm_eventfd_init(kvm); mutex_init(&kvm->lock); mutex_init(&kvm->irq_lock); From 91c9c3eda4f3066980d13a6907ef84f3a99364bd Mon Sep 17 00:00:00 2001 From: john cooper Date: Fri, 21 Jan 2011 00:21:00 -0500 Subject: [PATCH 15/55] KVM: x86: handle guest access to BBL_CR_CTL3 MSR A correction to Intel cpu model CPUID data (patch queued) caused winxp to BSOD when booted with a Penryn model. This was traced to the CPUID "model" field correction from 6 -> 23 (as is proper for a Penryn class of cpu). Only in this case does the problem surface. The cause for this failure is winxp accessing the BBL_CR_CTL3 MSR which is unsupported by current kvm, appears to be a legacy MSR not fully characterized yet existing in current silicon, and is apparently carried forward in MSR space to accommodate vintage code as here. It is not yet conclusive whether this MSR implements any of its legacy functionality or is just an ornamental dud for compatibility. While I found no silicon version specific documentation link to this MSR, a general description exists in Intel's developer's reference which agrees with the functional behavior of other bootloader/kernel code I've examined accessing BBL_CR_CTL3. Regrettably winxp appears to be setting bit #19 called out as "reserved" in the above document. So to minimally accommodate this MSR, kvm msr get will provide the equivalent mock data and kvm msr write will simply toss the guest passed data without interpretation. While this treatment of BBL_CR_CTL3 addresses the immediate problem, the approach may be modified pending clarification from Intel. Signed-off-by: john cooper Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/msr-index.h | 1 + arch/x86/kvm/x86.c | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 823d48223400..fd5a1f365c95 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -43,6 +43,7 @@ #define MSR_MTRRcap 0x000000fe #define MSR_IA32_BBL_CR_CTL 0x00000119 +#define MSR_IA32_BBL_CR_CTL3 0x0000011e #define MSR_IA32_SYSENTER_CS 0x00000174 #define MSR_IA32_SYSENTER_ESP 0x00000175 diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a7f65aa6eef6..7faf262ab202 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1592,6 +1592,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) } else return set_msr_hyperv(vcpu, msr, data); break; + case MSR_IA32_BBL_CR_CTL3: + /* Drop writes to this legacy MSR -- see rdmsr + * counterpart for further detail. + */ + pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); + break; default: if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) return xen_hvm_config(vcpu, data); @@ -1846,6 +1852,19 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) } else return get_msr_hyperv(vcpu, msr, pdata); break; + case MSR_IA32_BBL_CR_CTL3: + /* This legacy MSR exists but isn't fully documented in current + * silicon. It is however accessed by winxp in very narrow + * scenarios where it sets bit #19, itself documented as + * a "reserved" bit. Best effort attempt to source coherent + * read data here should the balance of the register be + * interpreted by the guest: + * + * L2 cache control register 3: 64GB range, 256KB size, + * enabled, latency 0x1, configured + */ + data = 0xbe702111; + break; default: if (!ignore_msrs) { pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); From 0014bd990e69063b0fb78940b35439d7980ce3ee Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Sun, 30 Jan 2011 11:15:47 +0800 Subject: [PATCH 16/55] mm: export __get_user_pages In most cases, get_user_pages and get_user_pages_fast should be used to pin user pages in memory. But sometimes, some special flags except FOLL_GET, FOLL_WRITE and FOLL_FORCE are needed, for example in following patch, KVM needs FOLL_HWPOISON. To support these users, __get_user_pages is exported directly. There are some symbol name conflicts in infiniband driver, fixed them too. Signed-off-by: Huang Ying CC: Andrew Morton CC: Michel Lespinasse CC: Roland Dreier CC: Ralph Campbell Signed-off-by: Marcelo Tosatti --- .../infiniband/hw/ipath/ipath_user_pages.c | 6 +-- drivers/infiniband/hw/qib/qib_user_pages.c | 6 +-- include/linux/mm.h | 4 ++ mm/internal.h | 5 -- mm/memory.c | 50 +++++++++++++++++++ 5 files changed, 60 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/hw/ipath/ipath_user_pages.c b/drivers/infiniband/hw/ipath/ipath_user_pages.c index bab9f74c0665..cfed5399f074 100644 --- a/drivers/infiniband/hw/ipath/ipath_user_pages.c +++ b/drivers/infiniband/hw/ipath/ipath_user_pages.c @@ -53,8 +53,8 @@ static void __ipath_release_user_pages(struct page **p, size_t num_pages, } /* call with current->mm->mmap_sem held */ -static int __get_user_pages(unsigned long start_page, size_t num_pages, - struct page **p, struct vm_area_struct **vma) +static int __ipath_get_user_pages(unsigned long start_page, size_t num_pages, + struct page **p, struct vm_area_struct **vma) { unsigned long lock_limit; size_t got; @@ -165,7 +165,7 @@ int ipath_get_user_pages(unsigned long start_page, size_t num_pages, down_write(¤t->mm->mmap_sem); - ret = __get_user_pages(start_page, num_pages, p, NULL); + ret = __ipath_get_user_pages(start_page, num_pages, p, NULL); up_write(¤t->mm->mmap_sem); diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c index d7a26c1d4f37..7689e49c13c9 100644 --- a/drivers/infiniband/hw/qib/qib_user_pages.c +++ b/drivers/infiniband/hw/qib/qib_user_pages.c @@ -51,8 +51,8 @@ static void __qib_release_user_pages(struct page **p, size_t num_pages, /* * Call with current->mm->mmap_sem held. */ -static int __get_user_pages(unsigned long start_page, size_t num_pages, - struct page **p, struct vm_area_struct **vma) +static int __qib_get_user_pages(unsigned long start_page, size_t num_pages, + struct page **p, struct vm_area_struct **vma) { unsigned long lock_limit; size_t got; @@ -136,7 +136,7 @@ int qib_get_user_pages(unsigned long start_page, size_t num_pages, down_write(¤t->mm->mmap_sem); - ret = __get_user_pages(start_page, num_pages, p, NULL); + ret = __qib_get_user_pages(start_page, num_pages, p, NULL); up_write(¤t->mm->mmap_sem); diff --git a/include/linux/mm.h b/include/linux/mm.h index 679300c050f5..46150c66318e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -965,6 +965,10 @@ static inline int handle_mm_fault(struct mm_struct *mm, extern int make_pages_present(unsigned long addr, unsigned long end); extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); +int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, unsigned int foll_flags, + struct page **pages, struct vm_area_struct **vmas, + int *nonblocking); int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int nr_pages, int write, int force, struct page **pages, struct vm_area_struct **vmas); diff --git a/mm/internal.h b/mm/internal.h index 69488205723d..3438dd43a062 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -245,11 +245,6 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, } #endif /* CONFIG_SPARSEMEM */ -int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, unsigned int foll_flags, - struct page **pages, struct vm_area_struct **vmas, - int *nonblocking); - #define ZONE_RECLAIM_NOSCAN -2 #define ZONE_RECLAIM_FULL -1 #define ZONE_RECLAIM_SOME 0 diff --git a/mm/memory.c b/mm/memory.c index 5823698c2b71..806a37ec71bd 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1410,6 +1410,55 @@ no_page_table: return page; } +/** + * __get_user_pages() - pin user pages in memory + * @tsk: task_struct of target task + * @mm: mm_struct of target mm + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying pin behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * @nonblocking: whether waiting for disk IO or mmap_sem contention + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If nr_pages is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. Each page returned must be released + * with a put_page() call when it is finished with. vmas will only + * remain valid while mmap_sem is held. + * + * Must be called with mmap_sem held for read or write. + * + * __get_user_pages walks a process's page tables and takes a reference to + * each struct page that each user address corresponds to at a given + * instant. That is, it takes the page that would be accessed if a user + * thread accesses the given user virtual address at that instant. + * + * This does not guarantee that the page exists in the user mappings when + * __get_user_pages returns, and there may even be a completely different + * page there in some cases (eg. if mmapped pagecache has been invalidated + * and subsequently re faulted). However it does guarantee that the page + * won't be freed completely. And mostly callers simply care that the page + * contains data that was valid *at some point in time*. Typically, an IO + * or similar operation cannot guarantee anything stronger anyway because + * locks can't be held over the syscall boundary. + * + * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If + * the page is written to, set_page_dirty (or set_page_dirty_lock, as + * appropriate) must be called after the page is finished with, and + * before put_page is called. + * + * If @nonblocking != NULL, __get_user_pages will not wait for disk IO + * or mmap_sem contention, and if waiting is needed to pin all pages, + * *@nonblocking will be set to 0. + * + * In most cases, get_user_pages or get_user_pages_fast should be used + * instead of __get_user_pages. __get_user_pages should be used only if + * you need some special @gup_flags. + */ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, @@ -1578,6 +1627,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, } while (nr_pages); return i; } +EXPORT_SYMBOL(__get_user_pages); /** * get_user_pages() - pin user pages in memory From 69ebb83e13e514222b0ae4f8bd813a17679ed876 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Sun, 30 Jan 2011 11:15:48 +0800 Subject: [PATCH 17/55] mm: make __get_user_pages return -EHWPOISON for HWPOISON page optionally Make __get_user_pages return -EHWPOISON for HWPOISON page only if FOLL_HWPOISON is specified. With this patch, the interested callers can distinguish HWPOISON pages from general FAULT pages, while other callers will still get -EFAULT for all these pages, so the user space interface need not to be changed. This feature is needed by KVM, where UCR MCE should be relayed to guest for HWPOISON page, while instruction emulation and MMIO will be tried for general FAULT page. The idea comes from Andrew Morton. Signed-off-by: Huang Ying Cc: Andrew Morton Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/alpha/include/asm/errno.h | 2 ++ arch/mips/include/asm/errno.h | 2 ++ arch/parisc/include/asm/errno.h | 2 ++ arch/sparc/include/asm/errno.h | 2 ++ include/asm-generic/errno.h | 2 ++ include/linux/mm.h | 1 + mm/memory.c | 13 ++++++++++--- 7 files changed, 21 insertions(+), 3 deletions(-) diff --git a/arch/alpha/include/asm/errno.h b/arch/alpha/include/asm/errno.h index 98099bda9370..e5f29ca28180 100644 --- a/arch/alpha/include/asm/errno.h +++ b/arch/alpha/include/asm/errno.h @@ -122,4 +122,6 @@ #define ERFKILL 138 /* Operation not possible due to RF-kill */ +#define EHWPOISON 139 /* Memory page has hardware error */ + #endif diff --git a/arch/mips/include/asm/errno.h b/arch/mips/include/asm/errno.h index a0efc73819e4..6dcd3583ed04 100644 --- a/arch/mips/include/asm/errno.h +++ b/arch/mips/include/asm/errno.h @@ -121,6 +121,8 @@ #define ERFKILL 167 /* Operation not possible due to RF-kill */ +#define EHWPOISON 168 /* Memory page has hardware error */ + #define EDQUOT 1133 /* Quota exceeded */ #ifdef __KERNEL__ diff --git a/arch/parisc/include/asm/errno.h b/arch/parisc/include/asm/errno.h index 9992abdd782d..135ad6047e51 100644 --- a/arch/parisc/include/asm/errno.h +++ b/arch/parisc/include/asm/errno.h @@ -122,4 +122,6 @@ #define ERFKILL 256 /* Operation not possible due to RF-kill */ +#define EHWPOISON 257 /* Memory page has hardware error */ + #endif diff --git a/arch/sparc/include/asm/errno.h b/arch/sparc/include/asm/errno.h index 4e2bc490d714..c351aba997b7 100644 --- a/arch/sparc/include/asm/errno.h +++ b/arch/sparc/include/asm/errno.h @@ -112,4 +112,6 @@ #define ERFKILL 134 /* Operation not possible due to RF-kill */ +#define EHWPOISON 135 /* Memory page has hardware error */ + #endif diff --git a/include/asm-generic/errno.h b/include/asm-generic/errno.h index 28cc03bf19e6..a1331ce50445 100644 --- a/include/asm-generic/errno.h +++ b/include/asm-generic/errno.h @@ -108,4 +108,6 @@ #define ERFKILL 132 /* Operation not possible due to RF-kill */ +#define EHWPOISON 133 /* Memory page has hardware error */ + #endif diff --git a/include/linux/mm.h b/include/linux/mm.h index 46150c66318e..a77c82c56e05 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1532,6 +1532,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address, #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ #define FOLL_MLOCK 0x40 /* mark page as mlocked */ #define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ +#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, void *data); diff --git a/mm/memory.c b/mm/memory.c index 806a37ec71bd..346ee7e041fd 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1576,9 +1576,16 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) return i ? i : -ENOMEM; - if (ret & - (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE| - VM_FAULT_SIGBUS)) + if (ret & (VM_FAULT_HWPOISON | + VM_FAULT_HWPOISON_LARGE)) { + if (i) + return i; + else if (gup_flags & FOLL_HWPOISON) + return -EHWPOISON; + else + return -EFAULT; + } + if (ret & VM_FAULT_SIGBUS) return i ? i : -EFAULT; BUG(); } From fafc3dbaac6447ab8f78d2f7f32a521b24fc6b36 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Sun, 30 Jan 2011 11:15:49 +0800 Subject: [PATCH 18/55] KVM: Replace is_hwpoison_address with __get_user_pages is_hwpoison_address only checks whether the page table entry is hwpoisoned, regardless the memory page mapped. While __get_user_pages will check both. QEMU will clear the poisoned page table entry (via unmap/map) to make it possible to allocate a new memory page for the virtual address across guest rebooting. But it is also possible that the underlying memory page is kept poisoned even after the corresponding page table entry is cleared, that is, a new memory page can not be allocated. __get_user_pages can catch these situations. Signed-off-by: Huang Ying Signed-off-by: Marcelo Tosatti --- virt/kvm/kvm_main.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4856a7dcbd7f..002fe0b12c9f 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1028,6 +1028,15 @@ static pfn_t get_fault_pfn(void) return fault_pfn; } +static inline int check_user_page_hwpoison(unsigned long addr) +{ + int rc, flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE; + + rc = __get_user_pages(current, current->mm, addr, 1, + flags, NULL, NULL, NULL); + return rc == -EHWPOISON; +} + static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic, bool *async, bool write_fault, bool *writable) { @@ -1075,7 +1084,7 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic, return get_fault_pfn(); down_read(¤t->mm->mmap_sem); - if (is_hwpoison_address(addr)) { + if (check_user_page_hwpoison(addr)) { up_read(¤t->mm->mmap_sem); get_page(hwpoison_page); return page_to_pfn(hwpoison_page); From f58c9df78c0360f0eb3852b9cc3a61e689bc2dd1 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Sun, 30 Jan 2011 11:15:49 +0800 Subject: [PATCH 19/55] mm: remove is_hwpoison_address Unused. Signed-off-by: Huang Ying Signed-off-by: Marcelo Tosatti --- include/linux/mm.h | 8 -------- mm/memory-failure.c | 32 -------------------------------- 2 files changed, 40 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index a77c82c56e05..78219887308e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1625,14 +1625,6 @@ extern int sysctl_memory_failure_recovery; extern void shake_page(struct page *p, int access); extern atomic_long_t mce_bad_pages; extern int soft_offline_page(struct page *page, int flags); -#ifdef CONFIG_MEMORY_FAILURE -int is_hwpoison_address(unsigned long addr); -#else -static inline int is_hwpoison_address(unsigned long addr) -{ - return 0; -} -#endif extern void dump_page(struct page *page); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 0207c2f6f8bd..99ccb4472623 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1487,35 +1487,3 @@ done: /* keep elevated page count for bad page */ return ret; } - -/* - * The caller must hold current->mm->mmap_sem in read mode. - */ -int is_hwpoison_address(unsigned long addr) -{ - pgd_t *pgdp; - pud_t pud, *pudp; - pmd_t pmd, *pmdp; - pte_t pte, *ptep; - swp_entry_t entry; - - pgdp = pgd_offset(current->mm, addr); - if (!pgd_present(*pgdp)) - return 0; - pudp = pud_offset(pgdp, addr); - pud = *pudp; - if (!pud_present(pud) || pud_large(pud)) - return 0; - pmdp = pmd_offset(pudp, addr); - pmd = *pmdp; - if (!pmd_present(pmd) || pmd_large(pmd)) - return 0; - ptep = pte_offset_map(pmdp, addr); - pte = *ptep; - pte_unmap(ptep); - if (!is_swap_pte(pte)) - return 0; - entry = pte_to_swp_entry(pte); - return is_hwpoison_entry(entry); -} -EXPORT_SYMBOL_GPL(is_hwpoison_address); From 12f9a48f7bf5bfe6620b03028a865f26a10e1fce Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 1 Feb 2011 14:16:40 -0500 Subject: [PATCH 20/55] KVM: x86: release kvmclock page on reset When a vcpu is reset, kvmclock page keeps being written to this days. This is wrong and inconsistent: a cpu reset should take it to its initial state. Signed-off-by: Glauber Costa CC: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7faf262ab202..712af904706f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1453,6 +1453,14 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) return 0; } +static void kvmclock_reset(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.time_page) { + kvm_release_page_dirty(vcpu->arch.time_page); + vcpu->arch.time_page = NULL; + } +} + int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) { switch (msr) { @@ -1510,10 +1518,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) break; case MSR_KVM_SYSTEM_TIME_NEW: case MSR_KVM_SYSTEM_TIME: { - if (vcpu->arch.time_page) { - kvm_release_page_dirty(vcpu->arch.time_page); - vcpu->arch.time_page = NULL; - } + kvmclock_reset(vcpu); vcpu->arch.time = data; kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); @@ -5837,10 +5842,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { - if (vcpu->arch.time_page) { - kvm_release_page_dirty(vcpu->arch.time_page); - vcpu->arch.time_page = NULL; - } + kvmclock_reset(vcpu); free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); fx_free(vcpu); @@ -5901,6 +5903,8 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) kvm_make_request(KVM_REQ_EVENT, vcpu); vcpu->arch.apf.msr_val = 0; + kvmclock_reset(vcpu); + kvm_clear_async_pf_completion_queue(vcpu); kvm_async_pf_hash_reset(vcpu); vcpu->arch.apf.halted = false; From 0bb8865979629a9dfc6761f8f4d47926d539daa0 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 1 Feb 2011 13:27:15 +0100 Subject: [PATCH 21/55] KVM: x86: Drop obsolete warning about INIT on runnable VCPU This warning was once used for debugging QEMU user space. Though uncommon, it is actually possible to send an INIT request to a running VCPU. So better drop this warning before someone misuses it to flood kernel logs this way. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/lapic.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 93cf9d0d3653..0171e66e9994 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -417,10 +417,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, case APIC_DM_INIT: if (level) { result = 1; - if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) - printk(KERN_DEBUG - "INIT on a runnable vcpu %d\n", - vcpu->vcpu_id); vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); From 3e90943907ff84cf0379a57c01c296a9b33c903e Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 1 Feb 2011 16:32:02 +0200 Subject: [PATCH 22/55] KVM: Drop bogus x86_decode_insn() error check x86_decode_insn() doesn't return X86EMUL_* values, so the check for X86EMUL_PROPOGATE_FAULT will always fail. There is a proper check later on, so there is no need for a replacement for this code. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 712af904706f..dd8016d2efa9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4412,8 +4412,6 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, vcpu->arch.emulate_ctxt.perm_ok = false; r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len); - if (r == X86EMUL_PROPAGATE_FAULT) - goto done; trace_kvm_emulate_insn_start(vcpu); @@ -4473,7 +4471,6 @@ restart: return handle_emulation_failure(vcpu); } -done: if (vcpu->arch.emulate_ctxt.have_exception) { inject_emulated_exception(vcpu); r = EMULATE_DONE; From d867162c6d1028d16358f4d2383d1833a849c74d Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 1 Feb 2011 16:32:03 +0200 Subject: [PATCH 23/55] KVM: x86 emulator: vendor specific instructions Mark some instructions as vendor specific, and allow the caller to request emulation only of vendor specific instructions. This is useful in some circumstances (responding to a #UD fault). Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/emulate.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 8e37deb1eb38..50ebc327a368 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -239,6 +239,7 @@ struct x86_emulate_ctxt { int interruptibility; bool perm_ok; /* do not check permissions if true */ + bool only_vendor_specific_insn; bool have_exception; struct x86_exception exception; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index caf966781d25..a90d7e033304 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -76,6 +76,7 @@ #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ /* Misc flags */ +#define VendorSpecific (1<<22) /* Vendor specific instruction */ #define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */ #define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */ #define Undefined (1<<25) /* No Such Instruction */ @@ -2365,7 +2366,8 @@ static struct group_dual group7 = { { D(SrcMem16 | ModRM | Mov | Priv), D(SrcMem | ModRM | ByteOp | Priv | NoAccess), }, { - D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv), + D(SrcNone | ModRM | Priv | VendorSpecific), N, + N, D(SrcNone | ModRM | Priv | VendorSpecific), D(SrcNone | ModRM | DstMem | Mov), N, D(SrcMem16 | ModRM | Mov | Priv), N, } }; @@ -2489,7 +2491,7 @@ static struct opcode opcode_table[256] = { static struct opcode twobyte_table[256] = { /* 0x00 - 0x0F */ N, GD(0, &group7), N, N, - N, D(ImplicitOps), D(ImplicitOps | Priv), N, + N, D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv), N, D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N, N, D(ImplicitOps | ModRM), N, N, /* 0x10 - 0x1F */ @@ -2502,7 +2504,8 @@ static struct opcode twobyte_table[256] = { /* 0x30 - 0x3F */ D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc), D(ImplicitOps | Priv), N, - D(ImplicitOps), D(ImplicitOps | Priv), N, N, + D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv | VendorSpecific), + N, N, N, N, N, N, N, N, N, N, /* 0x40 - 0x4F */ X16(D(DstReg | SrcMem | ModRM | Mov)), @@ -2741,6 +2744,9 @@ done_prefixes: if (c->d == 0 || (c->d & Undefined)) return -1; + if (!(c->d & VendorSpecific) && ctxt->only_vendor_specific_insn) + return -1; + if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) c->op_bytes = 8; From 4005996e428b0d3df10cd0eba389a14b9f5403e4 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 1 Feb 2011 16:32:04 +0200 Subject: [PATCH 24/55] KVM: Drop ad-hoc vendor specific instruction restriction Use the new support in the emulator, and drop the ad-hoc code in x86.c. Signed-off-by: Avi Kivity Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/x86.c | 33 +++++---------------------------- 1 file changed, 5 insertions(+), 28 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index dd8016d2efa9..8575d85202d7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4411,39 +4411,16 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, vcpu->arch.emulate_ctxt.have_exception = false; vcpu->arch.emulate_ctxt.perm_ok = false; + vcpu->arch.emulate_ctxt.only_vendor_specific_insn + = emulation_type & EMULTYPE_TRAP_UD; + r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len); trace_kvm_emulate_insn_start(vcpu); - - /* Only allow emulation of specific instructions on #UD - * (namely VMMCALL, sysenter, sysexit, syscall)*/ - if (emulation_type & EMULTYPE_TRAP_UD) { - if (!c->twobyte) - return EMULATE_FAIL; - switch (c->b) { - case 0x01: /* VMMCALL */ - if (c->modrm_mod != 3 || c->modrm_rm != 1) - return EMULATE_FAIL; - break; - case 0x34: /* sysenter */ - case 0x35: /* sysexit */ - if (c->modrm_mod != 0 || c->modrm_rm != 0) - return EMULATE_FAIL; - break; - case 0x05: /* syscall */ - if (c->modrm_mod != 0 || c->modrm_rm != 0) - return EMULATE_FAIL; - break; - default: - return EMULATE_FAIL; - } - - if (!(c->modrm_reg == 0 || c->modrm_reg == 3)) - return EMULATE_FAIL; - } - ++vcpu->stat.insn_emulation; if (r) { + if (emulation_type & EMULTYPE_TRAP_UD) + return EMULATE_FAIL; if (reexecute_instruction(vcpu, cr2)) return EMULATE_DONE; if (emulation_type & EMULTYPE_SKIP) From 77c100c83e84316ced2507c5799f79c2c80bc6b9 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 1 Feb 2011 09:51:46 -0500 Subject: [PATCH 25/55] export pid symbols needed for kvm_vcpu_on_spin Export the symbols required for a race-free kvm_vcpu_on_spin. Signed-off-by: Rik van Riel Signed-off-by: Avi Kivity --- kernel/fork.c | 1 + kernel/pid.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/kernel/fork.c b/kernel/fork.c index 25e429152ddc..05b92c457010 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -193,6 +193,7 @@ void __put_task_struct(struct task_struct *tsk) if (!profile_handoff_task(tsk)) free_task(tsk); } +EXPORT_SYMBOL_GPL(__put_task_struct); /* * macro override instead of weak attribute alias, to workaround diff --git a/kernel/pid.c b/kernel/pid.c index 39b65b69584f..02f221274265 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -435,6 +435,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type) rcu_read_unlock(); return pid; } +EXPORT_SYMBOL_GPL(get_task_pid); struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) { @@ -446,6 +447,7 @@ struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) rcu_read_unlock(); return result; } +EXPORT_SYMBOL_GPL(get_pid_task); struct pid *find_get_pid(pid_t nr) { From 34bb10b79de7df118de832f6832efb630e646577 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 1 Feb 2011 09:52:41 -0500 Subject: [PATCH 26/55] KVM: keep track of which task is running a KVM vcpu Keep track of which task is running a KVM vcpu. This helps us figure out later what task to wake up if we want to boost a vcpu that got preempted. Unfortunately there are no guarantees that the same task always keeps the same vcpu, so we can only track the task across a single "run" of the vcpu. Signed-off-by: Rik van Riel Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index c8dee22b1945..4721b11b922a 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -122,6 +122,7 @@ struct kvm_vcpu { int fpu_active; int guest_fpu_loaded, guest_xcr0_loaded; wait_queue_head_t wq; + struct pid *pid; int sigset_active; sigset_t sigset; struct kvm_vcpu_stat stat; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 002fe0b12c9f..bc8bfd15ab71 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -137,6 +137,14 @@ void vcpu_load(struct kvm_vcpu *vcpu) int cpu; mutex_lock(&vcpu->mutex); + if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) { + /* The thread running this VCPU changed. */ + struct pid *oldpid = vcpu->pid; + struct pid *newpid = get_task_pid(current, PIDTYPE_PID); + rcu_assign_pointer(vcpu->pid, newpid); + synchronize_rcu(); + put_pid(oldpid); + } cpu = get_cpu(); preempt_notifier_register(&vcpu->preempt_notifier); kvm_arch_vcpu_load(vcpu, cpu); @@ -212,6 +220,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) vcpu->cpu = -1; vcpu->kvm = kvm; vcpu->vcpu_id = id; + vcpu->pid = NULL; init_waitqueue_head(&vcpu->wq); kvm_async_pf_vcpu_init(vcpu); @@ -236,6 +245,7 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_init); void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) { + put_pid(vcpu->pid); kvm_arch_vcpu_uninit(vcpu); free_page((unsigned long)vcpu->run); } From 217ece6129f2d3b4fdd18d9e79be9e43d8d14a42 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 1 Feb 2011 09:53:28 -0500 Subject: [PATCH 27/55] KVM: use yield_to instead of sleep in kvm_vcpu_on_spin Instead of sleeping in kvm_vcpu_on_spin, which can cause gigantic slowdowns of certain workloads, we instead use yield_to to get another VCPU in the same KVM guest to run sooner. This seems to give a 10-15% speedup in certain workloads. Signed-off-by: Rik van Riel Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 57 +++++++++++++++++++++++++++++++++------- 2 files changed, 48 insertions(+), 10 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 4721b11b922a..3751ea0d1f92 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -235,6 +235,7 @@ struct kvm { #endif struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; atomic_t online_vcpus; + int last_boosted_vcpu; struct list_head vm_list; struct mutex lock; struct kvm_io_bus *buses[KVM_NR_BUSES]; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index bc8bfd15ab71..2dc53a6dc285 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1484,18 +1484,55 @@ void kvm_resched(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_resched); -void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu) +void kvm_vcpu_on_spin(struct kvm_vcpu *me) { - ktime_t expires; - DEFINE_WAIT(wait); + struct kvm *kvm = me->kvm; + struct kvm_vcpu *vcpu; + int last_boosted_vcpu = me->kvm->last_boosted_vcpu; + int yielded = 0; + int pass; + int i; - prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); - - /* Sleep for 100 us, and hope lock-holder got scheduled */ - expires = ktime_add_ns(ktime_get(), 100000UL); - schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); - - finish_wait(&vcpu->wq, &wait); + /* + * We boost the priority of a VCPU that is runnable but not + * currently running, because it got preempted by something + * else and called schedule in __vcpu_run. Hopefully that + * VCPU is holding the lock that we need and will release it. + * We approximate round-robin by starting at the last boosted VCPU. + */ + for (pass = 0; pass < 2 && !yielded; pass++) { + kvm_for_each_vcpu(i, vcpu, kvm) { + struct task_struct *task = NULL; + struct pid *pid; + if (!pass && i < last_boosted_vcpu) { + i = last_boosted_vcpu; + continue; + } else if (pass && i > last_boosted_vcpu) + break; + if (vcpu == me) + continue; + if (waitqueue_active(&vcpu->wq)) + continue; + rcu_read_lock(); + pid = rcu_dereference(vcpu->pid); + if (pid) + task = get_pid_task(vcpu->pid, PIDTYPE_PID); + rcu_read_unlock(); + if (!task) + continue; + if (task->flags & PF_VCPU) { + put_task_struct(task); + continue; + } + if (yield_to(task, 1)) { + put_task_struct(task); + kvm->last_boosted_vcpu = i; + yielded = 1; + break; + } + put_task_struct(task); + } + } } EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); From f86368493ec038218e8663cc1b6e5393cd8e008a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 3 Feb 2011 15:07:07 +0200 Subject: [PATCH 28/55] KVM: Fix race between nmi injection and enabling nmi window The interrupt injection logic looks something like if an nmi is pending, and nmi injection allowed inject nmi if an nmi is pending request exit on nmi window the problem is that "nmi is pending" can be set asynchronously by the PIT; if it happens to fire between the two if statements, we will request an nmi window even though nmi injection is allowed. On SVM, this has disasterous results, since it causes eflags.TF to be set in random guest code. The fix is simple; make nmi_pending synchronous using the standard vcpu->requests mechanism; this ensures the code above is completely synchronous wrt nmi_pending. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 4 +++- include/linux/kvm_host.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8575d85202d7..bd59e8ede88e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -360,8 +360,8 @@ void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) void kvm_inject_nmi(struct kvm_vcpu *vcpu) { + kvm_make_request(KVM_REQ_NMI, vcpu); kvm_make_request(KVM_REQ_EVENT, vcpu); - vcpu->arch.nmi_pending = 1; } EXPORT_SYMBOL_GPL(kvm_inject_nmi); @@ -5180,6 +5180,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) r = 1; goto out; } + if (kvm_check_request(KVM_REQ_NMI, vcpu)) + vcpu->arch.nmi_pending = true; } r = kvm_mmu_reload(vcpu); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3751ea0d1f92..ab428552af8e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -43,6 +43,7 @@ #define KVM_REQ_DEACTIVATE_FPU 10 #define KVM_REQ_EVENT 11 #define KVM_REQ_APF_HALT 12 +#define KVM_REQ_NMI 13 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 From bd3d1ec3d26b61120bb4f60b18ee99aa81839e6b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 3 Feb 2011 15:29:52 +0200 Subject: [PATCH 29/55] KVM: SVM: check for progress after IRET interception When we enable an NMI window, we ask for an IRET intercept, since the IRET re-enables NMIs. However, the IRET intercept happens before the instruction executes, while the NMI window architecturally opens afterwards. To compensate for this mismatch, we only open the NMI window in the following exit, assuming that the IRET has by then executed; however, this assumption is not always correct; we may exit due to a host interrupt or page fault, without having executed the instruction. Fix by checking for forward progress by recording and comparing the IRET's rip. This is somewhat of a hack, since an unchaging rip does not mean that no forward progress has been made, but is the simplest fix for now. Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index bfb49489566e..8d61df4a02c7 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -135,6 +135,8 @@ struct vcpu_svm { u32 *msrpm; + ulong nmi_iret_rip; + struct nested_state nested; bool nmi_singlestep; @@ -2653,6 +2655,7 @@ static int iret_interception(struct vcpu_svm *svm) ++svm->vcpu.stat.nmi_window_exits; clr_intercept(svm, INTERCEPT_IRET); svm->vcpu.arch.hflags |= HF_IRET_MASK; + svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu); return 1; } @@ -3474,7 +3477,12 @@ static void svm_complete_interrupts(struct vcpu_svm *svm) svm->int3_injected = 0; - if (svm->vcpu.arch.hflags & HF_IRET_MASK) { + /* + * If we've made progress since setting HF_IRET_MASK, we've + * executed an IRET and can allow NMI injection. + */ + if ((svm->vcpu.arch.hflags & HF_IRET_MASK) + && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) { svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); } From e935b8372cf8c63dc618a9f2b24ab360a225f1cd Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 8 Feb 2011 12:55:33 +0100 Subject: [PATCH 30/55] KVM: Convert kvm_lock to raw_spinlock Code under this lock requires non-preemptibility. Ensure this also over -rt by converting it to raw spinlock. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 4 ++-- arch/x86/kvm/x86.c | 4 ++-- virt/kvm/kvm_main.c | 36 ++++++++++++++++----------------- 4 files changed, 23 insertions(+), 23 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ffd7f8d29187..a58aebef5188 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -85,7 +85,7 @@ #define ASYNC_PF_PER_VCPU 64 -extern spinlock_t kvm_lock; +extern raw_spinlock_t kvm_lock; extern struct list_head vm_list; struct kvm_vcpu; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ccacf0b1b540..b6a9963400a7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3587,7 +3587,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) if (nr_to_scan == 0) goto out; - spin_lock(&kvm_lock); + raw_spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { int idx, freed_pages; @@ -3610,7 +3610,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) if (kvm_freed) list_move_tail(&kvm_freed->vm_list, &vm_list); - spin_unlock(&kvm_lock); + raw_spin_unlock(&kvm_lock); out: return percpu_counter_read_positive(&kvm_total_used_mmu_pages); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bd59e8ede88e..d9855b8584cf 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4557,7 +4557,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); - spin_lock(&kvm_lock); + raw_spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu->cpu != freq->cpu) @@ -4567,7 +4567,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va send_ipi = 1; } } - spin_unlock(&kvm_lock); + raw_spin_unlock(&kvm_lock); if (freq->old < freq->new && send_ipi) { /* diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2dc53a6dc285..1fa0d292119a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -69,7 +69,7 @@ MODULE_LICENSE("GPL"); * kvm->lock --> kvm->slots_lock --> kvm->irq_lock */ -DEFINE_SPINLOCK(kvm_lock); +DEFINE_RAW_SPINLOCK(kvm_lock); LIST_HEAD(vm_list); static cpumask_var_t cpus_hardware_enabled; @@ -481,9 +481,9 @@ static struct kvm *kvm_create_vm(void) mutex_init(&kvm->irq_lock); mutex_init(&kvm->slots_lock); atomic_set(&kvm->users_count, 1); - spin_lock(&kvm_lock); + raw_spin_lock(&kvm_lock); list_add(&kvm->vm_list, &vm_list); - spin_unlock(&kvm_lock); + raw_spin_unlock(&kvm_lock); return kvm; @@ -556,9 +556,9 @@ static void kvm_destroy_vm(struct kvm *kvm) struct mm_struct *mm = kvm->mm; kvm_arch_sync_events(kvm); - spin_lock(&kvm_lock); + raw_spin_lock(&kvm_lock); list_del(&kvm->vm_list); - spin_unlock(&kvm_lock); + raw_spin_unlock(&kvm_lock); kvm_free_irq_routing(kvm); for (i = 0; i < KVM_NR_BUSES; i++) kvm_io_bus_destroy(kvm->buses[i]); @@ -2177,9 +2177,9 @@ static void hardware_enable_nolock(void *junk) static void hardware_enable(void *junk) { - spin_lock(&kvm_lock); + raw_spin_lock(&kvm_lock); hardware_enable_nolock(junk); - spin_unlock(&kvm_lock); + raw_spin_unlock(&kvm_lock); } static void hardware_disable_nolock(void *junk) @@ -2194,9 +2194,9 @@ static void hardware_disable_nolock(void *junk) static void hardware_disable(void *junk) { - spin_lock(&kvm_lock); + raw_spin_lock(&kvm_lock); hardware_disable_nolock(junk); - spin_unlock(&kvm_lock); + raw_spin_unlock(&kvm_lock); } static void hardware_disable_all_nolock(void) @@ -2210,16 +2210,16 @@ static void hardware_disable_all_nolock(void) static void hardware_disable_all(void) { - spin_lock(&kvm_lock); + raw_spin_lock(&kvm_lock); hardware_disable_all_nolock(); - spin_unlock(&kvm_lock); + raw_spin_unlock(&kvm_lock); } static int hardware_enable_all(void) { int r = 0; - spin_lock(&kvm_lock); + raw_spin_lock(&kvm_lock); kvm_usage_count++; if (kvm_usage_count == 1) { @@ -2232,7 +2232,7 @@ static int hardware_enable_all(void) } } - spin_unlock(&kvm_lock); + raw_spin_unlock(&kvm_lock); return r; } @@ -2394,10 +2394,10 @@ static int vm_stat_get(void *_offset, u64 *val) struct kvm *kvm; *val = 0; - spin_lock(&kvm_lock); + raw_spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) *val += *(u32 *)((void *)kvm + offset); - spin_unlock(&kvm_lock); + raw_spin_unlock(&kvm_lock); return 0; } @@ -2411,12 +2411,12 @@ static int vcpu_stat_get(void *_offset, u64 *val) int i; *val = 0; - spin_lock(&kvm_lock); + raw_spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) kvm_for_each_vcpu(i, vcpu, kvm) *val += *(u32 *)((void *)vcpu + offset); - spin_unlock(&kvm_lock); + raw_spin_unlock(&kvm_lock); return 0; } @@ -2457,7 +2457,7 @@ static int kvm_suspend(struct sys_device *dev, pm_message_t state) static int kvm_resume(struct sys_device *dev) { if (kvm_usage_count) { - WARN_ON(spin_is_locked(&kvm_lock)); + WARN_ON(raw_spin_is_locked(&kvm_lock)); hardware_enable_nolock(NULL); } return 0; From 23f3e99132aa5c5f887648c50597d8f2b5800417 Mon Sep 17 00:00:00 2001 From: Joseph Cihula Date: Tue, 8 Feb 2011 11:45:56 -0800 Subject: [PATCH 31/55] KVM: VMX: fix detection of BIOS disabling VMX This patch fixes the logic used to detect whether BIOS has disabled VMX, for the case where VMX is enabled only under SMX, but tboot is not active. Signed-off-by: Joseph Cihula Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ae4f02d47091..f76137ce71c6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1333,19 +1333,25 @@ static __init int vmx_disabled_by_bios(void) rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); if (msr & FEATURE_CONTROL_LOCKED) { + /* launched w/ TXT and VMX disabled */ if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) && tboot_enabled()) return 1; + /* launched w/o TXT and VMX only enabled w/ TXT */ if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) + && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) && !tboot_enabled()) { printk(KERN_WARNING "kvm: disable TXT in the BIOS or " - " activate TXT before enabling KVM\n"); + "activate TXT before enabling KVM\n"); return 1; } + /* launched w/o TXT and VMX disabled */ + if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) + && !tboot_enabled()) + return 1; } return 0; - /* locked but not enabled */ } static void kvm_cpu_vmxon(u64 addr) From 7049467b5383432e178cc4124e53aebd83638916 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Wed, 9 Feb 2011 12:09:46 +0200 Subject: [PATCH 32/55] KVM: remove isr_ack logic from PIC isr_ack logic was added by e48258009d to avoid unnecessary IPIs. Back then it made sense, but now the code checks that vcpu is ready to accept interrupt before sending IPI, so this logic is no longer needed. The patch removes it. Fixes a regression with Debian/Hurd. Signed-off-by: Gleb Natapov Reported-and-tested-by: Jonathan Nieder Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 25 ++----------------------- arch/x86/kvm/x86.c | 4 ---- 2 files changed, 2 insertions(+), 27 deletions(-) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 3cece05e4ac4..19fe855e7953 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -61,9 +61,6 @@ static void pic_unlock(struct kvm_pic *s) } } - if (!found) - found = s->kvm->bsp_vcpu; - if (!found) return; @@ -75,7 +72,6 @@ static void pic_unlock(struct kvm_pic *s) static void pic_clear_isr(struct kvm_kpic_state *s, int irq) { s->isr &= ~(1 << irq); - s->isr_ack |= (1 << irq); if (s != &s->pics_state->pics[0]) irq += 8; /* @@ -89,16 +85,6 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq) pic_lock(s->pics_state); } -void kvm_pic_clear_isr_ack(struct kvm *kvm) -{ - struct kvm_pic *s = pic_irqchip(kvm); - - pic_lock(s); - s->pics[0].isr_ack = 0xff; - s->pics[1].isr_ack = 0xff; - pic_unlock(s); -} - /* * set irq level. If an edge is detected, then the IRR is set to 1 */ @@ -281,7 +267,6 @@ void kvm_pic_reset(struct kvm_kpic_state *s) s->irr = 0; s->imr = 0; s->isr = 0; - s->isr_ack = 0xff; s->priority_add = 0; s->irq_base = 0; s->read_reg_select = 0; @@ -545,15 +530,11 @@ static int picdev_read(struct kvm_io_device *this, */ static void pic_irq_request(struct kvm *kvm, int level) { - struct kvm_vcpu *vcpu = kvm->bsp_vcpu; struct kvm_pic *s = pic_irqchip(kvm); - int irq = pic_get_irq(&s->pics[0]); - s->output = level; - if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { - s->pics[0].isr_ack &= ~(1 << irq); + if (!s->output) s->wakeup_needed = true; - } + s->output = level; } static const struct kvm_io_device_ops picdev_ops = { @@ -575,8 +556,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) s->pics[1].elcr_mask = 0xde; s->pics[0].pics_state = s; s->pics[1].pics_state = s; - s->pics[0].isr_ack = 0xff; - s->pics[1].isr_ack = 0xff; /* * Initialize PIO device diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d9855b8584cf..9000829d06cb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2669,8 +2669,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.interrupt.pending = events->interrupt.injected; vcpu->arch.interrupt.nr = events->interrupt.nr; vcpu->arch.interrupt.soft = events->interrupt.soft; - if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm)) - kvm_pic_clear_isr_ack(vcpu->kvm); if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) kvm_x86_ops->set_interrupt_shadow(vcpu, events->interrupt.shadow); @@ -5621,8 +5619,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, if (pending_vec < max_bits) { kvm_queue_interrupt(vcpu, pending_vec, false); pr_debug("Set back pending irq %d\n", pending_vec); - if (irqchip_in_kernel(vcpu->kvm)) - kvm_pic_clear_isr_ack(vcpu->kvm); } kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); From 038f8c110eace38d7598e271835ae96ad04a3a26 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Fri, 4 Feb 2011 10:49:11 +0100 Subject: [PATCH 33/55] KVM: x86: Convert tsc_write_lock to raw_spinlock Code under this lock requires non-preemptibility. Ensure this also over -rt by converting it to raw spinlock. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/x86.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a58aebef5188..37bd730ff852 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -448,7 +448,7 @@ struct kvm_arch { unsigned long irq_sources_bitmap; s64 kvmclock_offset; - spinlock_t tsc_write_lock; + raw_spinlock_t tsc_write_lock; u64 last_tsc_nsec; u64 last_tsc_offset; u64 last_tsc_write; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9000829d06cb..17af71da63ad 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1017,7 +1017,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) unsigned long flags; s64 sdiff; - spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); + raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = data - native_read_tsc(); ns = get_kernel_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; @@ -1050,7 +1050,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) kvm->arch.last_tsc_write = data; kvm->arch.last_tsc_offset = offset; kvm_x86_ops->write_tsc_offset(vcpu, offset); - spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); + raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); /* Reset of TSC must disable overshoot protection below */ vcpu->arch.hv_clock.tsc_timestamp = 0; @@ -6004,7 +6004,7 @@ int kvm_arch_init_vm(struct kvm *kvm) /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); - spin_lock_init(&kvm->arch.tsc_write_lock); + raw_spin_lock_init(&kvm->arch.tsc_write_lock); return 0; } From a3b5ba49a8c58d9a578e016523b047467a41e047 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 11 Feb 2011 14:29:40 +0800 Subject: [PATCH 34/55] KVM: VMX: add the __noclone attribute to vmx_vcpu_run The changelog of 104f226 said "adds the __noclone attribute", but it was missing in its patch. I think it is still needed. Signed-off-by: Lai Jiangshan Acked-by: Andi Kleen Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f76137ce71c6..a61ed93a1f25 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3999,7 +3999,7 @@ static void vmx_cancel_injection(struct kvm_vcpu *vcpu) #define Q "l" #endif -static void vmx_vcpu_run(struct kvm_vcpu *vcpu) +static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); From a8ba6c2622036101d0c6a195f97546bcb1a056ab Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 21 Feb 2011 12:07:58 +0200 Subject: [PATCH 35/55] KVM: VMX: update live TR selector if it changes in real mode When rmode.vm86 is active TR descriptor is updated with vm86 task values, but selector is left intact. vmx_set_segment() makes sure that if TR register is written into while vm86 is active the new values are saved for use after vm86 is deactivated, but since selector is not updated on vm86 activation/deactivation new value is lost. Fix this by writing new selector into vmcs immediately. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a61ed93a1f25..dafb67eddd60 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2138,6 +2138,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, u32 ar; if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { + vmcs_write16(sf->selector, var->selector); vmx->rmode.tr.selector = var->selector; vmx->rmode.tr.base = var->base; vmx->rmode.tr.limit = var->limit; From 93ea5388ead5d7b87f54b8de53e35231acec8bbe Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 21 Feb 2011 12:07:59 +0200 Subject: [PATCH 36/55] KVM: VMX: Initialize vm86 TSS only once. Currently vm86 task is initialized on each real mode entry and vcpu reset. Initialization is done by zeroing TSS and updating relevant fields. But since all vcpus are using the same TSS there is a race where one vcpu may use TSS while other vcpu is initializing it, so the vcpu that uses TSS will see wrong TSS content and will behave incorrectly. Fix that by initializing TSS only once. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index dafb67eddd60..e2b8c6b21ff2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -176,7 +176,6 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) return container_of(vcpu, struct vcpu_vmx, vcpu); } -static int init_rmode(struct kvm *kvm); static u64 construct_eptp(unsigned long root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); @@ -1802,7 +1801,6 @@ static void enter_rmode(struct kvm_vcpu *vcpu) continue_rmode: kvm_mmu_reset_context(vcpu); - init_rmode(vcpu->kvm); } static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) @@ -2737,22 +2735,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) return 0; } -static int init_rmode(struct kvm *kvm) -{ - int idx, ret = 0; - - idx = srcu_read_lock(&kvm->srcu); - if (!init_rmode_tss(kvm)) - goto exit; - if (!init_rmode_identity_map(kvm)) - goto exit; - - ret = 1; -exit: - srcu_read_unlock(&kvm->srcu, idx); - return ret; -} - static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -2760,10 +2742,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) int ret; vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); - if (!init_rmode(vmx->vcpu.kvm)) { - ret = -ENOMEM; - goto out; - } vmx->rmode.vm86_active = 0; @@ -3009,6 +2987,9 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) if (ret) return ret; kvm->arch.tss_addr = addr; + if (!init_rmode_tss(kvm)) + return -ENOMEM; + return 0; } @@ -4224,8 +4205,11 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (!kvm->arch.ept_identity_map_addr) kvm->arch.ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; + err = -ENOMEM; if (alloc_identity_pagetable(kvm) != 0) goto free_vmcs; + if (!init_rmode_identity_map(kvm)) + goto free_vmcs; } return &vmx->vcpu; From d170c4190630bcbeb5db266e79ad7a174902e5de Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 21 Feb 2011 11:21:30 +0800 Subject: [PATCH 37/55] KVM: Clear async page fault hash after switching to real mode The hash array of async gfns may still contain some left gfns after kvm_clear_async_pf_completion_queue() called, need to clear them. Signed-off-by: Lai Jiangshan Acked-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 17af71da63ad..dae2d15c49eb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -525,8 +525,10 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) kvm_x86_ops->set_cr0(vcpu, cr0); - if ((cr0 ^ old_cr0) & X86_CR0_PG) + if ((cr0 ^ old_cr0) & X86_CR0_PG) { kvm_clear_async_pf_completion_queue(vcpu); + kvm_async_pf_hash_reset(vcpu); + } if ((cr0 ^ old_cr0) & update_bits) kvm_mmu_reset_context(vcpu); From 1260edbe7de85ccc58f60040f46034831069bfa2 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 21 Feb 2011 11:51:35 +0800 Subject: [PATCH 38/55] KVM: better readability of efer_reserved_bits use EFER_SCE, EFER_LME and EFER_LMA instead of magic numbers. Signed-off-by: Lai Jiangshan Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index dae2d15c49eb..785ae0c300f7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -81,9 +81,10 @@ * - enable LME and LMA per default on 64 bit KVM */ #ifdef CONFIG_X86_64 -static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL; +static +u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA)); #else -static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL; +static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); #endif #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM From 38a778aa1899b9c6cf3e300a02782c9b78c547ab Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 9 Feb 2011 15:11:28 +0100 Subject: [PATCH 39/55] KVM: Start lock documentation The goal of this document shall be - overview of all locks used in KVM core - provide details on the scope of each lock - explain the lock type, specifically of a raw spin locks - provide a lock ordering guide Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- Documentation/kvm/locking.txt | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 Documentation/kvm/locking.txt diff --git a/Documentation/kvm/locking.txt b/Documentation/kvm/locking.txt new file mode 100644 index 000000000000..3b4cd3bf5631 --- /dev/null +++ b/Documentation/kvm/locking.txt @@ -0,0 +1,25 @@ +KVM Lock Overview +================= + +1. Acquisition Orders +--------------------- + +(to be written) + +2. Reference +------------ + +Name: kvm_lock +Type: raw_spinlock +Arch: any +Protects: - vm_list + - hardware virtualization enable/disable +Comment: 'raw' because hardware enabling/disabling must be atomic /wrt + migration. + +Name: kvm_arch::tsc_write_lock +Type: raw_spinlock +Arch: x86 +Protects: - kvm_arch::{last_tsc_write,last_tsc_nsec,last_tsc_offset} + - tsc offset in vmcb +Comment: 'raw' because updating the tsc offsets must not be preempted. From da8dc75f0c4663a568a0ed4e744c6d55d428d8b6 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 4 Mar 2011 18:56:41 +0800 Subject: [PATCH 40/55] KVM: MMU: fix kvm_mmu_slot_remove_write_access dropping intermediate W bits Only remove write access in the last sptes. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b6a9963400a7..b9bf016d7f0f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3540,12 +3540,17 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) pt = sp->spt; for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - if (sp->role.level != PT_PAGE_TABLE_LEVEL - && is_large_pte(pt[i])) { + if (!is_shadow_present_pte(pt[i]) || + !is_last_spte(pt[i], sp->role.level)) + continue; + + if (is_large_pte(pt[i])) { drop_spte(kvm, &pt[i], shadow_trap_nonpresent_pte); --kvm->stat.lpages; + continue; } + /* avoid RMW */ if (is_writable_pte(pt[i])) update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK); From 1b7fd45c32fcc170246bf10ba8d33871840319b8 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 4 Mar 2011 18:58:02 +0800 Subject: [PATCH 41/55] KVM: MMU: set spte accessed bit properly Set spte accessed bit only if guest_initiated == 1 that means the really accessed Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b9bf016d7f0f..d5455b1b725f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3307,11 +3307,11 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, spin_lock(&vcpu->kvm->mmu_lock); if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter) gentry = 0; - kvm_mmu_access_page(vcpu, gfn); kvm_mmu_free_some_pages(vcpu); ++vcpu->kvm->stat.mmu_pte_write; trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); if (guest_initiated) { + kvm_mmu_access_page(vcpu, gfn); if (gfn == vcpu->arch.last_pt_write_gfn && !last_updated_pte_accessed(vcpu)) { ++vcpu->arch.last_pt_write_count; From 48c0e4e9067eb085e7b300628c9640df071aac55 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 4 Mar 2011 18:59:21 +0800 Subject: [PATCH 42/55] KVM: MMU: move mmu pages calculated out of mmu lock kvm_mmu_calculate_mmu_pages need to walk all memslots and it's protected by kvm->slots_lock, so move it out of mmu spinlock Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 785ae0c300f7..b9c2b8e6c70c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6105,7 +6105,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, int user_alloc) { - int npages = mem->memory_size >> PAGE_SHIFT; + int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT; if (!user_alloc && !old.user_alloc && old.rmap && !npages) { int ret; @@ -6120,12 +6120,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, "failed to munmap memory\n"); } - spin_lock(&kvm->mmu_lock); - if (!kvm->arch.n_requested_mmu_pages) { - unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); - kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); - } + if (!kvm->arch.n_requested_mmu_pages) + nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); + spin_lock(&kvm->mmu_lock); + if (nr_mmu_pages) + kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); kvm_mmu_slot_remove_write_access(kvm, mem->slot); spin_unlock(&kvm->mmu_lock); } From 49b26e26e4b7b94753b39f7edb0c34f3d1c4c167 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 4 Mar 2011 19:00:00 +0800 Subject: [PATCH 43/55] KVM: MMU: do not record gfn in kvm_mmu_pte_write No need to record the gfn to verifier the pte has the same mode as current vcpu, it's because we only speculatively update the pte only if the pte and vcpu have the same mode Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/mmu.c | 6 ++---- arch/x86/kvm/paging_tmpl.h | 2 -- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 37bd730ff852..f08314f303e0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -336,7 +336,6 @@ struct kvm_vcpu_arch { gfn_t last_pte_gfn; struct { - gfn_t gfn; /* presumed gfn during guest pte update */ pfn_t pfn; /* pfn corresponding to that gfn */ unsigned long mmu_seq; } update_pte; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d5455b1b725f..91a194667432 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3228,7 +3228,6 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, kvm_release_pfn_clean(pfn); return; } - vcpu->arch.update_pte.gfn = gfn; vcpu->arch.update_pte.pfn = pfn; } @@ -3275,9 +3274,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, /* * Assume that the pte write on a page table of the same type - * as the current vcpu paging mode. This is nearly always true - * (might be false while changing modes). Note it is verified later - * by update_pte(). + * as the current vcpu paging mode since we update the sptes only + * when they have the same mode. */ if ((is_pae(vcpu) && bytes == 4) || !new) { /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 6bccc24c4181..b3862eeabb8a 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -339,8 +339,6 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); - if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) - return; pfn = vcpu->arch.update_pte.pfn; if (is_error_pfn(pfn)) return; From 842f22ed9b3c545e7c53adbdea0d82efdcd1aa7f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 4 Mar 2011 19:01:10 +0800 Subject: [PATCH 44/55] KVM: MMU: cleanup page alloc and free Using __get_free_page instead of alloc_page and page_address, using free_page instead of __free_page and virt_to_page Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 91a194667432..52645e342db1 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -379,15 +379,15 @@ static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc, static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, int min) { - struct page *page; + void *page; if (cache->nobjs >= min) return 0; while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - page = alloc_page(GFP_KERNEL); + page = (void *)__get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; - cache->objects[cache->nobjs++] = page_address(page); + cache->objects[cache->nobjs++] = page; } return 0; } @@ -1032,9 +1032,9 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) ASSERT(is_empty_shadow_page(sp->spt)); hlist_del(&sp->hash_link); list_del(&sp->link); - __free_page(virt_to_page(sp->spt)); + free_page((unsigned long)sp->spt); if (!sp->role.direct) - __free_page(virt_to_page(sp->gfns)); + free_page((unsigned long)sp->gfns); kmem_cache_free(mmu_page_header_cache, sp); kvm_mod_used_mmu_pages(kvm, -1); } From 676646ee4b0931a1ef54389786983a43eb913a6f Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Fri, 4 Mar 2011 19:01:39 +0800 Subject: [PATCH 45/55] KVM: MMU: remove unused macros These macros are not used, so removed Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 5 ----- arch/x86/kvm/paging_tmpl.h | 3 --- 2 files changed, 8 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 52645e342db1..88d36890f420 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -111,9 +111,6 @@ module_param(oos_shadow, bool, 0644); #define PT64_LEVEL_SHIFT(level) \ (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) -#define PT64_LEVEL_MASK(level) \ - (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level)) - #define PT64_INDEX(address, level)\ (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) @@ -123,8 +120,6 @@ module_param(oos_shadow, bool, 0644); #define PT32_LEVEL_SHIFT(level) \ (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS) -#define PT32_LEVEL_MASK(level) \ - (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level)) #define PT32_LVL_OFFSET_MASK(level) \ (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ * PT32_LEVEL_BITS))) - 1)) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index b3862eeabb8a..86eb8160bcb9 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -31,7 +31,6 @@ #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) #define PT_INDEX(addr, level) PT64_INDEX(addr, level) - #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) #define PT_LEVEL_BITS PT64_LEVEL_BITS #ifdef CONFIG_X86_64 #define PT_MAX_FULL_LEVELS 4 @@ -48,7 +47,6 @@ #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl) #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl) #define PT_INDEX(addr, level) PT32_INDEX(addr, level) - #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) #define PT_LEVEL_BITS PT32_LEVEL_BITS #define PT_MAX_FULL_LEVELS 2 #define CMPXCHG cmpxchg @@ -827,7 +825,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) #undef FNAME #undef PT_BASE_ADDR_MASK #undef PT_INDEX -#undef PT_LEVEL_MASK #undef PT_LVL_ADDR_MASK #undef PT_LVL_OFFSET_MASK #undef PT_LEVEL_BITS From c8ce057eafd49da6a7afe7791bd84163f65f6132 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 6 Mar 2011 13:03:26 +0200 Subject: [PATCH 46/55] KVM: improve comment on rcu use in irqfd_deassign The RCU use in kvm_irqfd_deassign is tricky: we have rcu_assign_pointer but no synchronize_rcu: synchronize_rcu is done by kvm_irq_routing_update which we share a spinlock with. Fix up a comment in an attempt to make this clearer. Signed-off-by: Michael S. Tsirkin Signed-off-by: Avi Kivity --- virt/kvm/eventfd.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 2ca4535f4fb7..3656849f78a0 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -313,8 +313,9 @@ kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi) if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) { /* * This rcu_assign_pointer is needed for when - * another thread calls kvm_irqfd_update before - * we flush workqueue below. + * another thread calls kvm_irq_routing_update before + * we flush workqueue below (we synchronize with + * kvm_irq_routing_update using irqfds.lock). * It is paired with synchronize_rcu done by caller * of that function. */ From afc20184b7d24693a934f652e135598a497c9448 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Sat, 5 Mar 2011 12:40:20 +0900 Subject: [PATCH 47/55] KVM: x86: Remove useless regs_page pointer from kvm_lapic Access to this page is mostly done through the regs member which holds the address to this page. The exceptions are in vmx_vcpu_reset() and kvm_free_lapic() and these both can easily be converted to using regs. Signed-off-by: Takuya Yoshikawa Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 9 ++++----- arch/x86/kvm/lapic.h | 1 - arch/x86/kvm/vmx.c | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0171e66e9994..2b2255b1f04b 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -871,8 +871,8 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer); - if (vcpu->arch.apic->regs_page) - __free_page(vcpu->arch.apic->regs_page); + if (vcpu->arch.apic->regs) + free_page((unsigned long)vcpu->arch.apic->regs); kfree(vcpu->arch.apic); } @@ -1061,13 +1061,12 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) vcpu->arch.apic = apic; - apic->regs_page = alloc_page(GFP_KERNEL|__GFP_ZERO); - if (apic->regs_page == NULL) { + apic->regs = (void *)get_zeroed_page(GFP_KERNEL); + if (!apic->regs) { printk(KERN_ERR "malloc apic regs error for vcpu %x\n", vcpu->vcpu_id); goto nomem_free_apic; } - apic->regs = page_address(apic->regs_page); apic->vcpu = vcpu; hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index f5fe32c5edad..52c9e6b9e725 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -13,7 +13,6 @@ struct kvm_lapic { u32 divide_count; struct kvm_vcpu *vcpu; bool irr_pending; - struct page *regs_page; void *regs; gpa_t vapic_addr; struct page *vapic_page; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e2b8c6b21ff2..3febb763cb7f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2821,7 +2821,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); if (vm_need_tpr_shadow(vmx->vcpu.kvm)) vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, - page_to_phys(vmx->vcpu.arch.apic->regs_page)); + __pa(vmx->vcpu.arch.apic->regs)); vmcs_write32(TPR_THRESHOLD, 0); } From 831ca6093ca486060721f5c3c74f97b10f3172b9 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 8 Mar 2011 16:09:51 +0200 Subject: [PATCH 48/55] KVM: SVM: Load %gs earlier if CONFIG_X86_32_LAZY_GS=n With CONFIG_CC_STACKPROTECTOR, we need a valid %gs at all times, so disable lazy reload and do an eager reload immediately after the vmexit. Reported-by: IVAN ANGELOV Acked-By: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8d61df4a02c7..6bb15d583e47 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1155,7 +1155,9 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); load_gs_index(svm->host.gs); #else +#ifdef CONFIG_X86_32_LAZY_GS loadsegment(gs, svm->host.gs); +#endif #endif for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); @@ -3649,6 +3651,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) wrmsrl(MSR_GS_BASE, svm->host.gs_base); #else loadsegment(fs, svm->host.fs); +#ifndef CONFIG_X86_32_LAZY_GS + loadsegment(gs, svm->host.gs); +#endif #endif reload_tss(vcpu); From 5601d05b8c340ee2643febc146099325eff187eb Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 7 Mar 2011 14:55:06 +0200 Subject: [PATCH 49/55] KVM: emulator: Fix io permission checking for 64bit guest Current implementation truncates upper 32bit of TR base address during IO permission bitmap check. The patch fixes this. Reported-and-tested-by: Francis Moreau Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_emulate.h | 4 ++-- arch/x86/kvm/emulate.c | 37 ++++++++++++++++++------------ arch/x86/kvm/x86.c | 15 ++++++++---- 3 files changed, 35 insertions(+), 21 deletions(-) diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 50ebc327a368..0f5213564326 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -142,9 +142,9 @@ struct x86_emulate_ops { int (*pio_out_emulated)(int size, unsigned short port, const void *val, unsigned int count, struct kvm_vcpu *vcpu); - bool (*get_cached_descriptor)(struct desc_struct *desc, + bool (*get_cached_descriptor)(struct desc_struct *desc, u32 *base3, int seg, struct kvm_vcpu *vcpu); - void (*set_cached_descriptor)(struct desc_struct *desc, + void (*set_cached_descriptor)(struct desc_struct *desc, u32 base3, int seg, struct kvm_vcpu *vcpu); u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu); void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a90d7e033304..d6088b8686fb 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -878,7 +878,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, if (selector & 1 << 2) { struct desc_struct desc; memset (dt, 0, sizeof *dt); - if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu)) + if (!ops->get_cached_descriptor(&desc, NULL, VCPU_SREG_LDTR, + ctxt->vcpu)) return; dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */ @@ -930,6 +931,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, return ret; } +/* Does not support long mode */ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, u16 selector, int seg) @@ -1041,7 +1043,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, } load: ops->set_segment_selector(selector, seg, ctxt->vcpu); - ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu); + ops->set_cached_descriptor(&seg_desc, 0, seg, ctxt->vcpu); return X86EMUL_CONTINUE; exception: emulate_exception(ctxt, err_vec, err_code, true); @@ -1561,7 +1563,7 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, struct desc_struct *ss) { memset(cs, 0, sizeof(struct desc_struct)); - ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu); + ops->get_cached_descriptor(cs, NULL, VCPU_SREG_CS, ctxt->vcpu); memset(ss, 0, sizeof(struct desc_struct)); cs->l = 0; /* will be adjusted later */ @@ -1608,9 +1610,9 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) cs.d = 0; cs.l = 1; } - ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); + ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu); ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); - ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); + ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu); ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); c->regs[VCPU_REGS_RCX] = c->eip; @@ -1680,9 +1682,9 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) cs.l = 1; } - ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); + ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu); ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); - ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); + ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu); ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); @@ -1737,9 +1739,9 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) cs_sel |= SELECTOR_RPL_MASK; ss_sel |= SELECTOR_RPL_MASK; - ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); + ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu); ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); - ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); + ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu); ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); c->eip = c->regs[VCPU_REGS_RDX]; @@ -1765,24 +1767,29 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, u16 port, u16 len) { struct desc_struct tr_seg; + u32 base3; int r; u16 io_bitmap_ptr; u8 perm, bit_idx = port & 0x7; unsigned mask = (1 << len) - 1; + unsigned long base; - ops->get_cached_descriptor(&tr_seg, VCPU_SREG_TR, ctxt->vcpu); + ops->get_cached_descriptor(&tr_seg, &base3, VCPU_SREG_TR, ctxt->vcpu); if (!tr_seg.p) return false; if (desc_limit_scaled(&tr_seg) < 103) return false; - r = ops->read_std(get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2, - ctxt->vcpu, NULL); + base = get_desc_base(&tr_seg); +#ifdef CONFIG_X86_64 + base |= ((u64)base3) << 32; +#endif + r = ops->read_std(base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, NULL); if (r != X86EMUL_CONTINUE) return false; if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg)) return false; - r = ops->read_std(get_desc_base(&tr_seg) + io_bitmap_ptr + port/8, - &perm, 1, ctxt->vcpu, NULL); + r = ops->read_std(base + io_bitmap_ptr + port/8, &perm, 1, ctxt->vcpu, + NULL); if (r != X86EMUL_CONTINUE) return false; if ((perm >> bit_idx) & mask) @@ -2127,7 +2134,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, } ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu); - ops->set_cached_descriptor(&next_tss_desc, VCPU_SREG_TR, ctxt->vcpu); + ops->set_cached_descriptor(&next_tss_desc, 0, VCPU_SREG_TR, ctxt->vcpu); ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu); if (has_error_code) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b9c2b8e6c70c..01f08a65d09b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4162,8 +4162,8 @@ static unsigned long emulator_get_cached_segment_base(int seg, return get_segment_base(vcpu, seg); } -static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, - struct kvm_vcpu *vcpu) +static bool emulator_get_cached_descriptor(struct desc_struct *desc, u32 *base3, + int seg, struct kvm_vcpu *vcpu) { struct kvm_segment var; @@ -4176,6 +4176,10 @@ static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, var.limit >>= 12; set_desc_limit(desc, var.limit); set_desc_base(desc, (unsigned long)var.base); +#ifdef CONFIG_X86_64 + if (base3) + *base3 = var.base >> 32; +#endif desc->type = var.type; desc->s = var.s; desc->dpl = var.dpl; @@ -4188,8 +4192,8 @@ static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, return true; } -static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg, - struct kvm_vcpu *vcpu) +static void emulator_set_cached_descriptor(struct desc_struct *desc, u32 base3, + int seg, struct kvm_vcpu *vcpu) { struct kvm_segment var; @@ -4197,6 +4201,9 @@ static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg, kvm_get_segment(vcpu, &var, seg); var.base = get_desc_base(desc); +#ifdef CONFIG_X86_64 + var.base |= ((u64)base3) << 32; +#endif var.limit = get_desc_limit(desc); if (desc->g) var.limit = (var.limit << 12) | 0xfff; From 399a40c92d744feebf89b1f07208407c26e0aaf5 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 7 Mar 2011 14:55:07 +0200 Subject: [PATCH 50/55] KVM: emulator: Fix permission checking in io permission bitmap Currently if io port + len crosses 8bit boundary in io permission bitmap the check may allow IO that otherwise should not be allowed. The patch fixes that. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/emulate.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d6088b8686fb..0ad47b819a8b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1769,8 +1769,7 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, struct desc_struct tr_seg; u32 base3; int r; - u16 io_bitmap_ptr; - u8 perm, bit_idx = port & 0x7; + u16 io_bitmap_ptr, perm, bit_idx = port & 0x7; unsigned mask = (1 << len) - 1; unsigned long base; @@ -1788,7 +1787,7 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, return false; if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg)) return false; - r = ops->read_std(base + io_bitmap_ptr + port/8, &perm, 1, ctxt->vcpu, + r = ops->read_std(base + io_bitmap_ptr + port/8, &perm, 2, ctxt->vcpu, NULL); if (r != X86EMUL_CONTINUE) return false; From 1aa8ceef0312a6aae7dd863a120a55f1637b361d Mon Sep 17 00:00:00 2001 From: Nikola Ciprich Date: Wed, 9 Mar 2011 23:36:51 +0100 Subject: [PATCH 51/55] KVM: fix kvmclock regression due to missing clock update commit 387b9f97750444728962b236987fbe8ee8cc4f8c moved kvm_request_guest_time_update(vcpu), breaking 32bit SMP guests using kvm-clock. Fix this by moving (new) clock update function to proper place. Signed-off-by: Nikola Ciprich Acked-by: Zachary Amsden Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 01f08a65d09b..f1e4025f1ae2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2127,8 +2127,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (check_tsc_unstable()) { kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta); vcpu->arch.tsc_catchup = 1; - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); } + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); if (vcpu->cpu != cpu) kvm_migrate_timers(vcpu); vcpu->cpu = cpu; From 40dcaa9f69fa303ddcb5ea65baf4efefed3aec4b Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Mar 2011 15:41:04 +0800 Subject: [PATCH 52/55] KVM: fix rcu usage in init_rmode_* functions fix: [ 3494.671786] stack backtrace: [ 3494.671789] Pid: 10527, comm: qemu-system-x86 Not tainted 2.6.38-rc6+ #23 [ 3494.671790] Call Trace: [ 3494.671796] [] ? lockdep_rcu_dereference+0x9d/0xa5 [ 3494.671826] [] ? kvm_memslots+0x6b/0x73 [kvm] [ 3494.671834] [] ? gfn_to_memslot+0x16/0x4f [kvm] [ 3494.671843] [] ? gfn_to_hva+0x16/0x27 [kvm] [ 3494.671851] [] ? kvm_write_guest_page+0x31/0x83 [kvm] [ 3494.671861] [] ? kvm_clear_guest_page+0x1a/0x1c [kvm] [ 3494.671867] [] ? vmx_set_tss_addr+0x83/0x122 [kvm_intel] and: [ 8328.789599] stack backtrace: [ 8328.789601] Pid: 18736, comm: qemu-system-x86 Not tainted 2.6.38-rc6+ #23 [ 8328.789603] Call Trace: [ 8328.789609] [] ? lockdep_rcu_dereference+0x9d/0xa5 [ 8328.789621] [] ? kvm_memslots+0x6b/0x73 [kvm] [ 8328.789628] [] ? gfn_to_memslot+0x16/0x4f [kvm] [ 8328.789635] [] ? gfn_to_hva+0x16/0x27 [kvm] [ 8328.789643] [] ? kvm_write_guest_page+0x31/0x83 [kvm] [ 8328.789699] [] ? kvm_clear_guest_page+0x1a/0x1c [kvm] [ 8328.789713] [] ? vmx_create_vcpu+0x316/0x3c8 [kvm_intel] Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3febb763cb7f..d8475a26add6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2397,11 +2397,12 @@ static bool guest_state_valid(struct kvm_vcpu *vcpu) static int init_rmode_tss(struct kvm *kvm) { - gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; + gfn_t fn; u16 data = 0; - int ret = 0; - int r; + int r, idx, ret = 0; + idx = srcu_read_lock(&kvm->srcu); + fn = rmode_tss_base(kvm) >> PAGE_SHIFT; r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); if (r < 0) goto out; @@ -2425,12 +2426,13 @@ static int init_rmode_tss(struct kvm *kvm) ret = 1; out: + srcu_read_unlock(&kvm->srcu, idx); return ret; } static int init_rmode_identity_map(struct kvm *kvm) { - int i, r, ret; + int i, idx, r, ret; pfn_t identity_map_pfn; u32 tmp; @@ -2445,6 +2447,7 @@ static int init_rmode_identity_map(struct kvm *kvm) return 1; ret = 0; identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; + idx = srcu_read_lock(&kvm->srcu); r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); if (r < 0) goto out; @@ -2460,6 +2463,7 @@ static int init_rmode_identity_map(struct kvm *kvm) kvm->arch.ept_identity_pagetable_done = true; ret = 1; out: + srcu_read_unlock(&kvm->srcu, idx); return ret; } From 5d163b1c9d6e5562dcdfd6cd471f237f1502bb35 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Mar 2011 15:43:00 +0800 Subject: [PATCH 53/55] KVM: MMU: introduce a common function to get no-dirty-logged slot Cleanup the code of pte_prefetch_gfn_to_memslot and mapping_level_dirty_bitmap Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 88d36890f420..83171fdd38a7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -549,13 +549,23 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn) return ret; } -static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) +static struct kvm_memory_slot * +gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, + bool no_dirty_log) { struct kvm_memory_slot *slot; - slot = gfn_to_memslot(vcpu->kvm, large_gfn); - if (slot && slot->dirty_bitmap) - return true; - return false; + + slot = gfn_to_memslot(vcpu->kvm, gfn); + if (!slot || slot->flags & KVM_MEMSLOT_INVALID || + (no_dirty_log && slot->dirty_bitmap)) + slot = NULL; + + return slot; +} + +static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) +{ + return gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true); } static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) @@ -2145,26 +2155,13 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) { } -static struct kvm_memory_slot * -pte_prefetch_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) -{ - struct kvm_memory_slot *slot; - - slot = gfn_to_memslot(vcpu->kvm, gfn); - if (!slot || slot->flags & KVM_MEMSLOT_INVALID || - (no_dirty_log && slot->dirty_bitmap)) - slot = NULL; - - return slot; -} - static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) { struct kvm_memory_slot *slot; unsigned long hva; - slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, no_dirty_log); + slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log); if (!slot) { get_page(bad_page); return page_to_pfn(bad_page); @@ -2185,7 +2182,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, gfn_t gfn; gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); - if (!pte_prefetch_gfn_to_memslot(vcpu, gfn, access & ACC_WRITE_MASK)) + if (!gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK)) return -1; ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start); From 0f53b5b1c0baae4f949ac0721a55b7a2158dda01 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Mar 2011 15:43:51 +0800 Subject: [PATCH 54/55] KVM: MMU: cleanup pte write path This patch does: - call vcpu->arch.mmu.update_pte directly - use gfn_to_pfn_atomic in update_pte path The suggestion is from Avi. Signed-off-by: Xiao Guangrong Signed-off-by: Avi Kivity --- arch/x86/include/asm/kvm_host.h | 7 +--- arch/x86/kvm/mmu.c | 69 +++++++++++---------------------- arch/x86/kvm/paging_tmpl.h | 12 +++--- 3 files changed, 32 insertions(+), 56 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f08314f303e0..c8af0991fdf0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -255,6 +255,8 @@ struct kvm_mmu { int (*sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp); void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); + void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + u64 *spte, const void *pte, unsigned long mmu_seq); hpa_t root_hpa; int root_level; int shadow_root_level; @@ -335,11 +337,6 @@ struct kvm_vcpu_arch { u64 *last_pte_updated; gfn_t last_pte_gfn; - struct { - pfn_t pfn; /* pfn corresponding to that gfn */ - unsigned long mmu_seq; - } update_pte; - struct fpu guest_fpu; u64 xcr0; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 83171fdd38a7..22fae7593ee7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1204,6 +1204,13 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { } +static void nonpaging_update_pte(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp, u64 *spte, + const void *pte, unsigned long mmu_seq) +{ + WARN_ON(1); +} + #define KVM_PAGE_ARRAY_NR 16 struct kvm_mmu_pages { @@ -2796,6 +2803,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu, context->prefetch_page = nonpaging_prefetch_page; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; + context->update_pte = nonpaging_update_pte; context->root_level = 0; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; @@ -2925,6 +2933,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, context->prefetch_page = paging64_prefetch_page; context->sync_page = paging64_sync_page; context->invlpg = paging64_invlpg; + context->update_pte = paging64_update_pte; context->free = paging_free; context->root_level = level; context->shadow_root_level = level; @@ -2953,6 +2962,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu, context->prefetch_page = paging32_prefetch_page; context->sync_page = paging32_sync_page; context->invlpg = paging32_invlpg; + context->update_pte = paging32_update_pte; context->root_level = PT32_ROOT_LEVEL; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; @@ -2977,6 +2987,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->prefetch_page = nonpaging_prefetch_page; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; + context->update_pte = nonpaging_update_pte; context->shadow_root_level = kvm_x86_ops->get_tdp_level(); context->root_hpa = INVALID_PAGE; context->direct_map = true; @@ -3081,8 +3092,6 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) static int init_kvm_mmu(struct kvm_vcpu *vcpu) { - vcpu->arch.update_pte.pfn = bad_pfn; - if (mmu_is_nested(vcpu)) return init_kvm_nested_mmu(vcpu); else if (tdp_enabled) @@ -3156,7 +3165,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, - const void *new) + const void *new, unsigned long mmu_seq) { if (sp->role.level != PT_PAGE_TABLE_LEVEL) { ++vcpu->kvm->stat.mmu_pde_zapped; @@ -3164,10 +3173,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, } ++vcpu->kvm->stat.mmu_pte_updated; - if (!sp->role.cr4_pae) - paging32_update_pte(vcpu, sp, spte, new); - else - paging64_update_pte(vcpu, sp, spte, new); + vcpu->arch.mmu.update_pte(vcpu, sp, spte, new, mmu_seq); } static bool need_remote_flush(u64 old, u64 new) @@ -3202,27 +3208,6 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu) return !!(spte && (*spte & shadow_accessed_mask)); } -static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - u64 gpte) -{ - gfn_t gfn; - pfn_t pfn; - - if (!is_present_gpte(gpte)) - return; - gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; - - vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; - smp_rmb(); - pfn = gfn_to_pfn(vcpu->kvm, gfn); - - if (is_error_pfn(pfn)) { - kvm_release_pfn_clean(pfn); - return; - } - vcpu->arch.update_pte.pfn = pfn; -} - static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn) { u64 *spte = vcpu->arch.last_pte_updated; @@ -3244,21 +3229,14 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_mmu_page *sp; struct hlist_node *node; LIST_HEAD(invalid_list); - u64 entry, gentry; - u64 *spte; - unsigned offset = offset_in_page(gpa); - unsigned pte_size; - unsigned page_offset; - unsigned misaligned; - unsigned quadrant; - int level; - int flooded = 0; - int npte; - int r; - int invlpg_counter; + unsigned long mmu_seq; + u64 entry, gentry, *spte; + unsigned pte_size, page_offset, misaligned, quadrant, offset; + int level, npte, invlpg_counter, r, flooded = 0; bool remote_flush, local_flush, zap_page; zap_page = remote_flush = local_flush = false; + offset = offset_in_page(gpa); pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); @@ -3293,7 +3271,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, break; } - mmu_guess_page_from_pte_write(vcpu, gpa, gentry); + mmu_seq = vcpu->kvm->mmu_notifier_seq; + smp_rmb(); + spin_lock(&vcpu->kvm->mmu_lock); if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter) gentry = 0; @@ -3365,7 +3345,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, if (gentry && !((sp->role.word ^ vcpu->arch.mmu.base_role.word) & mask.word)) - mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); + mmu_pte_write_new_pte(vcpu, sp, spte, &gentry, + mmu_seq); if (!remote_flush && need_remote_flush(entry, *spte)) remote_flush = true; ++spte; @@ -3375,10 +3356,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); spin_unlock(&vcpu->kvm->mmu_lock); - if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { - kvm_release_pfn_clean(vcpu->arch.update_pte.pfn); - vcpu->arch.update_pte.pfn = bad_pfn; - } } int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 86eb8160bcb9..751405097d8c 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -325,7 +325,7 @@ no_present: } static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *spte, const void *pte) + u64 *spte, const void *pte, unsigned long mmu_seq) { pt_element_t gpte; unsigned pte_access; @@ -337,12 +337,14 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); - pfn = vcpu->arch.update_pte.pfn; - if (is_error_pfn(pfn)) + pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); + if (is_error_pfn(pfn)) { + kvm_release_pfn_clean(pfn); return; - if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq)) + } + if (mmu_notifier_retry(vcpu, mmu_seq)) return; - kvm_get_pfn(pfn); + /* * we call mmu_set_spte() with host_writable = true beacuse that * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). From 776e58ea3d3735f85678155398241d2513afa67a Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Sun, 13 Mar 2011 12:34:27 +0200 Subject: [PATCH 55/55] KVM: unbreak userspace that does not sets tss address Commit 6440e5967bc broke old userspaces that do not set tss address before entering vcpu. Unbreak it by setting tss address to a safe value on the first vcpu entry. New userspaces should set tss address, so print warning in case it doesn't. Signed-off-by: Gleb Natapov Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/vmx.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d8475a26add6..5b4cdcbd154c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -180,6 +180,7 @@ static u64 construct_eptp(unsigned long root_hpa); static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxoff(void); static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); +static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -1762,6 +1763,18 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmx->emulation_required = 1; vmx->rmode.vm86_active = 1; + /* + * Very old userspace does not call KVM_SET_TSS_ADDR before entering + * vcpu. Call it here with phys address pointing 16M below 4G. + */ + if (!vcpu->kvm->arch.tss_addr) { + printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be " + "called before entering vcpu\n"); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + vmx_set_tss_addr(vcpu->kvm, 0xfeffd000); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + } + vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR); vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));