From 70cd4c10b290dd77fff6dc702a9a2c8c679df121 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 27 Feb 2017 11:51:37 +1100 Subject: [PATCH 1/8] KVM: PPC: Book3S HV: Fix software walk of guest process page tables This fixes some bugs in the code that walks the guest's page tables. These bugs cause MMIO emulation to fail whenever the guest is in virtial mode (MMU on), leading to the guest hanging if it tried to access a virtio device. The first bug was that when reading the guest's process table, we were using the whole of arch->process_table, not just the field that contains the process table base address. The second bug was that the mask used when reading the process table entry to get the radix tree base address, RPDB_MASK, had the wrong value. Fixes: 9e04ba69beec ("KVM: PPC: Book3S HV: Add basic infrastructure for radix guests") Fixes: e99833448c5f ("powerpc/mm/radix: Add partition table format & callback") Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/book3s/64/mmu.h | 3 ++- arch/powerpc/kvm/book3s_64_mmu_radix.c | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index d73e9dfa5237..440f3423e213 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -46,7 +46,7 @@ extern struct patb_entry *partition_tb; /* Bits in patb0 field */ #define PATB_HR (1UL << 63) -#define RPDB_MASK 0x0ffffffffffff00fUL +#define RPDB_MASK 0x0fffffffffffff00UL #define RPDB_SHIFT (1UL << 8) #define RTS1_SHIFT 61 /* top 2 bits of radix tree size */ #define RTS1_MASK (3UL << RTS1_SHIFT) @@ -57,6 +57,7 @@ extern struct patb_entry *partition_tb; /* Bits in patb1 field */ #define PATB_GR (1UL << 63) /* guest uses radix; must match HR */ #define PRTS_MASK 0x1f /* process table size field */ +#define PRTB_MASK 0x0ffffffffffff000UL /* * Limit process table to PAGE_SIZE table. This diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 4344651f408c..f6b3e67c5762 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -32,6 +32,7 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, u32 pid; int ret, level, ps; __be64 prte, rpte; + unsigned long ptbl; unsigned long root, pte, index; unsigned long rts, bits, offset; unsigned long gpa; @@ -53,8 +54,8 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, return -EINVAL; /* Read partition table to find root of tree for effective PID */ - ret = kvm_read_guest(kvm, kvm->arch.process_table + pid * 16, - &prte, sizeof(prte)); + ptbl = (kvm->arch.process_table & PRTB_MASK) + (pid * 16); + ret = kvm_read_guest(kvm, ptbl, &prte, sizeof(prte)); if (ret) return ret; From 4e5acdc23a3dcbd6ad6dc93a9783dd9c838987c8 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 28 Feb 2017 11:05:47 +1100 Subject: [PATCH 2/8] KVM: PPC: Book3S HV: Don't use ASDR for real-mode HPT faults on POWER9 In HPT mode on POWER9, the ASDR register is supposed to record segment information for hypervisor page faults. It turns out that POWER9 DD1 does not record the page size information in the ASDR for faults in guest real mode. We have the necessary information in memory already, so by moving the checks for real mode that already existed, we can use the in-memory copy. Since a load is likely to be faster than reading an SPR, we do this unconditionally (not just for POWER9 DD1). Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 47414a6fe2dd..7c6477d1840a 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1787,12 +1787,12 @@ kvmppc_hdsi: /* HPTE not found fault or protection fault? */ andis. r0, r6, (DSISR_NOHPTE | DSISR_PROTFAULT)@h beq 1f /* if not, send it to the guest */ + andi. r0, r11, MSR_DR /* data relocation enabled? */ + beq 3f BEGIN_FTR_SECTION mfspr r5, SPRN_ASDR /* on POWER9, use ASDR to get VSID */ b 4f END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) - andi. r0, r11, MSR_DR /* data relocation enabled? */ - beq 3f clrrdi r0, r4, 28 PPC_SLBFEE_DOT(R5, R0) /* if so, look up SLB */ li r0, BOOK3S_INTERRUPT_DATA_SEGMENT @@ -1879,12 +1879,12 @@ kvmppc_hisi: bne .Lradix_hisi /* for radix, just save ASDR */ andis. r0, r11, SRR1_ISI_NOPT@h beq 1f + andi. r0, r11, MSR_IR /* instruction relocation enabled? */ + beq 3f BEGIN_FTR_SECTION mfspr r5, SPRN_ASDR /* on POWER9, use ASDR to get VSID */ b 4f END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) - andi. r0, r11, MSR_IR /* instruction relocation enabled? */ - beq 3f clrrdi r0, r10, 28 PPC_SLBFEE_DOT(R5, R0) /* if so, look up SLB */ li r0, BOOK3S_INTERRUPT_INST_SEGMENT From bba82fd75694832b59433bf4e9d7ede8de1dfd42 Mon Sep 17 00:00:00 2001 From: Robert O'Callahan Date: Wed, 1 Feb 2017 17:06:11 +1300 Subject: [PATCH 3/8] KVM: x86: never specify a sample period for virtualized in_tx_cp counters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pmc_reprogram_counter() always sets a sample period based on the value of pmc->counter. However, hsw_hw_config() rejects sample periods less than 2^31 - 1. So for example, if a KVM guest does struct perf_event_attr attr; memset(&attr, 0, sizeof(attr)); attr.type = PERF_TYPE_RAW; attr.size = sizeof(attr); attr.config = 0x2005101c4; // conditional branches retired IN_TXCP attr.sample_period = 0; int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0); ioctl(fd, PERF_EVENT_IOC_DISABLE, 0); ioctl(fd, PERF_EVENT_IOC_ENABLE, 0); the guest kernel counts some conditional branch events, then updates the virtual PMU register with a nonzero count. The host reaches pmc_reprogram_counter() with nonzero pmc->counter, triggers EOPNOTSUPP in hsw_hw_config(), prints "kvm_pmu: event creation failed" in pmc_reprogram_counter(), and silently (from the guest's point of view) stops counting events. We fix event counting by forcing attr.sample_period to always be zero for in_tx_cp counters. Sampling doesn't work, but it already didn't work and can't be fixed without major changes to the approach in hsw_hw_config(). Signed-off-by: Robert O'Callahan Signed-off-by: Radim Krčmář --- arch/x86/kvm/pmu.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 06ce377dcbc9..026db42a86c3 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -113,12 +113,19 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, .config = config, }; + attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc); + if (in_tx) attr.config |= HSW_IN_TX; - if (in_tx_cp) + if (in_tx_cp) { + /* + * HSW_IN_TX_CHECKPOINTED is not supported with nonzero + * period. Just clear the sample period so at least + * allocating the counter doesn't fail. + */ + attr.sample_period = 0; attr.config |= HSW_IN_TX_CHECKPOINTED; - - attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc); + } event = perf_event_create_kernel_counter(&attr, -1, current, intr ? kvm_perf_overflow_intr : From e3736c3eb3a6f7c0966923b629c9f92b558aa9c7 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Mon, 20 Feb 2017 13:06:21 +0200 Subject: [PATCH 4/8] kvm: convert kvm.users_count from atomic_t to refcount_t MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This allows to avoid accidental refcounter overflows that might lead to use-after-free situations. Signed-off-by: Elena Reshetova Signed-off-by: Hans Liljestrand Signed-off-by: Kees Cook Signed-off-by: David Windsor Signed-off-by: Radim Krčmář --- include/linux/kvm_host.h | 3 ++- virt/kvm/kvm_main.c | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8d69d5150748..2c14ad9809da 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -401,7 +402,7 @@ struct kvm { #endif struct kvm_vm_stat stat; struct kvm_arch arch; - atomic_t users_count; + refcount_t users_count; #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; spinlock_t ring_lock; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index cc4d6e0dd2a2..c6b7aff634be 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -617,7 +617,7 @@ static struct kvm *kvm_create_vm(unsigned long type) mutex_init(&kvm->lock); mutex_init(&kvm->irq_lock); mutex_init(&kvm->slots_lock); - atomic_set(&kvm->users_count, 1); + refcount_set(&kvm->users_count, 1); INIT_LIST_HEAD(&kvm->devices); r = kvm_arch_init_vm(kvm, type); @@ -747,13 +747,13 @@ static void kvm_destroy_vm(struct kvm *kvm) void kvm_get_kvm(struct kvm *kvm) { - atomic_inc(&kvm->users_count); + refcount_inc(&kvm->users_count); } EXPORT_SYMBOL_GPL(kvm_get_kvm); void kvm_put_kvm(struct kvm *kvm) { - if (atomic_dec_and_test(&kvm->users_count)) + if (refcount_dec_and_test(&kvm->users_count)) kvm_destroy_vm(kvm); } EXPORT_SYMBOL_GPL(kvm_put_kvm); @@ -3639,7 +3639,7 @@ static int kvm_debugfs_open(struct inode *inode, struct file *file, * To avoid the race between open and the removal of the debugfs * directory we test against the users count. */ - if (!atomic_add_unless(&stat_data->kvm->users_count, 1, 0)) + if (!refcount_inc_not_zero(&stat_data->kvm->users_count)) return -ENOENT; if (simple_attr_open(inode, file, get, set, fmt)) { From b7ceaec112aa35aa287325754d8c52b8304892cd Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 22 Feb 2017 07:36:16 -0800 Subject: [PATCH 5/8] x86/asm: Tidy up TSS limit code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In an earlier version of the patch ("x86/kvm/vmx: Defer TR reload after VM exit") that introduced TSS limit validity tracking, I confused which helper was which. On reflection, the names I chose sucked. Rename the helpers to make it more obvious what's going on and add some comments. While I'm at it, clear __tss_limit_invalid when force-reloading as well as when contitionally reloading, since any TR reload fixes the limit. Signed-off-by: Andy Lutomirski Signed-off-by: Radim Krčmář --- arch/x86/include/asm/desc.h | 18 +++++++++++------- arch/x86/kernel/ioport.c | 8 +++++++- arch/x86/kernel/process.c | 6 +++--- 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index cb8f9149f6c8..1548ca92ad3f 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -205,6 +205,8 @@ static inline void native_load_tr_desc(void) asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); } +DECLARE_PER_CPU(bool, __tss_limit_invalid); + static inline void force_reload_TR(void) { struct desc_struct *d = get_cpu_gdt_table(smp_processor_id()); @@ -220,18 +222,20 @@ static inline void force_reload_TR(void) write_gdt_entry(d, GDT_ENTRY_TSS, &tss, DESC_TSS); load_TR_desc(); + this_cpu_write(__tss_limit_invalid, false); } -DECLARE_PER_CPU(bool, need_tr_refresh); - -static inline void refresh_TR(void) +/* + * Call this if you need the TSS limit to be correct, which should be the case + * if and only if you have TIF_IO_BITMAP set or you're switching to a task + * with TIF_IO_BITMAP set. + */ +static inline void refresh_tss_limit(void) { DEBUG_LOCKS_WARN_ON(preemptible()); - if (unlikely(this_cpu_read(need_tr_refresh))) { + if (unlikely(this_cpu_read(__tss_limit_invalid))) force_reload_TR(); - this_cpu_write(need_tr_refresh, false); - } } /* @@ -250,7 +254,7 @@ static inline void invalidate_tss_limit(void) if (unlikely(test_thread_flag(TIF_IO_BITMAP))) force_reload_TR(); else - this_cpu_write(need_tr_refresh, true); + this_cpu_write(__tss_limit_invalid, true); } static inline void native_load_gdt(const struct desc_ptr *dtr) diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index b01bc8517450..875d3d25dd6a 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -47,8 +47,14 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) t->io_bitmap_ptr = bitmap; set_thread_flag(TIF_IO_BITMAP); + /* + * Now that we have an IO bitmap, we need our TSS limit to be + * correct. It's fine if we are preempted after doing this: + * with TIF_IO_BITMAP set, context switches will keep our TSS + * limit correct. + */ preempt_disable(); - refresh_TR(); + refresh_tss_limit(); preempt_enable(); } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 7780efa635b9..0b302591b51f 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -65,8 +65,8 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { }; EXPORT_PER_CPU_SYMBOL(cpu_tss); -DEFINE_PER_CPU(bool, need_tr_refresh); -EXPORT_PER_CPU_SYMBOL_GPL(need_tr_refresh); +DEFINE_PER_CPU(bool, __tss_limit_invalid); +EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid); /* * this gets called so that we can store lazy state into memory and copy the @@ -218,7 +218,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, * Make sure that the TSS limit is correct for the CPU * to notice the IO bitmap. */ - refresh_TR(); + refresh_tss_limit(); } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) { /* * Clear any possible leftover bits: From 0eb1d0fa6a68324b51db4f7ae16d9b042c5b2de4 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 22 Feb 2017 07:36:17 -0800 Subject: [PATCH 6/8] selftests/x86: Add a basic selftest for ioperm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This doesn't fully exercise the interaction between KVM and ioperm(), but it does test basic functionality. Signed-off-by: Andy Lutomirski Signed-off-by: Radim Krčmář --- tools/testing/selftests/x86/Makefile | 2 +- tools/testing/selftests/x86/ioperm.c | 170 +++++++++++++++++++++++++++ 2 files changed, 171 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/x86/ioperm.c diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index 83d8b1c6cb0e..bed3e6086cb1 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -5,7 +5,7 @@ include ../lib.mk .PHONY: all all_32 all_64 warn_32bit_failure clean TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall test_mremap_vdso \ - check_initial_reg_state sigreturn ldt_gdt iopl mpx-mini-test \ + check_initial_reg_state sigreturn ldt_gdt iopl mpx-mini-test ioperm \ protection_keys test_vdso TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ diff --git a/tools/testing/selftests/x86/ioperm.c b/tools/testing/selftests/x86/ioperm.c new file mode 100644 index 000000000000..b77313ba2ab1 --- /dev/null +++ b/tools/testing/selftests/x86/ioperm.c @@ -0,0 +1,170 @@ +/* + * ioperm.c - Test case for ioperm(2) + * Copyright (c) 2015 Andrew Lutomirski + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int nerrs = 0; + +static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), + int flags) +{ + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = handler; + sa.sa_flags = SA_SIGINFO | flags; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + err(1, "sigaction"); + +} + +static void clearhandler(int sig) +{ + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = SIG_DFL; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + err(1, "sigaction"); +} + +static jmp_buf jmpbuf; + +static void sigsegv(int sig, siginfo_t *si, void *ctx_void) +{ + siglongjmp(jmpbuf, 1); +} + +static bool try_outb(unsigned short port) +{ + sethandler(SIGSEGV, sigsegv, SA_RESETHAND); + if (sigsetjmp(jmpbuf, 1) != 0) { + return false; + } else { + asm volatile ("outb %%al, %w[port]" + : : [port] "Nd" (port), "a" (0)); + return true; + } + clearhandler(SIGSEGV); +} + +static void expect_ok(unsigned short port) +{ + if (!try_outb(port)) { + printf("[FAIL]\toutb to 0x%02hx failed\n", port); + exit(1); + } + + printf("[OK]\toutb to 0x%02hx worked\n", port); +} + +static void expect_gp(unsigned short port) +{ + if (try_outb(port)) { + printf("[FAIL]\toutb to 0x%02hx worked\n", port); + exit(1); + } + + printf("[OK]\toutb to 0x%02hx failed\n", port); +} + +int main(void) +{ + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(0, &cpuset); + if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) + err(1, "sched_setaffinity to CPU 0"); + + expect_gp(0x80); + expect_gp(0xed); + + /* + * Probe for ioperm support. Note that clearing ioperm bits + * works even as nonroot. + */ + printf("[RUN]\tenable 0x80\n"); + if (ioperm(0x80, 1, 1) != 0) { + printf("[OK]\tioperm(0x80, 1, 1) failed (%d) -- try running as root\n", + errno); + return 0; + } + expect_ok(0x80); + expect_gp(0xed); + + printf("[RUN]\tdisable 0x80\n"); + if (ioperm(0x80, 1, 0) != 0) { + printf("[FAIL]\tioperm(0x80, 1, 0) failed (%d)", errno); + return 1; + } + expect_gp(0x80); + expect_gp(0xed); + + /* Make sure that fork() preserves ioperm. */ + if (ioperm(0x80, 1, 1) != 0) { + printf("[FAIL]\tioperm(0x80, 1, 0) failed (%d)", errno); + return 1; + } + + pid_t child = fork(); + if (child == -1) + err(1, "fork"); + + if (child == 0) { + printf("[RUN]\tchild: check that we inherited permissions\n"); + expect_ok(0x80); + expect_gp(0xed); + return 0; + } else { + int status; + if (waitpid(child, &status, 0) != child || + !WIFEXITED(status)) { + printf("[FAIL]\tChild died\n"); + nerrs++; + } else if (WEXITSTATUS(status) != 0) { + printf("[FAIL]\tChild failed\n"); + nerrs++; + } else { + printf("[OK]\tChild succeeded\n"); + } + } + + /* Test the capability checks. */ + + printf("\tDrop privileges\n"); + if (setresuid(1, 1, 1) != 0) { + printf("[WARN]\tDropping privileges failed\n"); + return 0; + } + + printf("[RUN]\tdisable 0x80\n"); + if (ioperm(0x80, 1, 0) != 0) { + printf("[FAIL]\tioperm(0x80, 1, 0) failed (%d)", errno); + return 1; + } + printf("[OK]\tit worked\n"); + + printf("[RUN]\tenable 0x80 again\n"); + if (ioperm(0x80, 1, 1) == 0) { + printf("[FAIL]\tit succeeded but should have failed.\n"); + return 1; + } + printf("[OK]\tit failed\n"); + return 0; +} From 0fce546f9f07b94ccc9de09cf48d35e18946d2fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=A9my=20Lefaure?= Date: Sat, 25 Feb 2017 17:46:53 -0500 Subject: [PATCH 7/8] x86/kvm/vmx: remove unused variable in segment_base() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pointer 'struct desc_struct *d' is unused since commit 8c2e41f7ae12 ("x86/kvm/vmx: Simplify segment_base()") so let's remove it. Signed-off-by: Jérémy Lefaure Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ef4ba71dbb66..764f1f897847 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2053,7 +2053,6 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) static unsigned long segment_base(u16 selector) { struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); - struct desc_struct *d; struct desc_struct *table; unsigned long v; From acc9ab60132739e0c5ab40644444cd7b22d12886 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 27 Feb 2017 04:24:39 -0800 Subject: [PATCH 8/8] KVM: nVMX: Fix pending events injection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit L2 fails to boot on a non-APICv box dues to 'commit 0ad3bed6c5ec ("kvm: nVMX: move nested events check to kvm_vcpu_running")' KVM internal error. Suberror: 3 extra data[0]: 800000ef extra data[1]: 1 RAX=0000000000000000 RBX=ffffffff81f36140 RCX=0000000000000000 RDX=0000000000000000 RSI=0000000000000000 RDI=0000000000000000 RBP=ffff88007c92fe90 RSP=ffff88007c92fe90 R8 =ffff88007fccdca0 R9 =0000000000000000 R10=00000000fffedb3d R11=0000000000000000 R12=0000000000000003 R13=0000000000000000 R14=0000000000000000 R15=ffff88007c92c000 RIP=ffffffff810645e6 RFL=00000246 [---Z-P-] CPL=0 II=0 A20=1 SMM=0 HLT=0 ES =0000 0000000000000000 ffffffff 00c00000 CS =0010 0000000000000000 ffffffff 00a09b00 DPL=0 CS64 [-RA] SS =0000 0000000000000000 ffffffff 00c00000 DS =0000 0000000000000000 ffffffff 00c00000 FS =0000 0000000000000000 ffffffff 00c00000 GS =0000 ffff88007fcc0000 ffffffff 00c00000 LDT=0000 0000000000000000 ffffffff 00c00000 TR =0040 ffff88007fcd4200 00002087 00008b00 DPL=0 TSS64-busy GDT= ffff88007fcc9000 0000007f IDT= ffffffffff578000 00000fff CR0=80050033 CR2=00000000ffffffff CR3=0000000001e0a000 CR4=003406e0 DR0=0000000000000000 DR1=0000000000000000 DR2=0000000000000000 DR3=0000000000000000 DR6=00000000fffe0ff0 DR7=0000000000000400 EFER=0000000000000d01 We should try to reinject previous events if any before trying to inject new event if pending. If vmexit is triggered by L2 guest and L0 interested in, we should reinject IDT-vectoring info to L2 through vmcs02 if any, otherwise, we can consider new IRQs/NMIs which can be injected and call nested events callback to switch from L2 to L1 if needed and inject the proper vmexit events. However, 'commit 0ad3bed6c5ec ("kvm: nVMX: move nested events check to kvm_vcpu_running")' results in the handle events order reversely on non-APICv box. This patch fixes it by bailing out for pending events and not consider new events in this scenario. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Jan Kiszka Signed-off-by: Wanpeng Li Fixes: 0ad3bed6c5ec ("kvm: nVMX: move nested events check to kvm_vcpu_running") Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 764f1f897847..283aa8601833 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10641,6 +10641,11 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (vcpu->arch.exception.pending || + vcpu->arch.nmi_injected || + vcpu->arch.interrupt.pending) + return -EBUSY; + if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && vmx->nested.preemption_timer_expired) { if (vmx->nested.nested_run_pending) @@ -10650,8 +10655,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) } if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { - if (vmx->nested.nested_run_pending || - vcpu->arch.interrupt.pending) + if (vmx->nested.nested_run_pending) return -EBUSY; nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, NMI_VECTOR | INTR_TYPE_NMI_INTR |