diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 254bfafdcbcd..ef66b3c45ba2 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5804,8 +5804,9 @@ panic() code such as dumping handler. xen_nopvspin [X86,XEN] - Disables the ticketlock slowpath using Xen PV - optimizations. + Disables the qspinlock slowpath using Xen PV optimizations. + This parameter is obsoleted by "nopvspin" parameter, which + has equivalent effect for XEN platform. xen_nopv [X86] Disables the PV optimizations forcing the HVM guest to @@ -5831,6 +5832,11 @@ as generic guest with no PV drivers. Currently support XEN HVM, KVM, HYPER_V and VMWARE guest. + nopvspin [X86,XEN,KVM] + Disables the qspinlock slow path using PV optimizations + which allow the hypervisor to 'idle' the guest on lock + contention. + xirc2ps_cs= [NET,PCMCIA] Format: ,,,,,[,[,[,]]] diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index fe05201e17be..cdfd98155311 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -669,6 +669,10 @@ MSRs that have been set successfully. Defines the vcpu responses to the cpuid instruction. Applications should use the KVM_SET_CPUID2 ioctl if available. +Note, when this IOCTL fails, KVM gives no guarantees that previous valid CPUID +configuration (if there is) is not corrupted. Userspace can get a copy of the +resulting CPUID configuration through KVM_GET_CPUID2 in case. + :: struct kvm_cpuid_entry { @@ -4795,6 +4799,7 @@ hardware_exit_reason. /* KVM_EXIT_FAIL_ENTRY */ struct { __u64 hardware_entry_failure_reason; + __u32 cpu; /* if KVM_LAST_CPU */ } fail_entry; If exit_reason is KVM_EXIT_FAIL_ENTRY, the vcpu could not be run due diff --git a/arch/arm64/include/asm/kvm_coproc.h b/arch/arm64/include/asm/kvm_coproc.h index 0185ee8b8b5e..454373704b8a 100644 --- a/arch/arm64/include/asm/kvm_coproc.h +++ b/arch/arm64/include/asm/kvm_coproc.h @@ -27,12 +27,12 @@ struct kvm_sys_reg_target_table { void kvm_register_target_sys_reg_table(unsigned int target, struct kvm_sys_reg_target_table *table); -int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu, struct kvm_run *run); -int kvm_handle_cp14_32(struct kvm_vcpu *vcpu, struct kvm_run *run); -int kvm_handle_cp14_64(struct kvm_vcpu *vcpu, struct kvm_run *run); -int kvm_handle_cp15_32(struct kvm_vcpu *vcpu, struct kvm_run *run); -int kvm_handle_cp15_64(struct kvm_vcpu *vcpu, struct kvm_run *run); -int kvm_handle_sys_reg(struct kvm_vcpu *vcpu, struct kvm_run *run); +int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu); +int kvm_handle_cp14_32(struct kvm_vcpu *vcpu); +int kvm_handle_cp14_64(struct kvm_vcpu *vcpu); +int kvm_handle_cp15_32(struct kvm_vcpu *vcpu); +int kvm_handle_cp15_64(struct kvm_vcpu *vcpu); +int kvm_handle_sys_reg(struct kvm_vcpu *vcpu); #define kvm_coproc_table_init kvm_sys_reg_table_init void kvm_sys_reg_table_init(void); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index e21d4a01372f..f81151ad3d3c 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -97,17 +97,6 @@ struct kvm_arch { bool return_nisv_io_abort_to_user; }; -#define KVM_NR_MEM_OBJS 40 - -/* - * We don't want allocation failures within the mmu code, so we preallocate - * enough memory for a single page fault in a cache. - */ -struct kvm_mmu_memory_cache { - int nobjs; - void *objects[KVM_NR_MEM_OBJS]; -}; - struct kvm_vcpu_fault_info { u32 esr_el2; /* Hyp Syndrom Register */ u64 far_el2; /* Hyp Fault Address Register */ @@ -486,18 +475,15 @@ u64 __kvm_call_hyp(void *hypfn, ...); void force_vm_exit(const cpumask_t *mask); void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot); -int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, - int exception_index); -void handle_exit_early(struct kvm_vcpu *vcpu, struct kvm_run *run, - int exception_index); +int handle_exit(struct kvm_vcpu *vcpu, int exception_index); +void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index); /* MMIO helpers */ void kvm_mmio_write_buf(void *buf, unsigned int len, unsigned long data); unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len); -int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run); -int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, - phys_addr_t fault_ipa); +int kvm_handle_mmio_return(struct kvm_vcpu *vcpu); +int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa); int kvm_perf_init(void); int kvm_perf_teardown(void); diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index b12bfc1f051a..40be8f6c7351 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -139,7 +139,7 @@ void kvm_free_stage2_pgd(struct kvm *kvm); int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, phys_addr_t pa, unsigned long size, bool writable); -int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run); +int kvm_handle_guest_abort(struct kvm_vcpu *vcpu); void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/include/asm/kvm_types.h b/arch/arm64/include/asm/kvm_types.h new file mode 100644 index 000000000000..9a126b9e2d7c --- /dev/null +++ b/arch/arm64/include/asm/kvm_types.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_ARM64_KVM_TYPES_H +#define _ASM_ARM64_KVM_TYPES_H + +#define KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE 40 + +#endif /* _ASM_ARM64_KVM_TYPES_H */ + diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 90cb90561446..73e12869afe3 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -270,6 +270,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.target = -1; bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES); + vcpu->arch.mmu_page_cache.gfp_zero = __GFP_ZERO; + /* Set up the timer */ kvm_timer_vcpu_init(vcpu); @@ -658,7 +660,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) return ret; if (run->exit_reason == KVM_EXIT_MMIO) { - ret = kvm_handle_mmio_return(vcpu, run); + ret = kvm_handle_mmio_return(vcpu); if (ret) return ret; } @@ -810,11 +812,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); /* Exit types that need handling before we can be preempted */ - handle_exit_early(vcpu, run, ret); + handle_exit_early(vcpu, ret); preempt_enable(); - ret = handle_exit(vcpu, run, ret); + ret = handle_exit(vcpu, ret); } /* Tell userspace about in-kernel device output levels */ diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 5a02d4c90559..1df3beafd73f 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -25,7 +25,7 @@ #define CREATE_TRACE_POINTS #include "trace_handle_exit.h" -typedef int (*exit_handle_fn)(struct kvm_vcpu *, struct kvm_run *); +typedef int (*exit_handle_fn)(struct kvm_vcpu *); static void kvm_handle_guest_serror(struct kvm_vcpu *vcpu, u32 esr) { @@ -33,7 +33,7 @@ static void kvm_handle_guest_serror(struct kvm_vcpu *vcpu, u32 esr) kvm_inject_vabt(vcpu); } -static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run) +static int handle_hvc(struct kvm_vcpu *vcpu) { int ret; @@ -50,7 +50,7 @@ static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run) return ret; } -static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run) +static int handle_smc(struct kvm_vcpu *vcpu) { /* * "If an SMC instruction executed at Non-secure EL1 is @@ -69,7 +69,7 @@ static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run) * Guest access to FP/ASIMD registers are routed to this handler only * when the system doesn't support FP/ASIMD. */ -static int handle_no_fpsimd(struct kvm_vcpu *vcpu, struct kvm_run *run) +static int handle_no_fpsimd(struct kvm_vcpu *vcpu) { kvm_inject_undefined(vcpu); return 1; @@ -87,7 +87,7 @@ static int handle_no_fpsimd(struct kvm_vcpu *vcpu, struct kvm_run *run) * world-switches and schedule other host processes until there is an * incoming IRQ or FIQ to the VM. */ -static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run) +static int kvm_handle_wfx(struct kvm_vcpu *vcpu) { if (kvm_vcpu_get_hsr(vcpu) & ESR_ELx_WFx_ISS_WFE) { trace_kvm_wfx_arm64(*vcpu_pc(vcpu), true); @@ -109,16 +109,16 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run) * kvm_handle_guest_debug - handle a debug exception instruction * * @vcpu: the vcpu pointer - * @run: access to the kvm_run structure for results * * We route all debug exceptions through the same handler. If both the * guest and host are using the same debug facilities it will be up to * userspace to re-inject the correct exception for guest delivery. * - * @return: 0 (while setting run->exit_reason), -1 for error + * @return: 0 (while setting vcpu->run->exit_reason), -1 for error */ -static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu, struct kvm_run *run) +static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu) { + struct kvm_run *run = vcpu->run; u32 hsr = kvm_vcpu_get_hsr(vcpu); int ret = 0; @@ -144,7 +144,7 @@ static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu, struct kvm_run *run) return ret; } -static int kvm_handle_unknown_ec(struct kvm_vcpu *vcpu, struct kvm_run *run) +static int kvm_handle_unknown_ec(struct kvm_vcpu *vcpu) { u32 hsr = kvm_vcpu_get_hsr(vcpu); @@ -155,7 +155,7 @@ static int kvm_handle_unknown_ec(struct kvm_vcpu *vcpu, struct kvm_run *run) return 1; } -static int handle_sve(struct kvm_vcpu *vcpu, struct kvm_run *run) +static int handle_sve(struct kvm_vcpu *vcpu) { /* Until SVE is supported for guests: */ kvm_inject_undefined(vcpu); @@ -167,7 +167,7 @@ static int handle_sve(struct kvm_vcpu *vcpu, struct kvm_run *run) * a NOP). If we get here, it is that we didn't fixup ptrauth on exit, and all * that we can do is give the guest an UNDEF. */ -static int kvm_handle_ptrauth(struct kvm_vcpu *vcpu, struct kvm_run *run) +static int kvm_handle_ptrauth(struct kvm_vcpu *vcpu) { kvm_inject_undefined(vcpu); return 1; @@ -212,7 +212,7 @@ static exit_handle_fn kvm_get_exit_handler(struct kvm_vcpu *vcpu) * KVM_EXIT_DEBUG, otherwise userspace needs to complete its * emulation first. */ -static int handle_trap_exceptions(struct kvm_vcpu *vcpu, struct kvm_run *run) +static int handle_trap_exceptions(struct kvm_vcpu *vcpu) { int handled; @@ -227,7 +227,7 @@ static int handle_trap_exceptions(struct kvm_vcpu *vcpu, struct kvm_run *run) exit_handle_fn exit_handler; exit_handler = kvm_get_exit_handler(vcpu); - handled = exit_handler(vcpu, run); + handled = exit_handler(vcpu); } return handled; @@ -237,9 +237,10 @@ static int handle_trap_exceptions(struct kvm_vcpu *vcpu, struct kvm_run *run) * Return > 0 to return to guest, < 0 on error, 0 (and set exit_reason) on * proper exit to userspace. */ -int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, - int exception_index) +int handle_exit(struct kvm_vcpu *vcpu, int exception_index) { + struct kvm_run *run = vcpu->run; + if (ARM_SERROR_PENDING(exception_index)) { u8 hsr_ec = ESR_ELx_EC(kvm_vcpu_get_hsr(vcpu)); @@ -265,7 +266,7 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, case ARM_EXCEPTION_EL1_SERROR: return 1; case ARM_EXCEPTION_TRAP: - return handle_trap_exceptions(vcpu, run); + return handle_trap_exceptions(vcpu); case ARM_EXCEPTION_HYP_GONE: /* * EL2 has been reset to the hyp-stub. This happens when a guest @@ -289,8 +290,7 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, } /* For exit types that need handling before we can be preempted */ -void handle_exit_early(struct kvm_vcpu *vcpu, struct kvm_run *run, - int exception_index) +void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index) { if (ARM_SERROR_PENDING(exception_index)) { if (this_cpu_has_cap(ARM64_HAS_RAS_EXTN)) { diff --git a/arch/arm64/kvm/mmio.c b/arch/arm64/kvm/mmio.c index 4e0366759726..158fbe682611 100644 --- a/arch/arm64/kvm/mmio.c +++ b/arch/arm64/kvm/mmio.c @@ -77,9 +77,8 @@ unsigned long kvm_mmio_read_buf(const void *buf, unsigned int len) * or in-kernel IO emulation * * @vcpu: The VCPU pointer - * @run: The VCPU run struct containing the mmio data */ -int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) +int kvm_handle_mmio_return(struct kvm_vcpu *vcpu) { unsigned long data; unsigned int len; @@ -92,6 +91,8 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) vcpu->mmio_needed = 0; if (!kvm_vcpu_dabt_iswrite(vcpu)) { + struct kvm_run *run = vcpu->run; + len = kvm_vcpu_dabt_get_as(vcpu); data = kvm_mmio_read_buf(run->mmio.data, len); @@ -119,9 +120,9 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) return 0; } -int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, - phys_addr_t fault_ipa) +int io_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa) { + struct kvm_run *run = vcpu->run; unsigned long data; unsigned long rt; int ret; @@ -188,7 +189,7 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, if (!is_write) memcpy(run->mmio.data, data_buf, len); vcpu->stat.mmio_exit_kernel++; - kvm_handle_mmio_return(vcpu, run); + kvm_handle_mmio_return(vcpu); return 1; } diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 31058e6e7c2a..7a7ddc4558a7 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -124,38 +124,6 @@ static void stage2_dissolve_pud(struct kvm *kvm, phys_addr_t addr, pud_t *pudp) put_page(virt_to_page(pudp)); } -static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, - int min, int max) -{ - void *page; - - BUG_ON(max > KVM_NR_MEM_OBJS); - if (cache->nobjs >= min) - return 0; - while (cache->nobjs < max) { - page = (void *)__get_free_page(GFP_PGTABLE_USER); - if (!page) - return -ENOMEM; - cache->objects[cache->nobjs++] = page; - } - return 0; -} - -static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) -{ - while (mc->nobjs) - free_page((unsigned long)mc->objects[--mc->nobjs]); -} - -static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) -{ - void *p; - - BUG_ON(!mc || !mc->nobjs); - p = mc->objects[--mc->nobjs]; - return p; -} - static void clear_stage2_pgd_entry(struct kvm *kvm, pgd_t *pgd, phys_addr_t addr) { p4d_t *p4d_table __maybe_unused = stage2_p4d_offset(kvm, pgd, 0UL); @@ -1132,7 +1100,7 @@ static p4d_t *stage2_get_p4d(struct kvm *kvm, struct kvm_mmu_memory_cache *cache if (stage2_pgd_none(kvm, *pgd)) { if (!cache) return NULL; - p4d = mmu_memory_cache_alloc(cache); + p4d = kvm_mmu_memory_cache_alloc(cache); stage2_pgd_populate(kvm, pgd, p4d); get_page(virt_to_page(pgd)); } @@ -1150,7 +1118,7 @@ static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache if (stage2_p4d_none(kvm, *p4d)) { if (!cache) return NULL; - pud = mmu_memory_cache_alloc(cache); + pud = kvm_mmu_memory_cache_alloc(cache); stage2_p4d_populate(kvm, p4d, pud); get_page(virt_to_page(p4d)); } @@ -1171,7 +1139,7 @@ static pmd_t *stage2_get_pmd(struct kvm *kvm, struct kvm_mmu_memory_cache *cache if (stage2_pud_none(kvm, *pud)) { if (!cache) return NULL; - pmd = mmu_memory_cache_alloc(cache); + pmd = kvm_mmu_memory_cache_alloc(cache); stage2_pud_populate(kvm, pud, pmd); get_page(virt_to_page(pud)); } @@ -1377,7 +1345,7 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, if (stage2_pud_none(kvm, *pud)) { if (!cache) return 0; /* ignore calls from kvm_set_spte_hva */ - pmd = mmu_memory_cache_alloc(cache); + pmd = kvm_mmu_memory_cache_alloc(cache); stage2_pud_populate(kvm, pud, pmd); get_page(virt_to_page(pud)); } @@ -1402,7 +1370,7 @@ static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, if (pmd_none(*pmd)) { if (!cache) return 0; /* ignore calls from kvm_set_spte_hva */ - pte = mmu_memory_cache_alloc(cache); + pte = kvm_mmu_memory_cache_alloc(cache); kvm_pmd_populate(pmd, pte); get_page(virt_to_page(pmd)); } @@ -1469,7 +1437,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, phys_addr_t addr, end; int ret = 0; unsigned long pfn; - struct kvm_mmu_memory_cache cache = { 0, }; + struct kvm_mmu_memory_cache cache = { 0, __GFP_ZERO, NULL, }; end = (guest_ipa + size + PAGE_SIZE - 1) & PAGE_MASK; pfn = __phys_to_pfn(pa); @@ -1480,9 +1448,8 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, if (writable) pte = kvm_s2pte_mkwrite(pte); - ret = mmu_topup_memory_cache(&cache, - kvm_mmu_cache_min_pages(kvm), - KVM_NR_MEM_OBJS); + ret = kvm_mmu_topup_memory_cache(&cache, + kvm_mmu_cache_min_pages(kvm)); if (ret) goto out; spin_lock(&kvm->mmu_lock); @@ -1496,7 +1463,7 @@ int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa, } out: - mmu_free_memory_cache(&cache); + kvm_mmu_free_memory_cache(&cache); return ret; } @@ -1882,8 +1849,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, mmap_read_unlock(current->mm); /* We need minimum second+third level pages */ - ret = mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm), - KVM_NR_MEM_OBJS); + ret = kvm_mmu_topup_memory_cache(memcache, kvm_mmu_cache_min_pages(kvm)); if (ret) return ret; @@ -2050,7 +2016,6 @@ out: /** * kvm_handle_guest_abort - handles all 2nd stage aborts * @vcpu: the VCPU pointer - * @run: the kvm_run structure * * Any abort that gets to the host is almost guaranteed to be caused by a * missing second stage translation table entry, which can mean that either the @@ -2059,7 +2024,7 @@ out: * space. The distinction is based on the IPA causing the fault and whether this * memory region has been registered as standard RAM by user space. */ -int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) +int kvm_handle_guest_abort(struct kvm_vcpu *vcpu) { unsigned long fault_status; phys_addr_t fault_ipa; @@ -2138,7 +2103,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run) * of the page size. */ fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1); - ret = io_mem_abort(vcpu, run, fault_ipa); + ret = io_mem_abort(vcpu, fault_ipa); goto out_unlock; } @@ -2307,7 +2272,7 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu) { - mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); } phys_addr_t kvm_mmu_get_httbr(void) diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index d3196671c590..138961d7ebe3 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -2156,7 +2156,7 @@ static const struct sys_reg_desc *find_reg(const struct sys_reg_params *params, return bsearch((void *)pval, table, num, sizeof(table[0]), match_sys_reg); } -int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu, struct kvm_run *run) +int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu) { kvm_inject_undefined(vcpu); return 1; @@ -2335,7 +2335,7 @@ static int kvm_handle_cp_32(struct kvm_vcpu *vcpu, return 1; } -int kvm_handle_cp15_64(struct kvm_vcpu *vcpu, struct kvm_run *run) +int kvm_handle_cp15_64(struct kvm_vcpu *vcpu) { const struct sys_reg_desc *target_specific; size_t num; @@ -2346,7 +2346,7 @@ int kvm_handle_cp15_64(struct kvm_vcpu *vcpu, struct kvm_run *run) target_specific, num); } -int kvm_handle_cp15_32(struct kvm_vcpu *vcpu, struct kvm_run *run) +int kvm_handle_cp15_32(struct kvm_vcpu *vcpu) { const struct sys_reg_desc *target_specific; size_t num; @@ -2357,14 +2357,14 @@ int kvm_handle_cp15_32(struct kvm_vcpu *vcpu, struct kvm_run *run) target_specific, num); } -int kvm_handle_cp14_64(struct kvm_vcpu *vcpu, struct kvm_run *run) +int kvm_handle_cp14_64(struct kvm_vcpu *vcpu) { return kvm_handle_cp_64(vcpu, cp14_64_regs, ARRAY_SIZE(cp14_64_regs), NULL, 0); } -int kvm_handle_cp14_32(struct kvm_vcpu *vcpu, struct kvm_run *run) +int kvm_handle_cp14_32(struct kvm_vcpu *vcpu) { return kvm_handle_cp_32(vcpu, cp14_regs, ARRAY_SIZE(cp14_regs), @@ -2416,9 +2416,8 @@ static void reset_sys_reg_descs(struct kvm_vcpu *vcpu, /** * kvm_handle_sys_reg -- handles a mrs/msr trap on a guest sys_reg access * @vcpu: The VCPU pointer - * @run: The kvm_run struct */ -int kvm_handle_sys_reg(struct kvm_vcpu *vcpu, struct kvm_run *run) +int kvm_handle_sys_reg(struct kvm_vcpu *vcpu) { struct sys_reg_params params; unsigned long esr = kvm_vcpu_get_hsr(vcpu); diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index dd36cba078bc..c95fa3a2484c 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2182,6 +2182,7 @@ endchoice config KVM_GUEST bool "KVM Guest Kernel" + depends on CPU_MIPS32_R2 depends on BROKEN_ON_SMP help Select this option if building a guest kernel for KVM (Trap & Emulate) diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 363e7a89d173..d35eaed1668f 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -335,17 +335,6 @@ struct kvm_mips_tlb { long tlb_lo[2]; }; -#define KVM_NR_MEM_OBJS 4 - -/* - * We don't want allocation failures within the mmu code, so we preallocate - * enough memory for a single page fault in a cache. - */ -struct kvm_mmu_memory_cache { - int nobjs; - void *objects[KVM_NR_MEM_OBJS]; -}; - #define KVM_MIPS_AUX_FPU 0x1 #define KVM_MIPS_AUX_MSA 0x2 @@ -854,8 +843,8 @@ struct kvm_mips_callbacks { const struct kvm_one_reg *reg, s64 v); int (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); int (*vcpu_put)(struct kvm_vcpu *vcpu, int cpu); - int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu); - void (*vcpu_reenter)(struct kvm_run *run, struct kvm_vcpu *vcpu); + int (*vcpu_run)(struct kvm_vcpu *vcpu); + void (*vcpu_reenter)(struct kvm_vcpu *vcpu); }; extern struct kvm_mips_callbacks *kvm_mips_callbacks; int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks); @@ -910,7 +899,6 @@ extern int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu, extern enum emulation_result kvm_mips_handle_tlbmiss(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu, bool write_fault); @@ -1021,83 +1009,67 @@ static inline bool kvm_is_ifetch_fault(struct kvm_vcpu_arch *vcpu) extern enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu); long kvm_mips_guest_exception_base(struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_syscall(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_tlbmiss_ld(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_tlbinv_ld(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_tlbmiss_st(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_tlbinv_st(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_tlbmod(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_fpu_exc(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_ri_exc(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_bp_exc(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_trap_exc(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_msafpe_exc(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_fpe_exc(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu); extern enum emulation_result kvm_mips_emulate_msadis_exc(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu); -extern enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu, - struct kvm_run *run); +extern enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu); u32 kvm_mips_read_count(struct kvm_vcpu *vcpu); void kvm_mips_write_count(struct kvm_vcpu *vcpu, u32 count); @@ -1126,26 +1098,21 @@ static inline void kvm_vz_lose_htimer(struct kvm_vcpu *vcpu) {} enum emulation_result kvm_mips_check_privilege(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu); enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst, u32 *opc, u32 cause, - struct kvm_run *run, struct kvm_vcpu *vcpu); enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, u32 *opc, u32 cause, - struct kvm_run *run, struct kvm_vcpu *vcpu); enum emulation_result kvm_mips_emulate_store(union mips_instruction inst, u32 cause, - struct kvm_run *run, struct kvm_vcpu *vcpu); enum emulation_result kvm_mips_emulate_load(union mips_instruction inst, u32 cause, - struct kvm_run *run, struct kvm_vcpu *vcpu); /* COP0 */ diff --git a/arch/mips/include/asm/kvm_types.h b/arch/mips/include/asm/kvm_types.h new file mode 100644 index 000000000000..213754d9ef6b --- /dev/null +++ b/arch/mips/include/asm/kvm_types.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_MIPS_KVM_TYPES_H +#define _ASM_MIPS_KVM_TYPES_H + +#define KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE 4 + +#endif /* _ASM_MIPS_KVM_TYPES_H */ diff --git a/arch/mips/kvm/00README.txt b/arch/mips/kvm/00README.txt deleted file mode 100644 index 51617e481aa3..000000000000 --- a/arch/mips/kvm/00README.txt +++ /dev/null @@ -1,31 +0,0 @@ -KVM/MIPS Trap & Emulate Release Notes -===================================== - -(1) KVM/MIPS should support MIPS32R2 and beyond. It has been tested on the following platforms: - Malta Board with FPGA based 34K - Sigma Designs TangoX board with a 24K based 8654 SoC. - Malta Board with 74K @ 1GHz - -(2) Both Guest kernel and Guest Userspace execute in UM. - Guest User address space: 0x00000000 -> 0x40000000 - Guest Kernel Unmapped: 0x40000000 -> 0x60000000 - Guest Kernel Mapped: 0x60000000 -> 0x80000000 - - Guest Usermode virtual memory is limited to 1GB. - -(2) 16K Page Sizes: Both Host Kernel and Guest Kernel should have the same page size, currently at least 16K. - Note that due to cache aliasing issues, 4K page sizes are NOT supported. - -(3) No HugeTLB Support - Both the host kernel and Guest kernel should have the page size set to 16K. - This will be implemented in a future release. - -(4) KVM/MIPS does not have support for SMP Guests - Linux-3.7-rc2 based SMP guest hangs due to the following code sequence in the generated TLB handlers: - LL/TLBP/SC. Since the TLBP instruction causes a trap the reservation gets cleared - when we ERET back to the guest. This causes the guest to hang in an infinite loop. - This will be fixed in a future release. - -(5) Use Host FPU - Currently KVM/MIPS emulates a 24K CPU without a FPU. - This will be fixed in a future release diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index 2bf02d849a3a..032b3fca6cbb 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -37,10 +37,11 @@ choice config KVM_MIPS_TE bool "Trap & Emulate" + depends on CPU_MIPS32_R2 help Use trap and emulate to virtualize 32-bit guests in user mode. This does not require any special hardware Virtualization support beyond - standard MIPS32/64 r2 or later, but it does require the guest kernel + standard MIPS32 r2 or later, but it does require the guest kernel to be configured with CONFIG_KVM_GUEST=y so that it resides in the user address segment. diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index d242300cacc0..703782355318 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -1262,7 +1262,6 @@ unsigned int kvm_mips_config5_wrmask(struct kvm_vcpu *vcpu) enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, u32 *opc, u32 cause, - struct kvm_run *run, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -1597,12 +1596,12 @@ dont_update_pc: enum emulation_result kvm_mips_emulate_store(union mips_instruction inst, u32 cause, - struct kvm_run *run, struct kvm_vcpu *vcpu) { int r; enum emulation_result er; u32 rt; + struct kvm_run *run = vcpu->run; void *data = run->mmio.data; unsigned int imme; unsigned long curr_pc; @@ -1863,7 +1862,7 @@ enum emulation_result kvm_mips_emulate_store(union mips_instruction inst, vcpu->arch.gprs[rt], *(u64 *)data); break; default: - kvm_err("Godson Exteneded GS-Store not yet supported (inst=0x%08x)\n", + kvm_err("Godson Extended GS-Store not yet supported (inst=0x%08x)\n", inst.word); break; } @@ -1896,9 +1895,9 @@ out_fail: } enum emulation_result kvm_mips_emulate_load(union mips_instruction inst, - u32 cause, struct kvm_run *run, - struct kvm_vcpu *vcpu) + u32 cause, struct kvm_vcpu *vcpu) { + struct kvm_run *run = vcpu->run; int r; enum emulation_result er; unsigned long curr_pc; @@ -2107,7 +2106,7 @@ enum emulation_result kvm_mips_emulate_load(union mips_instruction inst, vcpu->mmio_needed = 30; /* signed */ break; default: - kvm_err("Godson Exteneded GS-Load for float not yet supported (inst=0x%08x)\n", + kvm_err("Godson Extended GS-Load for float not yet supported (inst=0x%08x)\n", inst.word); break; } @@ -2128,7 +2127,7 @@ enum emulation_result kvm_mips_emulate_load(union mips_instruction inst, run->mmio.phys_addr, run->mmio.len, run->mmio.data); if (!r) { - kvm_mips_complete_mmio_load(vcpu, run); + kvm_mips_complete_mmio_load(vcpu); vcpu->mmio_needed = 0; return EMULATE_DONE; } @@ -2140,7 +2139,6 @@ enum emulation_result kvm_mips_emulate_load(union mips_instruction inst, static enum emulation_result kvm_mips_guest_cache_op(int (*fn)(unsigned long), unsigned long curr_pc, unsigned long addr, - struct kvm_run *run, struct kvm_vcpu *vcpu, u32 cause) { @@ -2168,13 +2166,13 @@ static enum emulation_result kvm_mips_guest_cache_op(int (*fn)(unsigned long), /* no matching guest TLB */ vcpu->arch.host_cp0_badvaddr = addr; vcpu->arch.pc = curr_pc; - kvm_mips_emulate_tlbmiss_ld(cause, NULL, run, vcpu); + kvm_mips_emulate_tlbmiss_ld(cause, NULL, vcpu); return EMULATE_EXCEPT; case KVM_MIPS_TLBINV: /* invalid matching guest TLB */ vcpu->arch.host_cp0_badvaddr = addr; vcpu->arch.pc = curr_pc; - kvm_mips_emulate_tlbinv_ld(cause, NULL, run, vcpu); + kvm_mips_emulate_tlbinv_ld(cause, NULL, vcpu); return EMULATE_EXCEPT; default: break; @@ -2184,7 +2182,6 @@ static enum emulation_result kvm_mips_guest_cache_op(int (*fn)(unsigned long), enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst, u32 *opc, u32 cause, - struct kvm_run *run, struct kvm_vcpu *vcpu) { enum emulation_result er = EMULATE_DONE; @@ -2274,7 +2271,7 @@ enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst, * guest's behalf. */ er = kvm_mips_guest_cache_op(protected_writeback_dcache_line, - curr_pc, va, run, vcpu, cause); + curr_pc, va, vcpu, cause); if (er != EMULATE_DONE) goto done; #ifdef CONFIG_KVM_MIPS_DYN_TRANS @@ -2287,11 +2284,11 @@ enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst, } else if (op_inst == Hit_Invalidate_I) { /* Perform the icache synchronisation on the guest's behalf */ er = kvm_mips_guest_cache_op(protected_writeback_dcache_line, - curr_pc, va, run, vcpu, cause); + curr_pc, va, vcpu, cause); if (er != EMULATE_DONE) goto done; er = kvm_mips_guest_cache_op(protected_flush_icache_line, - curr_pc, va, run, vcpu, cause); + curr_pc, va, vcpu, cause); if (er != EMULATE_DONE) goto done; @@ -2317,7 +2314,6 @@ done: } enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu) { union mips_instruction inst; @@ -2333,14 +2329,14 @@ enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc, switch (inst.r_format.opcode) { case cop0_op: - er = kvm_mips_emulate_CP0(inst, opc, cause, run, vcpu); + er = kvm_mips_emulate_CP0(inst, opc, cause, vcpu); break; #ifndef CONFIG_CPU_MIPSR6 case cache_op: ++vcpu->stat.cache_exits; trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE); - er = kvm_mips_emulate_cache(inst, opc, cause, run, vcpu); + er = kvm_mips_emulate_cache(inst, opc, cause, vcpu); break; #else case spec3_op: @@ -2348,7 +2344,7 @@ enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc, case cache6_op: ++vcpu->stat.cache_exits; trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE); - er = kvm_mips_emulate_cache(inst, opc, cause, run, + er = kvm_mips_emulate_cache(inst, opc, cause, vcpu); break; default: @@ -2388,7 +2384,6 @@ long kvm_mips_guest_exception_base(struct kvm_vcpu *vcpu) enum emulation_result kvm_mips_emulate_syscall(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -2423,7 +2418,6 @@ enum emulation_result kvm_mips_emulate_syscall(u32 cause, enum emulation_result kvm_mips_emulate_tlbmiss_ld(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -2467,7 +2461,6 @@ enum emulation_result kvm_mips_emulate_tlbmiss_ld(u32 cause, enum emulation_result kvm_mips_emulate_tlbinv_ld(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -2509,7 +2502,6 @@ enum emulation_result kvm_mips_emulate_tlbinv_ld(u32 cause, enum emulation_result kvm_mips_emulate_tlbmiss_st(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -2551,7 +2543,6 @@ enum emulation_result kvm_mips_emulate_tlbmiss_st(u32 cause, enum emulation_result kvm_mips_emulate_tlbinv_st(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -2592,7 +2583,6 @@ enum emulation_result kvm_mips_emulate_tlbinv_st(u32 cause, enum emulation_result kvm_mips_emulate_tlbmod(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -2632,7 +2622,6 @@ enum emulation_result kvm_mips_emulate_tlbmod(u32 cause, enum emulation_result kvm_mips_emulate_fpu_exc(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -2661,7 +2650,6 @@ enum emulation_result kvm_mips_emulate_fpu_exc(u32 cause, enum emulation_result kvm_mips_emulate_ri_exc(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -2696,7 +2684,6 @@ enum emulation_result kvm_mips_emulate_ri_exc(u32 cause, enum emulation_result kvm_mips_emulate_bp_exc(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -2731,7 +2718,6 @@ enum emulation_result kvm_mips_emulate_bp_exc(u32 cause, enum emulation_result kvm_mips_emulate_trap_exc(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -2766,7 +2752,6 @@ enum emulation_result kvm_mips_emulate_trap_exc(u32 cause, enum emulation_result kvm_mips_emulate_msafpe_exc(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -2801,7 +2786,6 @@ enum emulation_result kvm_mips_emulate_msafpe_exc(u32 cause, enum emulation_result kvm_mips_emulate_fpe_exc(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -2836,7 +2820,6 @@ enum emulation_result kvm_mips_emulate_fpe_exc(u32 cause, enum emulation_result kvm_mips_emulate_msadis_exc(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -2870,7 +2853,6 @@ enum emulation_result kvm_mips_emulate_msadis_exc(u32 cause, } enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -2959,12 +2941,12 @@ emulate_ri: * branch target), and pass the RI exception to the guest OS. */ vcpu->arch.pc = curr_pc; - return kvm_mips_emulate_ri_exc(cause, opc, run, vcpu); + return kvm_mips_emulate_ri_exc(cause, opc, vcpu); } -enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu, - struct kvm_run *run) +enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu) { + struct kvm_run *run = vcpu->run; unsigned long *gpr = &vcpu->arch.gprs[vcpu->arch.io_gpr]; enum emulation_result er = EMULATE_DONE; @@ -3107,7 +3089,6 @@ done: static enum emulation_result kvm_mips_emulate_exc(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu) { u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f; @@ -3145,7 +3126,6 @@ static enum emulation_result kvm_mips_emulate_exc(u32 cause, enum emulation_result kvm_mips_check_privilege(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu) { enum emulation_result er = EMULATE_DONE; @@ -3227,7 +3207,7 @@ enum emulation_result kvm_mips_check_privilege(u32 cause, } if (er == EMULATE_PRIV_FAIL) - kvm_mips_emulate_exc(cause, opc, run, vcpu); + kvm_mips_emulate_exc(cause, opc, vcpu); return er; } @@ -3241,7 +3221,6 @@ enum emulation_result kvm_mips_check_privilege(u32 cause, */ enum emulation_result kvm_mips_handle_tlbmiss(u32 cause, u32 *opc, - struct kvm_run *run, struct kvm_vcpu *vcpu, bool write_fault) { @@ -3265,9 +3244,9 @@ enum emulation_result kvm_mips_handle_tlbmiss(u32 cause, KVM_ENTRYHI_ASID)); if (index < 0) { if (exccode == EXCCODE_TLBL) { - er = kvm_mips_emulate_tlbmiss_ld(cause, opc, run, vcpu); + er = kvm_mips_emulate_tlbmiss_ld(cause, opc, vcpu); } else if (exccode == EXCCODE_TLBS) { - er = kvm_mips_emulate_tlbmiss_st(cause, opc, run, vcpu); + er = kvm_mips_emulate_tlbmiss_st(cause, opc, vcpu); } else { kvm_err("%s: invalid exc code: %d\n", __func__, exccode); @@ -3282,10 +3261,10 @@ enum emulation_result kvm_mips_handle_tlbmiss(u32 cause, */ if (!TLB_IS_VALID(*tlb, va)) { if (exccode == EXCCODE_TLBL) { - er = kvm_mips_emulate_tlbinv_ld(cause, opc, run, + er = kvm_mips_emulate_tlbinv_ld(cause, opc, vcpu); } else if (exccode == EXCCODE_TLBS) { - er = kvm_mips_emulate_tlbinv_st(cause, opc, run, + er = kvm_mips_emulate_tlbinv_st(cause, opc, vcpu); } else { kvm_err("%s: invalid exc code: %d\n", __func__, diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 666d3350b4ac..7de85d2253ff 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -450,7 +450,6 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) { - struct kvm_run *run = vcpu->run; int r = -EINTR; vcpu_load(vcpu); @@ -459,11 +458,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) if (vcpu->mmio_needed) { if (!vcpu->mmio_is_write) - kvm_mips_complete_mmio_load(vcpu, run); + kvm_mips_complete_mmio_load(vcpu); vcpu->mmio_needed = 0; } - if (run->immediate_exit) + if (vcpu->run->immediate_exit) goto out; lose_fpu(1); @@ -480,7 +479,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) */ smp_store_mb(vcpu->mode, IN_GUEST_MODE); - r = kvm_mips_callbacks->vcpu_run(run, vcpu); + r = kvm_mips_callbacks->vcpu_run(vcpu); trace_kvm_out(vcpu); guest_exit_irqoff(); @@ -1236,7 +1235,7 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) * end up causing an exception to be delivered to the Guest * Kernel */ - er = kvm_mips_check_privilege(cause, opc, run, vcpu); + er = kvm_mips_check_privilege(cause, opc, vcpu); if (er == EMULATE_PRIV_FAIL) { goto skip_emul; } else if (er == EMULATE_FAIL) { @@ -1385,7 +1384,7 @@ skip_emul: */ smp_store_mb(vcpu->mode, IN_GUEST_MODE); - kvm_mips_callbacks->vcpu_reenter(run, vcpu); + kvm_mips_callbacks->vcpu_reenter(vcpu); /* * If FPU / MSA are enabled (i.e. the guest's FPU / MSA context diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index 49bd160f4d85..87fa8d8a1031 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -25,41 +25,9 @@ #define KVM_MMU_CACHE_MIN_PAGES 2 #endif -static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, - int min, int max) -{ - void *page; - - BUG_ON(max > KVM_NR_MEM_OBJS); - if (cache->nobjs >= min) - return 0; - while (cache->nobjs < max) { - page = (void *)__get_free_page(GFP_KERNEL); - if (!page) - return -ENOMEM; - cache->objects[cache->nobjs++] = page; - } - return 0; -} - -static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) -{ - while (mc->nobjs) - free_page((unsigned long)mc->objects[--mc->nobjs]); -} - -static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) -{ - void *p; - - BUG_ON(!mc || !mc->nobjs); - p = mc->objects[--mc->nobjs]; - return p; -} - void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu) { - mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); } /** @@ -153,7 +121,7 @@ static pte_t *kvm_mips_walk_pgd(pgd_t *pgd, struct kvm_mmu_memory_cache *cache, if (!cache) return NULL; - new_pmd = mmu_memory_cache_alloc(cache); + new_pmd = kvm_mmu_memory_cache_alloc(cache); pmd_init((unsigned long)new_pmd, (unsigned long)invalid_pte_table); pud_populate(NULL, pud, new_pmd); @@ -164,7 +132,7 @@ static pte_t *kvm_mips_walk_pgd(pgd_t *pgd, struct kvm_mmu_memory_cache *cache, if (!cache) return NULL; - new_pte = mmu_memory_cache_alloc(cache); + new_pte = kvm_mmu_memory_cache_alloc(cache); clear_page(new_pte); pmd_populate_kernel(NULL, pmd, new_pte); } @@ -711,8 +679,7 @@ static int kvm_mips_map_page(struct kvm_vcpu *vcpu, unsigned long gpa, goto out; /* We need a minimum of cached pages ready for page table creation */ - err = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES, - KVM_NR_MEM_OBJS); + err = kvm_mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES); if (err) goto out; @@ -796,8 +763,7 @@ static pte_t *kvm_trap_emul_pte_for_gva(struct kvm_vcpu *vcpu, int ret; /* We need a minimum of cached pages ready for page table creation */ - ret = mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES, - KVM_NR_MEM_OBJS); + ret = kvm_mmu_topup_memory_cache(memcache, KVM_MMU_CACHE_MIN_PAGES); if (ret) return NULL; diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index 34ad0b46e610..f8cba51e1054 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -67,7 +67,6 @@ static int kvm_trap_emul_no_handler(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; - struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *) vcpu->arch.pc; u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; @@ -81,14 +80,14 @@ static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu) * Unusable/no FPU in guest: * deliver guest COP1 Unusable Exception */ - er = kvm_mips_emulate_fpu_exc(cause, opc, run, vcpu); + er = kvm_mips_emulate_fpu_exc(cause, opc, vcpu); } else { /* Restore FPU state */ kvm_own_fpu(vcpu); er = EMULATE_DONE; } } else { - er = kvm_mips_emulate_inst(cause, opc, run, vcpu); + er = kvm_mips_emulate_inst(cause, opc, vcpu); } switch (er) { @@ -97,12 +96,12 @@ static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu) break; case EMULATE_FAIL: - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; break; case EMULATE_WAIT: - run->exit_reason = KVM_EXIT_INTR; + vcpu->run->exit_reason = KVM_EXIT_INTR; ret = RESUME_HOST; break; @@ -116,8 +115,7 @@ static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu) return ret; } -static int kvm_mips_bad_load(u32 cause, u32 *opc, struct kvm_run *run, - struct kvm_vcpu *vcpu) +static int kvm_mips_bad_load(u32 cause, u32 *opc, struct kvm_vcpu *vcpu) { enum emulation_result er; union mips_instruction inst; @@ -125,7 +123,7 @@ static int kvm_mips_bad_load(u32 cause, u32 *opc, struct kvm_run *run, /* A code fetch fault doesn't count as an MMIO */ if (kvm_is_ifetch_fault(&vcpu->arch)) { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; return RESUME_HOST; } @@ -134,23 +132,22 @@ static int kvm_mips_bad_load(u32 cause, u32 *opc, struct kvm_run *run, opc += 1; err = kvm_get_badinstr(opc, vcpu, &inst.word); if (err) { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; return RESUME_HOST; } /* Emulate the load */ - er = kvm_mips_emulate_load(inst, cause, run, vcpu); + er = kvm_mips_emulate_load(inst, cause, vcpu); if (er == EMULATE_FAIL) { kvm_err("Emulate load from MMIO space failed\n"); - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; } else { - run->exit_reason = KVM_EXIT_MMIO; + vcpu->run->exit_reason = KVM_EXIT_MMIO; } return RESUME_HOST; } -static int kvm_mips_bad_store(u32 cause, u32 *opc, struct kvm_run *run, - struct kvm_vcpu *vcpu) +static int kvm_mips_bad_store(u32 cause, u32 *opc, struct kvm_vcpu *vcpu) { enum emulation_result er; union mips_instruction inst; @@ -161,34 +158,33 @@ static int kvm_mips_bad_store(u32 cause, u32 *opc, struct kvm_run *run, opc += 1; err = kvm_get_badinstr(opc, vcpu, &inst.word); if (err) { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; return RESUME_HOST; } /* Emulate the store */ - er = kvm_mips_emulate_store(inst, cause, run, vcpu); + er = kvm_mips_emulate_store(inst, cause, vcpu); if (er == EMULATE_FAIL) { kvm_err("Emulate store to MMIO space failed\n"); - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; } else { - run->exit_reason = KVM_EXIT_MMIO; + vcpu->run->exit_reason = KVM_EXIT_MMIO; } return RESUME_HOST; } -static int kvm_mips_bad_access(u32 cause, u32 *opc, struct kvm_run *run, +static int kvm_mips_bad_access(u32 cause, u32 *opc, struct kvm_vcpu *vcpu, bool store) { if (store) - return kvm_mips_bad_store(cause, opc, run, vcpu); + return kvm_mips_bad_store(cause, opc, vcpu); else - return kvm_mips_bad_load(cause, opc, run, vcpu); + return kvm_mips_bad_load(cause, opc, vcpu); } static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; - struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; u32 cause = vcpu->arch.host_cp0_cause; @@ -212,12 +208,12 @@ static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu) * They would indicate stale host TLB entries. */ if (unlikely(index < 0)) { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; return RESUME_HOST; } tlb = vcpu->arch.guest_tlb + index; if (unlikely(!TLB_IS_VALID(*tlb, badvaddr))) { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; return RESUME_HOST; } @@ -226,23 +222,23 @@ static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu) * exception. Relay that on to the guest so it can handle it. */ if (!TLB_IS_DIRTY(*tlb, badvaddr)) { - kvm_mips_emulate_tlbmod(cause, opc, run, vcpu); + kvm_mips_emulate_tlbmod(cause, opc, vcpu); return RESUME_GUEST; } if (kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, badvaddr, true)) /* Not writable, needs handling as MMIO */ - return kvm_mips_bad_store(cause, opc, run, vcpu); + return kvm_mips_bad_store(cause, opc, vcpu); return RESUME_GUEST; } else if (KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG0) { if (kvm_mips_handle_kseg0_tlb_fault(badvaddr, vcpu, true) < 0) /* Not writable, needs handling as MMIO */ - return kvm_mips_bad_store(cause, opc, run, vcpu); + return kvm_mips_bad_store(cause, opc, vcpu); return RESUME_GUEST; } else { /* host kernel addresses are all handled as MMIO */ - return kvm_mips_bad_store(cause, opc, run, vcpu); + return kvm_mips_bad_store(cause, opc, vcpu); } } @@ -276,7 +272,7 @@ static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store) * into the shadow host TLB */ - er = kvm_mips_handle_tlbmiss(cause, opc, run, vcpu, store); + er = kvm_mips_handle_tlbmiss(cause, opc, vcpu, store); if (er == EMULATE_DONE) ret = RESUME_GUEST; else { @@ -289,14 +285,14 @@ static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store) * not expect to ever get them */ if (kvm_mips_handle_kseg0_tlb_fault(badvaddr, vcpu, store) < 0) - ret = kvm_mips_bad_access(cause, opc, run, vcpu, store); + ret = kvm_mips_bad_access(cause, opc, vcpu, store); } else if (KVM_GUEST_KERNEL_MODE(vcpu) && (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1)) { /* * With EVA we may get a TLB exception instead of an address * error when the guest performs MMIO to KSeg1 addresses. */ - ret = kvm_mips_bad_access(cause, opc, run, vcpu, store); + ret = kvm_mips_bad_access(cause, opc, vcpu, store); } else { kvm_err("Illegal TLB %s fault address , cause %#x, PC: %p, BadVaddr: %#lx\n", store ? "ST" : "LD", cause, opc, badvaddr); @@ -320,7 +316,6 @@ static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu) { - struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; u32 cause = vcpu->arch.host_cp0_cause; @@ -328,11 +323,11 @@ static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu) if (KVM_GUEST_KERNEL_MODE(vcpu) && (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1)) { - ret = kvm_mips_bad_store(cause, opc, run, vcpu); + ret = kvm_mips_bad_store(cause, opc, vcpu); } else { kvm_err("Address Error (STORE): cause %#x, PC: %p, BadVaddr: %#lx\n", cause, opc, badvaddr); - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; } return ret; @@ -340,18 +335,17 @@ static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_addr_err_ld(struct kvm_vcpu *vcpu) { - struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *) vcpu->arch.pc; unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; u32 cause = vcpu->arch.host_cp0_cause; int ret = RESUME_GUEST; if (KSEGX(badvaddr) == CKSEG0 || KSEGX(badvaddr) == CKSEG1) { - ret = kvm_mips_bad_load(cause, opc, run, vcpu); + ret = kvm_mips_bad_load(cause, opc, vcpu); } else { kvm_err("Address Error (LOAD): cause %#x, PC: %p, BadVaddr: %#lx\n", cause, opc, badvaddr); - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; } return ret; @@ -359,17 +353,16 @@ static int kvm_trap_emul_handle_addr_err_ld(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_syscall(struct kvm_vcpu *vcpu) { - struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *) vcpu->arch.pc; u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; - er = kvm_mips_emulate_syscall(cause, opc, run, vcpu); + er = kvm_mips_emulate_syscall(cause, opc, vcpu); if (er == EMULATE_DONE) ret = RESUME_GUEST; else { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; } return ret; @@ -377,17 +370,16 @@ static int kvm_trap_emul_handle_syscall(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_res_inst(struct kvm_vcpu *vcpu) { - struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *) vcpu->arch.pc; u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; - er = kvm_mips_handle_ri(cause, opc, run, vcpu); + er = kvm_mips_handle_ri(cause, opc, vcpu); if (er == EMULATE_DONE) ret = RESUME_GUEST; else { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; } return ret; @@ -395,17 +387,16 @@ static int kvm_trap_emul_handle_res_inst(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_break(struct kvm_vcpu *vcpu) { - struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *) vcpu->arch.pc; u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; - er = kvm_mips_emulate_bp_exc(cause, opc, run, vcpu); + er = kvm_mips_emulate_bp_exc(cause, opc, vcpu); if (er == EMULATE_DONE) ret = RESUME_GUEST; else { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; } return ret; @@ -413,17 +404,16 @@ static int kvm_trap_emul_handle_break(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_trap(struct kvm_vcpu *vcpu) { - struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *)vcpu->arch.pc; u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; - er = kvm_mips_emulate_trap_exc(cause, opc, run, vcpu); + er = kvm_mips_emulate_trap_exc(cause, opc, vcpu); if (er == EMULATE_DONE) { ret = RESUME_GUEST; } else { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; } return ret; @@ -431,17 +421,16 @@ static int kvm_trap_emul_handle_trap(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_msa_fpe(struct kvm_vcpu *vcpu) { - struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *)vcpu->arch.pc; u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; - er = kvm_mips_emulate_msafpe_exc(cause, opc, run, vcpu); + er = kvm_mips_emulate_msafpe_exc(cause, opc, vcpu); if (er == EMULATE_DONE) { ret = RESUME_GUEST; } else { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; } return ret; @@ -449,17 +438,16 @@ static int kvm_trap_emul_handle_msa_fpe(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_fpe(struct kvm_vcpu *vcpu) { - struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *)vcpu->arch.pc; u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; int ret = RESUME_GUEST; - er = kvm_mips_emulate_fpe_exc(cause, opc, run, vcpu); + er = kvm_mips_emulate_fpe_exc(cause, opc, vcpu); if (er == EMULATE_DONE) { ret = RESUME_GUEST; } else { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; } return ret; @@ -474,7 +462,6 @@ static int kvm_trap_emul_handle_fpe(struct kvm_vcpu *vcpu) static int kvm_trap_emul_handle_msa_disabled(struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; - struct kvm_run *run = vcpu->run; u32 __user *opc = (u32 __user *) vcpu->arch.pc; u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_DONE; @@ -486,10 +473,10 @@ static int kvm_trap_emul_handle_msa_disabled(struct kvm_vcpu *vcpu) * No MSA in guest, or FPU enabled and not in FR=1 mode, * guest reserved instruction exception */ - er = kvm_mips_emulate_ri_exc(cause, opc, run, vcpu); + er = kvm_mips_emulate_ri_exc(cause, opc, vcpu); } else if (!(kvm_read_c0_guest_config5(cop0) & MIPS_CONF5_MSAEN)) { /* MSA disabled by guest, guest MSA disabled exception */ - er = kvm_mips_emulate_msadis_exc(cause, opc, run, vcpu); + er = kvm_mips_emulate_msadis_exc(cause, opc, vcpu); } else { /* Restore MSA/FPU state */ kvm_own_msa(vcpu); @@ -502,7 +489,7 @@ static int kvm_trap_emul_handle_msa_disabled(struct kvm_vcpu *vcpu) break; case EMULATE_FAIL: - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; break; @@ -1184,8 +1171,7 @@ void kvm_trap_emul_gva_lockless_end(struct kvm_vcpu *vcpu) local_irq_enable(); } -static void kvm_trap_emul_vcpu_reenter(struct kvm_run *run, - struct kvm_vcpu *vcpu) +static void kvm_trap_emul_vcpu_reenter(struct kvm_vcpu *vcpu) { struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; struct mm_struct *user_mm = &vcpu->arch.guest_user_mm; @@ -1228,7 +1214,7 @@ static void kvm_trap_emul_vcpu_reenter(struct kvm_run *run, check_mmu_context(mm); } -static int kvm_trap_emul_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) +static int kvm_trap_emul_vcpu_run(struct kvm_vcpu *vcpu) { int cpu = smp_processor_id(); int r; @@ -1237,7 +1223,7 @@ static int kvm_trap_emul_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) kvm_mips_deliver_interrupts(vcpu, kvm_read_c0_guest_cause(vcpu->arch.cop0)); - kvm_trap_emul_vcpu_reenter(run, vcpu); + kvm_trap_emul_vcpu_reenter(vcpu); /* * We use user accessors to access guest memory, but we don't want to @@ -1255,7 +1241,7 @@ static int kvm_trap_emul_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) */ kvm_mips_suspend_mm(cpu); - r = vcpu->arch.vcpu_run(run, vcpu); + r = vcpu->arch.vcpu_run(vcpu->run, vcpu); /* We may have migrated while handling guest exits */ cpu = smp_processor_id(); diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c index 9d03bd0a604a..3932f767e938 100644 --- a/arch/mips/kvm/vz.c +++ b/arch/mips/kvm/vz.c @@ -874,7 +874,6 @@ static void kvm_write_maari(struct kvm_vcpu *vcpu, unsigned long val) static enum emulation_result kvm_vz_gpsi_cop0(union mips_instruction inst, u32 *opc, u32 cause, - struct kvm_run *run, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -1074,7 +1073,6 @@ static enum emulation_result kvm_vz_gpsi_cop0(union mips_instruction inst, static enum emulation_result kvm_vz_gpsi_cache(union mips_instruction inst, u32 *opc, u32 cause, - struct kvm_run *run, struct kvm_vcpu *vcpu) { enum emulation_result er = EMULATE_DONE; @@ -1217,7 +1215,6 @@ static enum emulation_result kvm_trap_vz_handle_gpsi(u32 cause, u32 *opc, { enum emulation_result er = EMULATE_DONE; struct kvm_vcpu_arch *arch = &vcpu->arch; - struct kvm_run *run = vcpu->run; union mips_instruction inst; int rd, rt, sel; int err; @@ -1233,12 +1230,12 @@ static enum emulation_result kvm_trap_vz_handle_gpsi(u32 cause, u32 *opc, switch (inst.r_format.opcode) { case cop0_op: - er = kvm_vz_gpsi_cop0(inst, opc, cause, run, vcpu); + er = kvm_vz_gpsi_cop0(inst, opc, cause, vcpu); break; #ifndef CONFIG_CPU_MIPSR6 case cache_op: trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE); - er = kvm_vz_gpsi_cache(inst, opc, cause, run, vcpu); + er = kvm_vz_gpsi_cache(inst, opc, cause, vcpu); break; #endif #ifdef CONFIG_CPU_LOONGSON64 @@ -1251,7 +1248,7 @@ static enum emulation_result kvm_trap_vz_handle_gpsi(u32 cause, u32 *opc, #ifdef CONFIG_CPU_MIPSR6 case cache6_op: trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE); - er = kvm_vz_gpsi_cache(inst, opc, cause, run, vcpu); + er = kvm_vz_gpsi_cache(inst, opc, cause, vcpu); break; #endif case rdhwr_op: @@ -1553,7 +1550,6 @@ static int kvm_trap_vz_handle_guest_exit(struct kvm_vcpu *vcpu) */ static int kvm_trap_vz_handle_cop_unusable(struct kvm_vcpu *vcpu) { - struct kvm_run *run = vcpu->run; u32 cause = vcpu->arch.host_cp0_cause; enum emulation_result er = EMULATE_FAIL; int ret = RESUME_GUEST; @@ -1581,7 +1577,7 @@ static int kvm_trap_vz_handle_cop_unusable(struct kvm_vcpu *vcpu) break; case EMULATE_FAIL: - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ret = RESUME_HOST; break; @@ -1600,8 +1596,6 @@ static int kvm_trap_vz_handle_cop_unusable(struct kvm_vcpu *vcpu) */ static int kvm_trap_vz_handle_msa_disabled(struct kvm_vcpu *vcpu) { - struct kvm_run *run = vcpu->run; - /* * If MSA not present or not exposed to guest or FR=0, the MSA operation * should have been treated as a reserved instruction! @@ -1612,7 +1606,7 @@ static int kvm_trap_vz_handle_msa_disabled(struct kvm_vcpu *vcpu) (read_gc0_status() & (ST0_CU1 | ST0_FR)) == ST0_CU1 || !(read_gc0_config5() & MIPS_CONF5_MSAEN) || vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA) { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; return RESUME_HOST; } @@ -1648,7 +1642,7 @@ static int kvm_trap_vz_handle_tlb_ld_miss(struct kvm_vcpu *vcpu) } /* Treat as MMIO */ - er = kvm_mips_emulate_load(inst, cause, run, vcpu); + er = kvm_mips_emulate_load(inst, cause, vcpu); if (er == EMULATE_FAIL) { kvm_err("Guest Emulate Load from MMIO space failed: PC: %p, BadVaddr: %#lx\n", opc, badvaddr); @@ -1695,7 +1689,7 @@ static int kvm_trap_vz_handle_tlb_st_miss(struct kvm_vcpu *vcpu) } /* Treat as MMIO */ - er = kvm_mips_emulate_store(inst, cause, run, vcpu); + er = kvm_mips_emulate_store(inst, cause, vcpu); if (er == EMULATE_FAIL) { kvm_err("Guest Emulate Store to MMIO space failed: PC: %p, BadVaddr: %#lx\n", opc, badvaddr); @@ -3242,7 +3236,7 @@ static void kvm_vz_flush_shadow_memslot(struct kvm *kvm, kvm_vz_flush_shadow_all(kvm); } -static void kvm_vz_vcpu_reenter(struct kvm_run *run, struct kvm_vcpu *vcpu) +static void kvm_vz_vcpu_reenter(struct kvm_vcpu *vcpu) { int cpu = smp_processor_id(); int preserve_guest_tlb; @@ -3258,7 +3252,7 @@ static void kvm_vz_vcpu_reenter(struct kvm_run *run, struct kvm_vcpu *vcpu) kvm_vz_vcpu_load_wired(vcpu); } -static int kvm_vz_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) +static int kvm_vz_vcpu_run(struct kvm_vcpu *vcpu) { int cpu = smp_processor_id(); int r; @@ -3271,7 +3265,7 @@ static int kvm_vz_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) kvm_vz_vcpu_load_tlb(vcpu, cpu); kvm_vz_vcpu_load_wired(vcpu); - r = vcpu->arch.vcpu_run(run, vcpu); + r = vcpu->arch.vcpu_run(vcpu->run, vcpu); kvm_vz_vcpu_save_wired(vcpu); diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild index dadbcf3a0b1e..2d444d09b553 100644 --- a/arch/powerpc/include/asm/Kbuild +++ b/arch/powerpc/include/asm/Kbuild @@ -4,6 +4,7 @@ generated-y += syscall_table_64.h generated-y += syscall_table_c32.h generated-y += syscall_table_spu.h generic-y += export.h +generic-y += kvm_types.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += vtime.h diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index 83f6e85de7bc..319efa0e6d02 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -6,5 +6,6 @@ generated-y += unistd_nr.h generic-y += asm-offsets.h generic-y += export.h +generic-y += kvm_types.h generic-y += local64.h generic-y += mcs_spinlock.h diff --git a/arch/s390/include/asm/diag.h b/arch/s390/include/asm/diag.h index 0036eab14391..ca8f85b53a90 100644 --- a/arch/s390/include/asm/diag.h +++ b/arch/s390/include/asm/diag.h @@ -298,10 +298,8 @@ struct diag26c_mac_resp { union diag318_info { unsigned long val; struct { - unsigned int cpnc : 8; - unsigned int cpvc_linux : 24; - unsigned char cpvc_distro[3]; - unsigned char zero; + unsigned long cpnc : 8; + unsigned long cpvc : 56; }; }; diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 6ea0820e7c7f..463c24e26000 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -260,7 +260,8 @@ struct kvm_s390_sie_block { __u32 scaol; /* 0x0064 */ __u8 sdf; /* 0x0068 */ __u8 epdx; /* 0x0069 */ - __u8 reserved6a[2]; /* 0x006a */ + __u8 cpnc; /* 0x006a */ + __u8 reserved6b; /* 0x006b */ __u32 todpr; /* 0x006c */ #define GISA_FORMAT1 0x00000001 __u32 gd; /* 0x0070 */ @@ -745,6 +746,7 @@ struct kvm_vcpu_arch { bool gs_enabled; bool skey_enabled; struct kvm_s390_pv_vcpu pv; + union diag318_info diag318_info; }; struct kvm_vm_stat { diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index 436ec7636927..7a6b14874d65 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -231,11 +231,13 @@ struct kvm_guest_debug_arch { #define KVM_SYNC_GSCB (1UL << 9) #define KVM_SYNC_BPBC (1UL << 10) #define KVM_SYNC_ETOKEN (1UL << 11) +#define KVM_SYNC_DIAG318 (1UL << 12) #define KVM_SYNC_S390_VALID_FIELDS \ (KVM_SYNC_PREFIX | KVM_SYNC_GPRS | KVM_SYNC_ACRS | KVM_SYNC_CRS | \ KVM_SYNC_ARCH0 | KVM_SYNC_PFAULT | KVM_SYNC_VRS | KVM_SYNC_RICCB | \ - KVM_SYNC_FPRS | KVM_SYNC_GSCB | KVM_SYNC_BPBC | KVM_SYNC_ETOKEN) + KVM_SYNC_FPRS | KVM_SYNC_GSCB | KVM_SYNC_BPBC | KVM_SYNC_ETOKEN | \ + KVM_SYNC_DIAG318) /* length and alignment of the sdnx as a power of two */ #define SDNXC 8 @@ -264,7 +266,8 @@ struct kvm_sync_regs { __u8 reserved2 : 7; __u8 padding1[51]; /* riccb needs to be 64byte aligned */ __u8 riccb[64]; /* runtime instrumentation controls block */ - __u8 padding2[192]; /* sdnx needs to be 256byte aligned */ + __u64 diag318; /* diagnose 0x318 info */ + __u8 padding2[184]; /* sdnx needs to be 256byte aligned */ union { __u8 sdnx[SDNXL]; /* state description annex */ struct { diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 0c4194d407ac..e600f6953d7c 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -1021,8 +1021,7 @@ static void __init setup_control_program_code(void) { union diag318_info diag318_info = { .cpnc = CPNC_LINUX, - .cpvc_linux = 0, - .cpvc_distro = {0}, + .cpvc = 0, }; if (!sclp.has_diag318) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index d47c19718615..66da278a67fb 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -545,6 +545,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_AIS_MIGRATION: case KVM_CAP_S390_VCPU_RESETS: case KVM_CAP_SET_GUEST_DEBUG: + case KVM_CAP_S390_DIAG318: r = 1; break; case KVM_CAP_S390_HPAGE_1M: @@ -3267,7 +3268,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) KVM_SYNC_ACRS | KVM_SYNC_CRS | KVM_SYNC_ARCH0 | - KVM_SYNC_PFAULT; + KVM_SYNC_PFAULT | + KVM_SYNC_DIAG318; kvm_s390_set_prefix(vcpu, 0); if (test_kvm_facility(vcpu->kvm, 64)) vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB; @@ -3562,6 +3564,7 @@ static void kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->pp = 0; vcpu->arch.sie_block->fpf &= ~FPF_BPBC; vcpu->arch.sie_block->todpr = 0; + vcpu->arch.sie_block->cpnc = 0; } } @@ -3579,6 +3582,7 @@ static void kvm_arch_vcpu_ioctl_clear_reset(struct kvm_vcpu *vcpu) regs->etoken = 0; regs->etoken_extension = 0; + regs->diag318 = 0; } int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) @@ -3954,33 +3958,31 @@ bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu) return true; } -static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu) +static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu) { hva_t hva; struct kvm_arch_async_pf arch; - int rc; if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID) - return 0; + return false; if ((vcpu->arch.sie_block->gpsw.mask & vcpu->arch.pfault_select) != vcpu->arch.pfault_compare) - return 0; + return false; if (psw_extint_disabled(vcpu)) - return 0; + return false; if (kvm_s390_vcpu_has_irq(vcpu, 0)) - return 0; + return false; if (!(vcpu->arch.sie_block->gcr[0] & CR0_SERVICE_SIGNAL_SUBMASK)) - return 0; + return false; if (!vcpu->arch.gmap->pfault_enabled) - return 0; + return false; hva = gfn_to_hva(vcpu->kvm, gpa_to_gfn(current->thread.gmap_addr)); hva += current->thread.gmap_addr & ~PAGE_MASK; if (read_guest_real(vcpu, vcpu->arch.pfault_token, &arch.pfault_token, 8)) - return 0; + return false; - rc = kvm_setup_async_pf(vcpu, current->thread.gmap_addr, hva, &arch); - return rc; + return kvm_setup_async_pf(vcpu, current->thread.gmap_addr, hva, &arch); } static int vcpu_pre_run(struct kvm_vcpu *vcpu) @@ -4175,8 +4177,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) return rc; } -static void sync_regs_fmt2(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static void sync_regs_fmt2(struct kvm_vcpu *vcpu) { + struct kvm_run *kvm_run = vcpu->run; struct runtime_instr_cb *riccb; struct gs_cb *gscb; @@ -4196,6 +4199,10 @@ static void sync_regs_fmt2(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (vcpu->arch.pfault_token == KVM_S390_PFAULT_TOKEN_INVALID) kvm_clear_async_pf_completion_queue(vcpu); } + if (kvm_run->kvm_dirty_regs & KVM_SYNC_DIAG318) { + vcpu->arch.diag318_info.val = kvm_run->s.regs.diag318; + vcpu->arch.sie_block->cpnc = vcpu->arch.diag318_info.cpnc; + } /* * If userspace sets the riccb (e.g. after migration) to a valid state, * we should enable RI here instead of doing the lazy enablement. @@ -4242,8 +4249,10 @@ static void sync_regs_fmt2(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) /* SIE will load etoken directly from SDNX and therefore kvm_run */ } -static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static void sync_regs(struct kvm_vcpu *vcpu) { + struct kvm_run *kvm_run = vcpu->run; + if (kvm_run->kvm_dirty_regs & KVM_SYNC_PREFIX) kvm_s390_set_prefix(vcpu, kvm_run->s.regs.prefix); if (kvm_run->kvm_dirty_regs & KVM_SYNC_CRS) { @@ -4272,7 +4281,7 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) /* Sync fmt2 only data */ if (likely(!kvm_s390_pv_cpu_is_protected(vcpu))) { - sync_regs_fmt2(vcpu, kvm_run); + sync_regs_fmt2(vcpu); } else { /* * In several places we have to modify our internal view to @@ -4291,12 +4300,15 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_run->kvm_dirty_regs = 0; } -static void store_regs_fmt2(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static void store_regs_fmt2(struct kvm_vcpu *vcpu) { + struct kvm_run *kvm_run = vcpu->run; + kvm_run->s.regs.todpr = vcpu->arch.sie_block->todpr; kvm_run->s.regs.pp = vcpu->arch.sie_block->pp; kvm_run->s.regs.gbea = vcpu->arch.sie_block->gbea; kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == FPF_BPBC; + kvm_run->s.regs.diag318 = vcpu->arch.diag318_info.val; if (MACHINE_HAS_GS) { __ctl_set_bit(2, 4); if (vcpu->arch.gs_enabled) @@ -4312,8 +4324,10 @@ static void store_regs_fmt2(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) /* SIE will save etoken directly into SDNX and therefore kvm_run */ } -static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static void store_regs(struct kvm_vcpu *vcpu) { + struct kvm_run *kvm_run = vcpu->run; + kvm_run->psw_mask = vcpu->arch.sie_block->gpsw.mask; kvm_run->psw_addr = vcpu->arch.sie_block->gpsw.addr; kvm_run->s.regs.prefix = kvm_s390_get_prefix(vcpu); @@ -4332,7 +4346,7 @@ static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) current->thread.fpu.fpc = vcpu->arch.host_fpregs.fpc; current->thread.fpu.regs = vcpu->arch.host_fpregs.regs; if (likely(!kvm_s390_pv_cpu_is_protected(vcpu))) - store_regs_fmt2(vcpu, kvm_run); + store_regs_fmt2(vcpu); } int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) @@ -4370,7 +4384,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) goto out; } - sync_regs(vcpu, kvm_run); + sync_regs(vcpu); enable_cpu_timer_accounting(vcpu); might_fault(); @@ -4392,7 +4406,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) } disable_cpu_timer_accounting(vcpu); - store_regs(vcpu, kvm_run); + store_regs(vcpu); kvm_sigset_deactivate(vcpu); diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 9e9056cebfcf..4f3cbf6003a9 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -548,6 +548,7 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) scb_s->ecd |= scb_o->ecd & ECD_ETOKENF; scb_s->hpid = HPID_VSIE; + scb_s->cpnc = scb_o->cpnc; prepare_ibc(vcpu, vsie_page); rc = shadow_crycb(vcpu, vsie_page); diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fd03cefabd34..9a2849527dd7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -803,6 +803,7 @@ config KVM_GUEST depends on PARAVIRT select PARAVIRT_CLOCK select ARCH_CPUIDLE_HALTPOLL + select X86_HV_CALLBACK_VECTOR default y help This option enables various optimizations for running under the KVM diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 07533795b8d2..275e7fd20310 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -67,12 +67,12 @@ static inline void kvm_set_cpu_l1tf_flush_l1d(void) __this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 1); } -static inline void kvm_clear_cpu_l1tf_flush_l1d(void) +static __always_inline void kvm_clear_cpu_l1tf_flush_l1d(void) { __this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 0); } -static inline bool kvm_get_cpu_l1tf_flush_l1d(void) +static __always_inline bool kvm_get_cpu_l1tf_flush_l1d(void) { return __this_cpu_read(irq_stat.kvm_cpu_l1tf_flush_l1d); } diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index ff198fc2495e..a43366191212 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -632,6 +632,10 @@ DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_acrn_hv_callback); DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_xen_hvm_callback); #endif +#ifdef CONFIG_KVM_GUEST +DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_kvm_asyncpf_interrupt); +#endif + #undef X86_TRAP_OTHER #endif diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index be5363b21540..5ab3af7275d8 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -193,8 +193,6 @@ struct x86_exception; enum x86_intercept; enum x86_intercept_stage; -#define KVM_NR_MEM_OBJS 40 - #define KVM_NR_DB_REGS 4 #define DR6_BD (1 << 13) @@ -245,15 +243,6 @@ enum x86_intercept_stage; struct kvm_kernel_irq_routing_entry; -/* - * We don't want allocation failures within the mmu code, so we preallocate - * enough memory for a single page fault in a cache. - */ -struct kvm_mmu_memory_cache { - int nobjs; - void *objects[KVM_NR_MEM_OBJS]; -}; - /* * the pages used as guest page table on soft mmu are tracked by * kvm_memory_slot.arch.gfn_track which is 16 bits, so the role bits used @@ -322,43 +311,6 @@ struct kvm_rmap_head { unsigned long val; }; -struct kvm_mmu_page { - struct list_head link; - struct hlist_node hash_link; - struct list_head lpage_disallowed_link; - - bool unsync; - u8 mmu_valid_gen; - bool mmio_cached; - bool lpage_disallowed; /* Can't be replaced by an equiv large page */ - - /* - * The following two entries are used to key the shadow page in the - * hash table. - */ - union kvm_mmu_page_role role; - gfn_t gfn; - - u64 *spt; - /* hold the gfn of each spte inside spt */ - gfn_t *gfns; - int root_count; /* Currently serving as active root */ - unsigned int unsync_children; - struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ - DECLARE_BITMAP(unsync_child_bitmap, 512); - -#ifdef CONFIG_X86_32 - /* - * Used out of the mmu-lock to avoid reading spte values while an - * update is in progress; see the comments in __get_spte_lockless(). - */ - int clear_spte_count; -#endif - - /* Number of writes since the last time traversal visited this page. */ - atomic_t write_flooding_count; -}; - struct kvm_pio_request { unsigned long linear_rip; unsigned long count; @@ -384,6 +336,8 @@ struct kvm_mmu_root_info { #define KVM_MMU_NUM_PREV_ROOTS 3 +struct kvm_mmu_page; + /* * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit, * and 2-level 32-bit). The kvm_mmu structure abstracts the details of the @@ -580,6 +534,7 @@ struct kvm_vcpu_arch { unsigned long cr3; unsigned long cr4; unsigned long cr4_guest_owned_bits; + unsigned long cr4_guest_rsvd_bits; unsigned long cr8; u32 host_pkru; u32 pkru; @@ -635,7 +590,8 @@ struct kvm_vcpu_arch { struct kvm_mmu *walk_mmu; struct kvm_mmu_memory_cache mmu_pte_list_desc_cache; - struct kvm_mmu_memory_cache mmu_page_cache; + struct kvm_mmu_memory_cache mmu_shadow_page_cache; + struct kvm_mmu_memory_cache mmu_gfn_array_cache; struct kvm_mmu_memory_cache mmu_page_header_cache; /* @@ -683,7 +639,7 @@ struct kvm_vcpu_arch { struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES]; int maxphyaddr; - int tdp_level; + int max_tdp_level; /* emulate context */ @@ -827,6 +783,9 @@ struct kvm_vcpu_arch { /* Flush the L1 Data cache for L1TF mitigation on VMENTER */ bool l1tf_flush_l1d; + /* Host CPU on which VM-entry was most recently attempted */ + unsigned int last_vmentry_cpu; + /* AMD MSRC001_0015 Hardware Configuration */ u64 msr_hwcr; }; @@ -1083,7 +1042,7 @@ struct kvm_x86_ops { void (*hardware_unsetup)(void); bool (*cpu_has_accelerated_tpr)(void); bool (*has_emulated_msr)(u32 index); - void (*cpuid_update)(struct kvm_vcpu *vcpu); + void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu); unsigned int vm_size; int (*vm_init)(struct kvm *kvm); @@ -1098,7 +1057,7 @@ struct kvm_x86_ops { void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); void (*vcpu_put)(struct kvm_vcpu *vcpu); - void (*update_bp_intercept)(struct kvm_vcpu *vcpu); + void (*update_exception_bitmap)(struct kvm_vcpu *vcpu); int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); @@ -1174,10 +1133,10 @@ struct kvm_x86_ops { int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr); - int (*get_tdp_level)(struct kvm_vcpu *vcpu); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); - void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, unsigned long cr3); + void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, unsigned long pgd, + int pgd_level); bool (*has_wbinvd_exit)(void); @@ -1220,7 +1179,6 @@ struct kvm_x86_ops { void (*enable_log_dirty_pt_masked)(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t offset, unsigned long mask); - int (*write_log_dirty)(struct kvm_vcpu *vcpu, gpa_t l2_gpa); /* pmu operations of sub-arch */ const struct kvm_pmu_ops *pmu_ops; @@ -1281,6 +1239,7 @@ struct kvm_x86_nested_ops { struct kvm_nested_state __user *user_kvm_nested_state, struct kvm_nested_state *kvm_state); bool (*get_vmcs12_pages)(struct kvm_vcpu *vcpu); + int (*write_log_dirty)(struct kvm_vcpu *vcpu, gpa_t l2_gpa); int (*enable_evmcs)(struct kvm_vcpu *vcpu, uint16_t *vmcs_version); @@ -1304,7 +1263,7 @@ struct kvm_arch_async_pf { }; extern u64 __read_mostly host_efer; - +extern bool __read_mostly allow_smaller_maxphyaddr; extern struct kvm_x86_ops kvm_x86_ops; #define __KVM_HAVE_ARCH_VM_ALLOC @@ -1549,20 +1508,8 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush, bool skip_mmu_sync); -void kvm_configure_mmu(bool enable_tdp, int tdp_page_level); - -static inline gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, - struct x86_exception *exception) -{ - return gpa; -} - -static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) -{ - struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); - - return (struct kvm_mmu_page *)page_private(page); -} +void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level, + int tdp_huge_page_level); static inline u16 kvm_read_ldt(void) { @@ -1636,7 +1583,15 @@ asmlinkage void kvm_spurious_fault(void); insn "\n\t" \ "jmp 668f \n\t" \ "667: \n\t" \ + "1: \n\t" \ + ".pushsection .discard.instr_begin \n\t" \ + ".long 1b - . \n\t" \ + ".popsection \n\t" \ "call kvm_spurious_fault \n\t" \ + "1: \n\t" \ + ".pushsection .discard.instr_end \n\t" \ + ".long 1b - . \n\t" \ + ".popsection \n\t" \ "668: \n\t" \ _ASM_EXTABLE(666b, 667b) diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 49d3a9edb06f..338119852512 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -4,6 +4,7 @@ #include #include +#include #include extern void kvmclock_init(void); @@ -18,7 +19,7 @@ static inline bool kvm_check_and_clear_guest_paused(void) #endif /* CONFIG_KVM_GUEST */ #define KVM_HYPERCALL \ - ALTERNATIVE(".byte 0x0f,0x01,0xc1", ".byte 0x0f,0x01,0xd9", X86_FEATURE_VMMCALL) + ALTERNATIVE("vmcall", "vmmcall", X86_FEATURE_VMMCALL) /* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall * instruction. The hypervisor may replace it with something else but only the diff --git a/arch/x86/include/asm/kvm_types.h b/arch/x86/include/asm/kvm_types.h new file mode 100644 index 000000000000..08f1b57d3b62 --- /dev/null +++ b/arch/x86/include/asm/kvm_types.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_KVM_TYPES_H +#define _ASM_X86_KVM_TYPES_H + +#define KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE 40 + +#endif /* _ASM_X86_KVM_TYPES_H */ diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h index 444d6fd9a6d8..d86ab942219c 100644 --- a/arch/x86/include/asm/qspinlock.h +++ b/arch/x86/include/asm/qspinlock.h @@ -32,6 +32,7 @@ extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); extern void __pv_init_lock_hash(void); extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock); +extern bool nopvspin; #define queued_spin_unlock queued_spin_unlock /** diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 233c77d056c9..08320b0b2b27 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -7,8 +7,11 @@ * Authors: Anthony Liguori */ +#define pr_fmt(fmt) "kvm-guest: " fmt + #include #include +#include #include #include #include @@ -232,16 +235,11 @@ EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags); noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) { - u32 reason = kvm_read_and_reset_apf_flags(); + u32 flags = kvm_read_and_reset_apf_flags(); irqentry_state_t state; - switch (reason) { - case KVM_PV_REASON_PAGE_NOT_PRESENT: - case KVM_PV_REASON_PAGE_READY: - break; - default: + if (!flags) return false; - } state = irqentry_enter(regs); instrumentation_begin(); @@ -254,13 +252,13 @@ noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) if (unlikely(!(regs->flags & X86_EFLAGS_IF))) panic("Host injected async #PF in interrupt disabled region\n"); - if (reason == KVM_PV_REASON_PAGE_NOT_PRESENT) { + if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) { if (unlikely(!(user_mode(regs)))) panic("Host injected async #PF in kernel mode\n"); /* Page is swapped out by the host. */ kvm_async_pf_task_wait_schedule(token); } else { - kvm_async_pf_task_wake(token); + WARN_ONCE(1, "Unexpected async PF flags: %x\n", flags); } instrumentation_end(); @@ -268,6 +266,27 @@ noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) return true; } +DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + u32 token; + irqentry_state_t state; + + state = irqentry_enter(regs); + + inc_irq_stat(irq_hv_callback_count); + + if (__this_cpu_read(apf_reason.enabled)) { + token = __this_cpu_read(apf_reason.token); + kvm_async_pf_task_wake(token); + __this_cpu_write(apf_reason.token, 0); + wrmsrl(MSR_KVM_ASYNC_PF_ACK, 1); + } + + irqentry_exit(regs, state); + set_irq_regs(old_regs); +} + static void __init paravirt_ops_setup(void) { pv_info.name = "KVM"; @@ -289,8 +308,8 @@ static void kvm_register_steal_time(void) return; wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); - pr_info("kvm-stealtime: cpu %d, msr %llx\n", - cpu, (unsigned long long) slow_virt_to_phys(st)); + pr_info("stealtime: cpu %d, msr %llx\n", cpu, + (unsigned long long) slow_virt_to_phys(st)); } static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; @@ -311,17 +330,19 @@ static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val) static void kvm_guest_cpu_init(void) { - if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) { - u64 pa; + if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) { + u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled)); pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); - pa |= KVM_ASYNC_PF_ENABLED; + pa |= KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT)) pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; + wrmsrl(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR); + wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); __this_cpu_write(apf_reason.enabled, 1); pr_info("KVM setup async PF for cpu %d\n", smp_processor_id()); @@ -493,7 +514,8 @@ static void __send_ipi_mask(const struct cpumask *mask, int vector) } else { ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr); - WARN_ONCE(ret < 0, "KVM: failed to send PV IPI: %ld", ret); + WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld", + ret); min = max = apic_id; ipi_bitmap = 0; } @@ -503,7 +525,8 @@ static void __send_ipi_mask(const struct cpumask *mask, int vector) if (ipi_bitmap) { ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap, (unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr); - WARN_ONCE(ret < 0, "KVM: failed to send PV IPI: %ld", ret); + WARN_ONCE(ret < 0, "kvm-guest: failed to send PV IPI: %ld", + ret); } local_irq_restore(flags); @@ -533,7 +556,7 @@ static void kvm_setup_pv_ipi(void) { apic->send_IPI_mask = kvm_send_ipi_mask; apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself; - pr_info("KVM setup pv IPIs\n"); + pr_info("setup PV IPIs\n"); } static void kvm_smp_send_call_func_ipi(const struct cpumask *mask) @@ -551,13 +574,6 @@ static void kvm_smp_send_call_func_ipi(const struct cpumask *mask) } } -static void __init kvm_smp_prepare_cpus(unsigned int max_cpus) -{ - native_smp_prepare_cpus(max_cpus); - if (kvm_para_has_hint(KVM_HINTS_REALTIME)) - static_branch_disable(&virt_spin_lock_key); -} - static void __init kvm_smp_prepare_boot_cpu(void) { /* @@ -646,19 +662,20 @@ static void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) apic_set_eoi_write(kvm_guest_apic_eoi_write); - if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) + if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_INT) && kvmapf) { static_branch_enable(&kvm_async_pf_enabled); + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_kvm_asyncpf_interrupt); + } #ifdef CONFIG_SMP - smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus; smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; if (pv_sched_yield_supported()) { smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi; - pr_info("KVM setup pv sched yield\n"); + pr_info("setup PV sched yield\n"); } if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online", kvm_cpu_online, kvm_cpu_down_prepare) < 0) - pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n"); + pr_err("failed to install cpu hotplug callbacks\n"); #else sev_map_percpu_data(); kvm_guest_cpu_init(); @@ -854,16 +871,36 @@ asm( */ void __init kvm_spinlock_init(void) { - /* Does host kernel support KVM_FEATURE_PV_UNHALT? */ - if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) + /* + * In case host doesn't support KVM_FEATURE_PV_UNHALT there is still an + * advantage of keeping virt_spin_lock_key enabled: virt_spin_lock() is + * preferred over native qspinlock when vCPU is preempted. + */ + if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) { + pr_info("PV spinlocks disabled, no host support\n"); return; + } - if (kvm_para_has_hint(KVM_HINTS_REALTIME)) - return; + /* + * Disable PV spinlocks and use native qspinlock when dedicated pCPUs + * are available. + */ + if (kvm_para_has_hint(KVM_HINTS_REALTIME)) { + pr_info("PV spinlocks disabled with KVM_HINTS_REALTIME hints\n"); + goto out; + } - /* Don't use the pvqspinlock code if there is only 1 vCPU. */ - if (num_possible_cpus() == 1) - return; + if (num_possible_cpus() == 1) { + pr_info("PV spinlocks disabled, single CPU\n"); + goto out; + } + + if (nopvspin) { + pr_info("PV spinlocks disabled, forced by \"nopvspin\" parameter\n"); + goto out; + } + + pr_info("PV spinlocks enabled\n"); __pv_init_lock_hash(); pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; @@ -876,6 +913,13 @@ void __init kvm_spinlock_init(void) pv_ops.lock.vcpu_is_preempted = PV_CALLEE_SAVE(__kvm_vcpu_is_preempted); } + /* + * When PV spinlock is enabled which is preferred over + * virt_spin_lock(), virt_spin_lock_key's value is meaningless. + * Just disable it anyway. + */ +out: + static_branch_disable(&virt_spin_lock_key); } #endif /* CONFIG_PARAVIRT_SPINLOCKS */ @@ -895,8 +939,8 @@ static void kvm_enable_host_haltpoll(void *i) void arch_haltpoll_enable(unsigned int cpu) { if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) { - pr_err_once("kvm: host does not support poll control\n"); - pr_err_once("kvm: host upgrade recommended\n"); + pr_err_once("host does not support poll control\n"); + pr_err_once("host upgrade recommended\n"); return; } diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 8a294f9747aa..fa873e3e6e90 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -54,48 +54,9 @@ static u32 xstate_required_size(u64 xstate_bv, bool compacted) #define F feature_bit -int kvm_update_cpuid(struct kvm_vcpu *vcpu) +static int kvm_check_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; - struct kvm_lapic *apic = vcpu->arch.apic; - - best = kvm_find_cpuid_entry(vcpu, 1, 0); - if (!best) - return 0; - - /* Update OSXSAVE bit */ - if (boot_cpu_has(X86_FEATURE_XSAVE) && best->function == 0x1) - cpuid_entry_change(best, X86_FEATURE_OSXSAVE, - kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)); - - cpuid_entry_change(best, X86_FEATURE_APIC, - vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE); - - if (apic) { - if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER)) - apic->lapic_timer.timer_mode_mask = 3 << 17; - else - apic->lapic_timer.timer_mode_mask = 1 << 17; - } - - best = kvm_find_cpuid_entry(vcpu, 7, 0); - if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) - cpuid_entry_change(best, X86_FEATURE_OSPKE, - kvm_read_cr4_bits(vcpu, X86_CR4_PKE)); - - best = kvm_find_cpuid_entry(vcpu, 0xD, 0); - if (!best) { - vcpu->arch.guest_supported_xcr0 = 0; - } else { - vcpu->arch.guest_supported_xcr0 = - (best->eax | ((u64)best->edx << 32)) & supported_xcr0; - best->ebx = xstate_required_size(vcpu->arch.xcr0, false); - } - - best = kvm_find_cpuid_entry(vcpu, 0xD, 1); - if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) || - cpuid_entry_has(best, X86_FEATURE_XSAVEC))) - best->ebx = xstate_required_size(vcpu->arch.xcr0, true); /* * The existing code assumes virtual address is 48-bit or 57-bit in the @@ -109,6 +70,38 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) return -EINVAL; } + return 0; +} + +void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 1, 0); + if (best) { + /* Update OSXSAVE bit */ + if (boot_cpu_has(X86_FEATURE_XSAVE)) + cpuid_entry_change(best, X86_FEATURE_OSXSAVE, + kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)); + + cpuid_entry_change(best, X86_FEATURE_APIC, + vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE); + } + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7) + cpuid_entry_change(best, X86_FEATURE_OSPKE, + kvm_read_cr4_bits(vcpu, X86_CR4_PKE)); + + best = kvm_find_cpuid_entry(vcpu, 0xD, 0); + if (best) + best->ebx = xstate_required_size(vcpu->arch.xcr0, false); + + best = kvm_find_cpuid_entry(vcpu, 0xD, 1); + if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) || + cpuid_entry_has(best, X86_FEATURE_XSAVEC))) + best->ebx = xstate_required_size(vcpu->arch.xcr0, true); + best = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0); if (kvm_hlt_in_guest(vcpu->kvm) && best && (best->eax & (1 << KVM_FEATURE_PV_UNHALT))) @@ -121,14 +114,39 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) vcpu->arch.ia32_misc_enable_msr & MSR_IA32_MISC_ENABLE_MWAIT); } +} + +static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + struct kvm_cpuid_entry2 *best; + + kvm_x86_ops.vcpu_after_set_cpuid(vcpu); + + best = kvm_find_cpuid_entry(vcpu, 1, 0); + if (best && apic) { + if (cpuid_entry_has(best, X86_FEATURE_TSC_DEADLINE_TIMER)) + apic->lapic_timer.timer_mode_mask = 3 << 17; + else + apic->lapic_timer.timer_mode_mask = 1 << 17; + + kvm_apic_set_version(vcpu); + } + + best = kvm_find_cpuid_entry(vcpu, 0xD, 0); + if (!best) + vcpu->arch.guest_supported_xcr0 = 0; + else + vcpu->arch.guest_supported_xcr0 = + (best->eax | ((u64)best->edx << 32)) & supported_xcr0; - /* Note, maxphyaddr must be updated before tdp_level. */ vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); - vcpu->arch.tdp_level = kvm_x86_ops.get_tdp_level(vcpu); kvm_mmu_reset_context(vcpu); kvm_pmu_refresh(vcpu); - return 0; + vcpu->arch.cr4_guest_rsvd_bits = + __cr4_reserved_bits(guest_cpuid_has, vcpu); + kvm_x86_ops.update_exception_bitmap(vcpu); } static int is_efer_nx(void) @@ -203,10 +221,16 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, vcpu->arch.cpuid_entries[i].padding[2] = 0; } vcpu->arch.cpuid_nent = cpuid->nent; + r = kvm_check_cpuid(vcpu); + if (r) { + vcpu->arch.cpuid_nent = 0; + kvfree(cpuid_entries); + goto out; + } + cpuid_fix_nx_cap(vcpu); - kvm_apic_set_version(vcpu); - kvm_x86_ops.cpuid_update(vcpu); - r = kvm_update_cpuid(vcpu); + kvm_update_cpuid_runtime(vcpu); + kvm_vcpu_after_set_cpuid(vcpu); kvfree(cpuid_entries); out: @@ -227,9 +251,14 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, cpuid->nent * sizeof(struct kvm_cpuid_entry2))) goto out; vcpu->arch.cpuid_nent = cpuid->nent; - kvm_apic_set_version(vcpu); - kvm_x86_ops.cpuid_update(vcpu); - r = kvm_update_cpuid(vcpu); + r = kvm_check_cpuid(vcpu); + if (r) { + vcpu->arch.cpuid_nent = 0; + goto out; + } + + kvm_update_cpuid_runtime(vcpu); + kvm_vcpu_after_set_cpuid(vcpu); out: return r; } @@ -604,7 +633,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) eax.split.bit_width = cap.bit_width_gp; eax.split.mask_length = cap.events_mask_len; - edx.split.num_counters_fixed = cap.num_counters_fixed; + edx.split.num_counters_fixed = min(cap.num_counters_fixed, MAX_FIXED_COUNTERS); edx.split.bit_width_fixed = cap.bit_width_fixed; edx.split.reserved = 0; diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 05434cd9342f..3a923ae15f2f 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -9,7 +9,7 @@ extern u32 kvm_cpu_caps[NCAPINTS] __read_mostly; void kvm_set_cpu_caps(void); -int kvm_update_cpuid(struct kvm_vcpu *vcpu); +void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu); struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function, u32 index); int kvm_dev_ioctl_get_cpuid(struct kvm_cpuid2 *cpuid, diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 4ce2ddd26c0b..5ccbee7165a2 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -354,7 +354,6 @@ static inline int apic_lvt_nmi_mode(u32 lvt_val) void kvm_apic_set_version(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - struct kvm_cpuid_entry2 *feat; u32 v = APIC_VERSION; if (!lapic_in_kernel(vcpu)) @@ -367,8 +366,7 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) * version first and level-triggered interrupts never get EOIed in * IOAPIC. */ - feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0); - if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31))) && + if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) && !ioapic_in_kernel(vcpu->kvm)) v |= APIC_LVR_DIRECTED_EOI; kvm_lapic_set_reg(apic, APIC_LVR, v); @@ -2068,7 +2066,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_TDCR: { uint32_t old_divisor = apic->divide_count; - kvm_lapic_set_reg(apic, APIC_TDCR, val); + kvm_lapic_set_reg(apic, APIC_TDCR, val & 0xb); update_divide_count(apic); if (apic->divide_count != old_divisor && apic->lapic_timer.period) { @@ -2085,7 +2083,8 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_SELF_IPI: if (apic_x2apic_mode(apic)) { - kvm_lapic_reg_write(apic, APIC_ICR, 0x40000 | (val & 0xff)); + kvm_lapic_reg_write(apic, APIC_ICR, + APIC_DEST_SELF | (val & APIC_VECTOR_MASK)); } else ret = 1; break; @@ -2232,7 +2231,7 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) vcpu->arch.apic_base = value; if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) - kvm_update_cpuid(vcpu); + kvm_update_cpuid_runtime(vcpu); if (!apic) return; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 444bb9c54548..5efc6081ca13 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -4,6 +4,7 @@ #include #include "kvm_cache_regs.h" +#include "cpuid.h" #define PT64_PT_BITS 9 #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) @@ -57,22 +58,14 @@ void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots); -void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer); +void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer, + gpa_t nested_cr3); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, bool accessed_dirty, gpa_t new_eptp); bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, u64 fault_address, char *insn, int insn_len); -static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm) -{ - if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages) - return kvm->arch.n_max_mmu_pages - - kvm->arch.n_used_mmu_pages; - - return 0; -} - static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu) { if (likely(vcpu->arch.mmu->root_hpa != INVALID_PAGE)) @@ -97,9 +90,13 @@ static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu) static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu) { - if (VALID_PAGE(vcpu->arch.mmu->root_hpa)) - kvm_x86_ops.load_mmu_pgd(vcpu, vcpu->arch.mmu->root_hpa | - kvm_get_active_pcid(vcpu)); + u64 root_hpa = vcpu->arch.mmu->root_hpa; + + if (!VALID_PAGE(root_hpa)) + return; + + kvm_x86_ops.load_mmu_pgd(vcpu, root_hpa | kvm_get_active_pcid(vcpu), + vcpu->arch.mmu->shadow_root_level); } int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, @@ -158,6 +155,11 @@ static inline bool is_write_protection(struct kvm_vcpu *vcpu) return kvm_read_cr0_bits(vcpu, X86_CR0_WP); } +static inline bool kvm_mmu_is_illegal_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + return (gpa >= BIT_ULL(cpuid_maxphyaddr(vcpu))); +} + /* * Check if a given access (described through the I/D, W/R and U/S bits of a * page fault error code pfec) causes a permission fault with the given PTE @@ -218,11 +220,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end); -void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); -void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); -bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, - struct kvm_memory_slot *slot, u64 gfn); -int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu, gpa_t l2_gpa); +int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu); int kvm_mmu_post_init_vm(struct kvm *kvm); void kvm_mmu_pre_destroy_vm(struct kvm *kvm); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e2e2e5cc7dc6..4e03841f053d 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -18,6 +18,7 @@ #include "irq.h" #include "ioapic.h" #include "mmu.h" +#include "mmu_internal.h" #include "x86.h" #include "kvm_cache_regs.h" #include "kvm_emulate.h" @@ -91,7 +92,8 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644); */ bool tdp_enabled = false; -static int max_page_level __read_mostly; +static int max_huge_page_level __read_mostly; +static int max_tdp_level __read_mostly; enum { AUDIT_PRE_PAGE_FAULT, @@ -515,6 +517,18 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) return likely(kvm_gen == spte_gen); } +static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, + struct x86_exception *exception) +{ + /* Check if guest physical address doesn't exceed guest maximum */ + if (kvm_mmu_is_illegal_gpa(vcpu, gpa)) { + exception->error_code |= PFERR_RSVD_MASK; + return UNMAPPED_GVA; + } + + return gpa; +} + /* * Sets the shadow PTE masks used by the MMU. * @@ -676,7 +690,7 @@ union split_spte { static void count_spte_clear(u64 *sptep, u64 spte) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); + struct kvm_mmu_page *sp = sptep_to_sp(sptep); if (is_shadow_present_pte(spte)) return; @@ -760,7 +774,7 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte) */ static u64 __get_spte_lockless(u64 *sptep) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); + struct kvm_mmu_page *sp = sptep_to_sp(sptep); union split_spte spte, *orig = (union split_spte *)sptep; int count; @@ -1060,94 +1074,40 @@ static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) local_irq_enable(); } -static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, - struct kmem_cache *base_cache, int min) -{ - void *obj; - - if (cache->nobjs >= min) - return 0; - while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - obj = kmem_cache_zalloc(base_cache, GFP_KERNEL_ACCOUNT); - if (!obj) - return cache->nobjs >= min ? 0 : -ENOMEM; - cache->objects[cache->nobjs++] = obj; - } - return 0; -} - -static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache) -{ - return cache->nobjs; -} - -static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc, - struct kmem_cache *cache) -{ - while (mc->nobjs) - kmem_cache_free(cache, mc->objects[--mc->nobjs]); -} - -static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, - int min) -{ - void *page; - - if (cache->nobjs >= min) - return 0; - while (cache->nobjs < ARRAY_SIZE(cache->objects)) { - page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT); - if (!page) - return cache->nobjs >= min ? 0 : -ENOMEM; - cache->objects[cache->nobjs++] = page; - } - return 0; -} - -static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc) -{ - while (mc->nobjs) - free_page((unsigned long)mc->objects[--mc->nobjs]); -} - -static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu) +static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect) { int r; - r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, - pte_list_desc_cache, 8 + PTE_PREFETCH_NUM); + /* 1 rmap, 1 parent PTE per level, and the prefetched rmaps. */ + r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, + 1 + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM); if (r) - goto out; - r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); + return r; + r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_shadow_page_cache, + PT64_ROOT_MAX_LEVEL); if (r) - goto out; - r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, - mmu_page_header_cache, 4); -out: - return r; + return r; + if (maybe_indirect) { + r = kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_gfn_array_cache, + PT64_ROOT_MAX_LEVEL); + if (r) + return r; + } + return kvm_mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache, + PT64_ROOT_MAX_LEVEL); } static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) { - mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache, - pte_list_desc_cache); - mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); - mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache, - mmu_page_header_cache); -} - -static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) -{ - void *p; - - BUG_ON(!mc->nobjs); - p = mc->objects[--mc->nobjs]; - return p; + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache); + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_shadow_page_cache); + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_gfn_array_cache); + kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); } static struct pte_list_desc *mmu_alloc_pte_list_desc(struct kvm_vcpu *vcpu) { - return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache); + return kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache); } static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc) @@ -1415,10 +1375,10 @@ static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, static bool rmap_can_add(struct kvm_vcpu *vcpu) { - struct kvm_mmu_memory_cache *cache; + struct kvm_mmu_memory_cache *mc; - cache = &vcpu->arch.mmu_pte_list_desc_cache; - return mmu_memory_cache_free_objects(cache); + mc = &vcpu->arch.mmu_pte_list_desc_cache; + return kvm_mmu_memory_cache_nr_free_objects(mc); } static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) @@ -1426,7 +1386,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) struct kvm_mmu_page *sp; struct kvm_rmap_head *rmap_head; - sp = page_header(__pa(spte)); + sp = sptep_to_sp(spte); kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); return pte_list_add(vcpu, spte, rmap_head); @@ -1438,7 +1398,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) gfn_t gfn; struct kvm_rmap_head *rmap_head; - sp = page_header(__pa(spte)); + sp = sptep_to_sp(spte); gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); rmap_head = gfn_to_rmap(kvm, gfn, sp); __pte_list_remove(spte, rmap_head); @@ -1530,7 +1490,7 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) { if (is_large_pte(*sptep)) { - WARN_ON(page_header(__pa(sptep))->role.level == PG_LEVEL_4K); + WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K); drop_spte(kvm, sptep); --kvm->stat.lpages; return true; @@ -1542,7 +1502,7 @@ static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) { if (__drop_large_spte(vcpu->kvm, sptep)) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); + struct kvm_mmu_page *sp = sptep_to_sp(sptep); kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level)); @@ -1738,21 +1698,6 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } -/** - * kvm_arch_write_log_dirty - emulate dirty page logging - * @vcpu: Guest mode vcpu - * - * Emulate arch specific page modification logging for the - * nested hypervisor - */ -int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu, gpa_t l2_gpa) -{ - if (kvm_x86_ops.write_log_dirty) - return kvm_x86_ops.write_log_dirty(vcpu, l2_gpa); - - return 0; -} - bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn) { @@ -2016,7 +1961,7 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) struct kvm_rmap_head *rmap_head; struct kvm_mmu_page *sp; - sp = page_header(__pa(spte)); + sp = sptep_to_sp(spte); rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); @@ -2105,10 +2050,10 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, int direct { struct kvm_mmu_page *sp; - sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); - sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); + sp = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache); + sp->spt = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_shadow_page_cache); if (!direct) - sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache); + sp->gfns = kvm_mmu_memory_cache_alloc(&vcpu->arch.mmu_gfn_array_cache); set_page_private(virt_to_page(sp->spt), (unsigned long)sp); /* @@ -2138,7 +2083,7 @@ static void mark_unsync(u64 *spte) struct kvm_mmu_page *sp; unsigned int index; - sp = page_header(__pa(spte)); + sp = sptep_to_sp(spte); index = spte - sp->spt; if (__test_and_set_bit(index, sp->unsync_child_bitmap)) return; @@ -2207,7 +2152,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, continue; } - child = page_header(ent & PT64_BASE_ADDR_MASK); + child = to_shadow_page(ent & PT64_BASE_ADDR_MASK); if (child->unsync_children) { if (mmu_pages_add(pvec, child, i)) @@ -2258,15 +2203,14 @@ static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, static void kvm_mmu_commit_zap_page(struct kvm *kvm, struct list_head *invalid_list); - -#define for_each_valid_sp(_kvm, _sp, _gfn) \ - hlist_for_each_entry(_sp, \ - &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \ +#define for_each_valid_sp(_kvm, _sp, _list) \ + hlist_for_each_entry(_sp, _list, hash_link) \ if (is_obsolete_sp((_kvm), (_sp))) { \ } else #define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \ - for_each_valid_sp(_kvm, _sp, _gfn) \ + for_each_valid_sp(_kvm, _sp, \ + &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \ if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else static inline bool is_ept_sp(struct kvm_mmu_page *sp) @@ -2464,9 +2408,7 @@ static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp) static void clear_sp_write_flooding_count(u64 *spte) { - struct kvm_mmu_page *sp = page_header(__pa(spte)); - - __clear_sp_write_flooding_count(sp); + __clear_sp_write_flooding_count(sptep_to_sp(spte)); } static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, @@ -2476,7 +2418,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, int direct, unsigned int access) { + bool direct_mmu = vcpu->arch.mmu->direct_map; union kvm_mmu_page_role role; + struct hlist_head *sp_list; unsigned quadrant; struct kvm_mmu_page *sp; bool need_sync = false; @@ -2490,13 +2434,14 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (role.direct) role.gpte_is_8_bytes = true; role.access = access; - if (!vcpu->arch.mmu->direct_map - && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) { + if (!direct_mmu && vcpu->arch.mmu->root_level <= PT32_ROOT_LEVEL) { quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; role.quadrant = quadrant; } - for_each_valid_sp(vcpu->kvm, sp, gfn) { + + sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]; + for_each_valid_sp(vcpu->kvm, sp, sp_list) { if (sp->gfn != gfn) { collisions++; continue; @@ -2508,6 +2453,9 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (sp->role.word != role.word) continue; + if (direct_mmu) + goto trace_get_page; + if (sp->unsync) { /* The page is good, but __kvm_sync_page might still end * up zapping it. If so, break in order to rebuild it. @@ -2523,6 +2471,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); __clear_sp_write_flooding_count(sp); + +trace_get_page: trace_kvm_mmu_get_page(sp, false); goto out; } @@ -2533,8 +2483,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, sp->gfn = gfn; sp->role = role; - hlist_add_head(&sp->hash_link, - &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]); + hlist_add_head(&sp->hash_link, sp_list); if (!direct) { /* * we should do write protection before syncing pages @@ -2548,7 +2497,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (level > PG_LEVEL_4K && need_sync) flush |= kvm_sync_pages(vcpu, gfn, &invalid_list); } - clear_page(sp->spt); trace_kvm_mmu_get_page(sp, true); kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush); @@ -2657,7 +2605,7 @@ static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, * so we should update the spte at this point to get * a new sp with the correct access. */ - child = page_header(*sptep & PT64_BASE_ADDR_MASK); + child = to_shadow_page(*sptep & PT64_BASE_ADDR_MASK); if (child->role.access == direct_access) return; @@ -2679,7 +2627,7 @@ static bool mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, if (is_large_pte(pte)) --kvm->stat.lpages; } else { - child = page_header(pte & PT64_BASE_ADDR_MASK); + child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); drop_parent_pte(child, spte); } return true; @@ -2757,10 +2705,23 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm, if (!sp->root_count) { /* Count self */ (*nr_zapped)++; - list_move(&sp->link, invalid_list); + + /* + * Already invalid pages (previously active roots) are not on + * the active page list. See list_del() in the "else" case of + * !sp->root_count. + */ + if (sp->role.invalid) + list_add(&sp->link, invalid_list); + else + list_move(&sp->link, invalid_list); kvm_mod_used_mmu_pages(kvm, -1); } else { - list_move(&sp->link, &kvm->arch.active_mmu_pages); + /* + * Remove the active root from the active page list, the root + * will be explicitly freed when the root_count hits zero. + */ + list_del(&sp->link); /* * Obsolete pages cannot be used on any vCPUs, see the comment @@ -2812,33 +2773,60 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm, } } -static bool prepare_zap_oldest_mmu_page(struct kvm *kvm, - struct list_head *invalid_list) +static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm, + unsigned long nr_to_zap) { - struct kvm_mmu_page *sp; + unsigned long total_zapped = 0; + struct kvm_mmu_page *sp, *tmp; + LIST_HEAD(invalid_list); + bool unstable; + int nr_zapped; if (list_empty(&kvm->arch.active_mmu_pages)) - return false; + return 0; - sp = list_last_entry(&kvm->arch.active_mmu_pages, - struct kvm_mmu_page, link); - return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); +restart: + list_for_each_entry_safe(sp, tmp, &kvm->arch.active_mmu_pages, link) { + /* + * Don't zap active root pages, the page itself can't be freed + * and zapping it will just force vCPUs to realloc and reload. + */ + if (sp->root_count) + continue; + + unstable = __kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, + &nr_zapped); + total_zapped += nr_zapped; + if (total_zapped >= nr_to_zap) + break; + + if (unstable) + goto restart; + } + + kvm_mmu_commit_zap_page(kvm, &invalid_list); + + kvm->stat.mmu_recycled += total_zapped; + return total_zapped; +} + +static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm) +{ + if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages) + return kvm->arch.n_max_mmu_pages - + kvm->arch.n_used_mmu_pages; + + return 0; } static int make_mmu_pages_available(struct kvm_vcpu *vcpu) { - LIST_HEAD(invalid_list); + unsigned long avail = kvm_mmu_available_pages(vcpu->kvm); - if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES)) + if (likely(avail >= KVM_MIN_FREE_MMU_PAGES)) return 0; - while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) { - if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list)) - break; - - ++vcpu->kvm->stat.mmu_recycled; - } - kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); + kvm_mmu_zap_oldest_mmu_pages(vcpu->kvm, KVM_REFILL_PAGES - avail); if (!kvm_mmu_available_pages(vcpu->kvm)) return -ENOSPC; @@ -2851,17 +2839,12 @@ static int make_mmu_pages_available(struct kvm_vcpu *vcpu) */ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages) { - LIST_HEAD(invalid_list); - spin_lock(&kvm->mmu_lock); if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { - /* Need to free some mmu pages to achieve the goal. */ - while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) - if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list)) - break; + kvm_mmu_zap_oldest_mmu_pages(kvm, kvm->arch.n_used_mmu_pages - + goal_nr_mmu_pages); - kvm_mmu_commit_zap_page(kvm, &invalid_list); goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; } @@ -2999,7 +2982,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access)) return 0; - sp = page_header(__pa(sptep)); + sp = sptep_to_sp(sptep); if (sp_ad_disabled(sp)) spte |= SPTE_AD_DISABLED_MASK; else if (kvm_vcpu_ad_need_write_protect(vcpu)) @@ -3102,7 +3085,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, struct kvm_mmu_page *child; u64 pte = *sptep; - child = page_header(pte & PT64_BASE_ADDR_MASK); + child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); drop_parent_pte(child, sptep); flush = true; } else if (pfn != spte_to_pfn(*sptep)) { @@ -3212,7 +3195,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) { struct kvm_mmu_page *sp; - sp = page_header(__pa(sptep)); + sp = sptep_to_sp(sptep); /* * Without accessed bits, there's no way to distinguish between @@ -3274,7 +3257,7 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, if (!slot) return PG_LEVEL_4K; - max_level = min(max_level, max_page_level); + max_level = min(max_level, max_huge_page_level); for ( ; max_level > PG_LEVEL_4K; max_level--) { linfo = lpage_info_slot(gfn, slot, max_level); if (!linfo->disallow_lpage) @@ -3520,7 +3503,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, if (!is_shadow_present_pte(spte)) break; - sp = page_header(__pa(iterator.sptep)); + sp = sptep_to_sp(iterator.sptep); if (!is_last_spte(spte, sp->role.level)) break; @@ -3607,7 +3590,7 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa, if (!VALID_PAGE(*root_hpa)) return; - sp = page_header(*root_hpa & PT64_BASE_ADDR_MASK); + sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK); --sp->root_count; if (!sp->root_count && sp->role.invalid) kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); @@ -3668,7 +3651,7 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn) { int ret = 0; - if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) { + if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) { kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); ret = 1; } @@ -3837,7 +3820,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) { hpa_t root = vcpu->arch.mmu->root_hpa; - sp = page_header(root); + sp = to_shadow_page(root); /* * Even if another CPU was marking the SP as unsync-ed @@ -3871,7 +3854,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) if (root && VALID_PAGE(root)) { root &= PT64_BASE_ADDR_MASK; - sp = page_header(root); + sp = to_shadow_page(root); mmu_sync_children(vcpu, sp); } } @@ -4045,8 +4028,8 @@ static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr) walk_shadow_page_lockless_end(vcpu); } -static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - gfn_t gfn) +static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + gfn_t gfn) { struct kvm_arch_async_pf arch; @@ -4108,16 +4091,16 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (page_fault_handle_page_track(vcpu, error_code, gfn)) return RET_PF_EMULATE; - r = mmu_topup_memory_caches(vcpu); + if (fast_page_fault(vcpu, gpa, error_code)) + return RET_PF_RETRY; + + r = mmu_topup_memory_caches(vcpu, false); if (r) return r; if (lpage_disallowed) max_level = PG_LEVEL_4K; - if (fast_page_fault(vcpu, gpa, error_code)) - return RET_PF_RETRY; - mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); @@ -4131,7 +4114,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) goto out_unlock; - if (make_mmu_pages_available(vcpu) < 0) + r = make_mmu_pages_available(vcpu); + if (r) goto out_unlock; r = __direct_map(vcpu, gpa, write, map_writable, max_level, pfn, prefault, is_tdp && lpage_disallowed); @@ -4156,6 +4140,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, u64 fault_address, char *insn, int insn_len) { int r = 1; + u32 flags = vcpu->arch.apf.host_apf_flags; #ifndef CONFIG_X86_64 /* A 64-bit CR2 should be impossible on 32-bit KVM. */ @@ -4164,28 +4149,22 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, #endif vcpu->arch.l1tf_flush_l1d = true; - switch (vcpu->arch.apf.host_apf_flags) { - default: + if (!flags) { trace_kvm_page_fault(fault_address, error_code); if (kvm_event_needs_reinjection(vcpu)) kvm_mmu_unprotect_page_virt(vcpu, fault_address); r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn, insn_len); - break; - case KVM_PV_REASON_PAGE_NOT_PRESENT: + } else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) { vcpu->arch.apf.host_apf_flags = 0; local_irq_disable(); kvm_async_pf_task_wait_schedule(fault_address); local_irq_enable(); - break; - case KVM_PV_REASON_PAGE_READY: - vcpu->arch.apf.host_apf_flags = 0; - local_irq_disable(); - kvm_async_pf_task_wake(fault_address); - local_irq_enable(); - break; + } else { + WARN_ONCE(1, "Unexpected host async PF flags: %x\n", flags); } + return r; } EXPORT_SYMBOL_GPL(kvm_handle_page_fault); @@ -4227,8 +4206,8 @@ static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd, union kvm_mmu_page_role role) { return (role.direct || pgd == root->pgd) && - VALID_PAGE(root->hpa) && page_header(root->hpa) && - role.word == page_header(root->hpa)->role.word; + VALID_PAGE(root->hpa) && to_shadow_page(root->hpa) && + role.word == to_shadow_page(root->hpa)->role.word; } /* @@ -4277,8 +4256,7 @@ static bool fast_pgd_switch(struct kvm_vcpu *vcpu, gpa_t new_pgd, */ if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && mmu->root_level >= PT64_ROOT_4LEVEL) - return !mmu_check_root(vcpu, new_pgd >> PAGE_SHIFT) && - cached_root_available(vcpu, new_pgd, new_role); + return cached_root_available(vcpu, new_pgd, new_role); return false; } @@ -4313,7 +4291,7 @@ static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, */ vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY); - __clear_sp_write_flooding_count(page_header(vcpu->arch.mmu->root_hpa)); + __clear_sp_write_flooding_count(to_shadow_page(vcpu->arch.mmu->root_hpa)); } void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush, @@ -4869,13 +4847,22 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu, return role; } +static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu) +{ + /* Use 5-level TDP if and only if it's useful/necessary. */ + if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48) + return 4; + + return max_tdp_level; +} + static union kvm_mmu_role kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) { union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only); role.base.ad_disabled = (shadow_accessed_mask == 0); - role.base.level = vcpu->arch.tdp_level; + role.base.level = kvm_mmu_get_tdp_level(vcpu); role.base.direct = true; role.base.gpte_is_8_bytes = true; @@ -4884,7 +4871,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) { - struct kvm_mmu *context = vcpu->arch.mmu; + struct kvm_mmu *context = &vcpu->arch.root_mmu; union kvm_mmu_role new_role = kvm_calc_tdp_mmu_root_page_role(vcpu, false); @@ -4896,7 +4883,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->sync_page = nonpaging_sync_page; context->invlpg = NULL; context->update_pte = nonpaging_update_pte; - context->shadow_root_level = vcpu->arch.tdp_level; + context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu); context->direct_map = true; context->get_guest_pgd = get_cr3; context->get_pdptr = kvm_pdptr_read; @@ -4931,7 +4918,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) } static union kvm_mmu_role -kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) +kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu, bool base_only) { union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only); @@ -4939,9 +4926,19 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) !is_write_protection(vcpu); role.base.smap_andnot_wp = role.ext.cr4_smap && !is_write_protection(vcpu); - role.base.direct = !is_paging(vcpu); role.base.gpte_is_8_bytes = !!is_pae(vcpu); + return role; +} + +static union kvm_mmu_role +kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) +{ + union kvm_mmu_role role = + kvm_calc_shadow_root_page_role_common(vcpu, base_only); + + role.base.direct = !is_paging(vcpu); + if (!is_long_mode(vcpu)) role.base.level = PT32E_ROOT_LEVEL; else if (is_la57_mode(vcpu)) @@ -4952,15 +4949,10 @@ kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) return role; } -void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer) +static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context, + u32 cr0, u32 cr4, u32 efer, + union kvm_mmu_role new_role) { - struct kvm_mmu *context = vcpu->arch.mmu; - union kvm_mmu_role new_role = - kvm_calc_shadow_mmu_root_page_role(vcpu, false); - - if (new_role.as_u64 == context->mmu_role.as_u64) - return; - if (!(cr0 & X86_CR0_PG)) nonpaging_init_context(vcpu, context); else if (efer & EFER_LMA) @@ -4973,7 +4965,43 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer) context->mmu_role.as_u64 = new_role.as_u64; reset_shadow_zero_bits_mask(vcpu, context); } -EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); + +static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer) +{ + struct kvm_mmu *context = &vcpu->arch.root_mmu; + union kvm_mmu_role new_role = + kvm_calc_shadow_mmu_root_page_role(vcpu, false); + + if (new_role.as_u64 != context->mmu_role.as_u64) + shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role); +} + +static union kvm_mmu_role +kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu) +{ + union kvm_mmu_role role = + kvm_calc_shadow_root_page_role_common(vcpu, false); + + role.base.direct = false; + role.base.level = kvm_mmu_get_tdp_level(vcpu); + + return role; +} + +void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, u32 cr0, u32 cr4, u32 efer, + gpa_t nested_cr3) +{ + struct kvm_mmu *context = &vcpu->arch.guest_mmu; + union kvm_mmu_role new_role = kvm_calc_shadow_npt_root_page_role(vcpu); + + context->shadow_root_level = new_role.base.level; + + __kvm_mmu_new_pgd(vcpu, nested_cr3, new_role.base, false, false); + + if (new_role.as_u64 != context->mmu_role.as_u64) + shadow_mmu_init_context(vcpu, context, cr0, cr4, efer, new_role); +} +EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu); static union kvm_mmu_role kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, @@ -5007,7 +5035,7 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty, void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, bool accessed_dirty, gpa_t new_eptp) { - struct kvm_mmu *context = vcpu->arch.mmu; + struct kvm_mmu *context = &vcpu->arch.guest_mmu; u8 level = vmx_eptp_page_walk_level(new_eptp); union kvm_mmu_role new_role = kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty, @@ -5041,7 +5069,7 @@ EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); static void init_kvm_softmmu(struct kvm_vcpu *vcpu) { - struct kvm_mmu *context = vcpu->arch.mmu; + struct kvm_mmu *context = &vcpu->arch.root_mmu; kvm_init_shadow_mmu(vcpu, kvm_read_cr0_bits(vcpu, X86_CR0_PG), @@ -5151,7 +5179,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) { int r; - r = mmu_topup_memory_caches(vcpu); + r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map); if (r) goto out; r = mmu_alloc_roots(vcpu); @@ -5345,7 +5373,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, * or not since pte prefetch is skiped if it does not have * enough objects in the cache. */ - mmu_topup_memory_caches(vcpu); + mmu_topup_memory_caches(vcpu, true); spin_lock(&vcpu->kvm->mmu_lock); @@ -5553,23 +5581,25 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) } EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva); -void kvm_configure_mmu(bool enable_tdp, int tdp_page_level) +void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level, + int tdp_huge_page_level) { tdp_enabled = enable_tdp; + max_tdp_level = tdp_max_root_level; /* - * max_page_level reflects the capabilities of KVM's MMU irrespective + * max_huge_page_level reflects KVM's MMU capabilities irrespective * of kernel support, e.g. KVM may be capable of using 1GB pages when * the kernel is not. But, KVM never creates a page size greater than * what is used by the kernel for any given HVA, i.e. the kernel's * capabilities are ultimately consulted by kvm_mmu_hugepage_adjust(). */ if (tdp_enabled) - max_page_level = tdp_page_level; + max_huge_page_level = tdp_huge_page_level; else if (boot_cpu_has(X86_FEATURE_GBPAGES)) - max_page_level = PG_LEVEL_1G; + max_huge_page_level = PG_LEVEL_1G; else - max_page_level = PG_LEVEL_2M; + max_huge_page_level = PG_LEVEL_2M; } EXPORT_SYMBOL_GPL(kvm_configure_mmu); @@ -5665,7 +5695,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) * SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can * skip allocating the PDP table. */ - if (tdp_enabled && vcpu->arch.tdp_level > PT32E_ROOT_LEVEL) + if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL) return 0; page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32); @@ -5684,6 +5714,14 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) uint i; int ret; + vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache; + vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO; + + vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache; + vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO; + + vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO; + vcpu->arch.mmu = &vcpu->arch.root_mmu; vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; @@ -5732,12 +5770,11 @@ restart: break; /* - * Skip invalid pages with a non-zero root count, zapping pages - * with a non-zero root count will never succeed, i.e. the page - * will get thrown back on active_mmu_pages and we'll get stuck - * in an infinite loop. + * Invalid pages should never land back on the list of active + * pages. Skip the bogus page, otherwise we'll get stuck in an + * infinite loop if the page gets put back on the list (again). */ - if (sp->role.invalid && sp->root_count) + if (WARN_ON(sp->role.invalid)) continue; /* @@ -5904,7 +5941,7 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, restart: for_each_rmap_spte(rmap_head, &iter, sptep) { - sp = page_header(__pa(sptep)); + sp = sptep_to_sp(sptep); pfn = spte_to_pfn(*sptep); /* @@ -6015,7 +6052,7 @@ void kvm_mmu_zap_all(struct kvm *kvm) spin_lock(&kvm->mmu_lock); restart: list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) { - if (sp->role.invalid && sp->root_count) + if (WARN_ON(sp->role.invalid)) continue; if (__kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list, &ign)) goto restart; @@ -6092,9 +6129,7 @@ mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) goto unlock; } - if (prepare_zap_oldest_mmu_page(kvm, &invalid_list)) - freed++; - kvm_mmu_commit_zap_page(kvm, &invalid_list); + freed = kvm_mmu_zap_oldest_mmu_pages(kvm, sc->nr_to_scan); unlock: spin_unlock(&kvm->mmu_lock); diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu/mmu_audit.c similarity index 96% rename from arch/x86/kvm/mmu_audit.c rename to arch/x86/kvm/mmu/mmu_audit.c index 9d2844f87f6d..c8d51a37e2ce 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu/mmu_audit.c @@ -45,7 +45,7 @@ static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, !is_last_spte(ent[i], level)) { struct kvm_mmu_page *child; - child = page_header(ent[i] & PT64_BASE_ADDR_MASK); + child = to_shadow_page(ent[i] & PT64_BASE_ADDR_MASK); __mmu_spte_walk(vcpu, child, fn, level - 1); } } @@ -62,7 +62,7 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) { hpa_t root = vcpu->arch.mmu->root_hpa; - sp = page_header(root); + sp = to_shadow_page(root); __mmu_spte_walk(vcpu, sp, fn, vcpu->arch.mmu->root_level); return; } @@ -72,7 +72,7 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn) if (root && VALID_PAGE(root)) { root &= PT64_BASE_ADDR_MASK; - sp = page_header(root); + sp = to_shadow_page(root); __mmu_spte_walk(vcpu, sp, fn, 2); } } @@ -97,7 +97,7 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) kvm_pfn_t pfn; hpa_t hpa; - sp = page_header(__pa(sptep)); + sp = sptep_to_sp(sptep); if (sp->unsync) { if (level != PG_LEVEL_4K) { @@ -132,7 +132,7 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) struct kvm_memory_slot *slot; gfn_t gfn; - rev_sp = page_header(__pa(sptep)); + rev_sp = sptep_to_sp(sptep); gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); slots = kvm_memslots_for_spte_role(kvm, rev_sp->role); @@ -165,7 +165,7 @@ static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level) static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); + struct kvm_mmu_page *sp = sptep_to_sp(sptep); if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync) audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync " diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h new file mode 100644 index 000000000000..3acf3b8eb469 --- /dev/null +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_MMU_INTERNAL_H +#define __KVM_X86_MMU_INTERNAL_H + +#include + +#include + +struct kvm_mmu_page { + struct list_head link; + struct hlist_node hash_link; + struct list_head lpage_disallowed_link; + + bool unsync; + u8 mmu_valid_gen; + bool mmio_cached; + bool lpage_disallowed; /* Can't be replaced by an equiv large page */ + + /* + * The following two entries are used to key the shadow page in the + * hash table. + */ + union kvm_mmu_page_role role; + gfn_t gfn; + + u64 *spt; + /* hold the gfn of each spte inside spt */ + gfn_t *gfns; + int root_count; /* Currently serving as active root */ + unsigned int unsync_children; + struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ + DECLARE_BITMAP(unsync_child_bitmap, 512); + +#ifdef CONFIG_X86_32 + /* + * Used out of the mmu-lock to avoid reading spte values while an + * update is in progress; see the comments in __get_spte_lockless(). + */ + int clear_spte_count; +#endif + + /* Number of writes since the last time traversal visited this page. */ + atomic_t write_flooding_count; +}; + +static inline struct kvm_mmu_page *to_shadow_page(hpa_t shadow_page) +{ + struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT); + + return (struct kvm_mmu_page *)page_private(page); +} + +static inline struct kvm_mmu_page *sptep_to_sp(u64 *sptep) +{ + return to_shadow_page(__pa(sptep)); +} + +void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); +void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); +bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, + struct kvm_memory_slot *slot, u64 gfn); + +#endif /* __KVM_X86_MMU_INTERNAL_H */ diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h similarity index 99% rename from arch/x86/kvm/mmutrace.h rename to arch/x86/kvm/mmu/mmutrace.h index ffcd96fc02d0..9d15bc0c535b 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -387,7 +387,7 @@ TRACE_EVENT( #endif /* _TRACE_KVMMMU_H */ #undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_PATH mmu #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE mmutrace diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index a7bcde34d1f2..a84a141a2ad2 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -16,7 +16,7 @@ #include -#include "mmu.h" +#include "mmu_internal.h" void kvm_page_track_free_memslot(struct kvm_memory_slot *slot) { diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 275564a0ebdb..4dd6b1e5b8cf 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -260,7 +260,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, !(pte & PT_GUEST_DIRTY_MASK)) { trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); #if PTTYPE == PTTYPE_EPT - if (kvm_arch_write_log_dirty(vcpu, addr)) + if (kvm_x86_ops.nested_ops->write_log_dirty(vcpu, addr)) return -EINVAL; #endif pte |= PT_GUEST_DIRTY_MASK; @@ -596,7 +596,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, u64 *spte; int i; - sp = page_header(__pa(sptep)); + sp = sptep_to_sp(sptep); if (sp->role.level > PG_LEVEL_4K) return; @@ -789,10 +789,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); - r = mmu_topup_memory_caches(vcpu); - if (r) - return r; - /* * If PFEC.RSVD is set, this is a shadow page fault. * The bit needs to be cleared before walking guest page tables. @@ -820,6 +816,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, return RET_PF_EMULATE; } + r = mmu_topup_memory_caches(vcpu, true); + if (r) + return r; + vcpu->arch.write_fault_to_shadow_pgtable = false; is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu, @@ -866,7 +866,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, goto out_unlock; kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); - if (make_mmu_pages_available(vcpu) < 0) + r = make_mmu_pages_available(vcpu); + if (r) goto out_unlock; r = FNAME(fetch)(vcpu, addr, &walker, write_fault, max_level, pfn, map_writable, prefault, lpage_disallowed); @@ -903,7 +904,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) * No need to check return value here, rmap_can_add() can * help us to skip pte prefetch later. */ - mmu_topup_memory_caches(vcpu); + mmu_topup_memory_caches(vcpu, true); if (!VALID_PAGE(root_hpa)) { WARN_ON(1); @@ -915,7 +916,7 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa) level = iterator.level; sptep = iterator.sptep; - sp = page_header(__pa(sptep)); + sp = sptep_to_sp(sptep); if (is_last_spte(*sptep, level)) { pt_element_t gpte; gpa_t pte_gpa; diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index b86346903f2e..67741d2a0308 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -372,6 +372,11 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) if (!pmc) return 1; + if (!(kvm_read_cr4(vcpu) & X86_CR4_PCE) && + (kvm_x86_ops.get_cpl(vcpu) != 0) && + (kvm_read_cr0(vcpu) & X86_CR0_PE)) + return 1; + *data = pmc_read_counter(pmc) & mask; return 0; } diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index ab85eed8a6cc..067fef51760c 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -15,6 +15,8 @@ #define VMWARE_BACKDOOR_PMC_REAL_TIME 0x10001 #define VMWARE_BACKDOOR_PMC_APPARENT_TIME 0x10002 +#define MAX_FIXED_COUNTERS 3 + struct kvm_event_hw_type_mapping { u8 eventsel; u8 unit_mask; diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index e80daa98682f..ac830cd50830 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -665,7 +665,7 @@ void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) } else { vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; } - mark_dirty(vmcb, VMCB_AVIC); + vmcb_mark_dirty(vmcb, VMCB_AVIC); svm_set_pi_irte_mode(vcpu, activated); } diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 6bceafb19108..fb68467e6049 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -48,13 +48,6 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, svm->vmcb->control.exit_info_1 &= ~0xffffffffULL; svm->vmcb->control.exit_info_1 |= fault->error_code; - /* - * The present bit is always zero for page structure faults on real - * hardware. - */ - if (svm->vmcb->control.exit_info_1 & (2ULL << 32)) - svm->vmcb->control.exit_info_1 &= ~1; - nested_svm_vmexit(svm); } @@ -87,11 +80,11 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) WARN_ON(mmu_is_nested(vcpu)); vcpu->arch.mmu = &vcpu->arch.guest_mmu; - kvm_init_shadow_mmu(vcpu, X86_CR0_PG, hsave->save.cr4, hsave->save.efer); + kvm_init_shadow_npt_mmu(vcpu, X86_CR0_PG, hsave->save.cr4, hsave->save.efer, + svm->nested.ctl.nested_cr3); vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3; vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr; vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit; - vcpu->arch.mmu->shadow_root_level = vcpu->arch.tdp_level; reset_shadow_zero_bits_mask(vcpu, vcpu->arch.mmu); vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; } @@ -106,7 +99,7 @@ void recalc_intercepts(struct vcpu_svm *svm) { struct vmcb_control_area *c, *h, *g; - mark_dirty(svm->vmcb, VMCB_INTERCEPTS); + vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); if (!is_guest_mode(&svm->vcpu)) return; @@ -222,8 +215,9 @@ static bool nested_vmcb_check_controls(struct vmcb_control_area *control) return true; } -static bool nested_vmcb_checks(struct vmcb *vmcb) +static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb) { + bool nested_vmcb_lma; if ((vmcb->save.efer & EFER_SVME) == 0) return false; @@ -231,6 +225,30 @@ static bool nested_vmcb_checks(struct vmcb *vmcb) (vmcb->save.cr0 & X86_CR0_NW)) return false; + if (!kvm_dr6_valid(vmcb->save.dr6) || !kvm_dr7_valid(vmcb->save.dr7)) + return false; + + nested_vmcb_lma = + (vmcb->save.efer & EFER_LME) && + (vmcb->save.cr0 & X86_CR0_PG); + + if (!nested_vmcb_lma) { + if (vmcb->save.cr4 & X86_CR4_PAE) { + if (vmcb->save.cr3 & MSR_CR3_LEGACY_PAE_RESERVED_MASK) + return false; + } else { + if (vmcb->save.cr3 & MSR_CR3_LEGACY_RESERVED_MASK) + return false; + } + } else { + if (!(vmcb->save.cr4 & X86_CR4_PAE) || + !(vmcb->save.cr0 & X86_CR0_PE) || + (vmcb->save.cr3 & MSR_CR3_LONG_RESERVED_MASK)) + return false; + } + if (kvm_valid_cr4(&svm->vcpu, vmcb->save.cr4)) + return false; + return nested_vmcb_check_controls(&vmcb->control); } @@ -258,7 +276,7 @@ void sync_nested_vmcb_control(struct vcpu_svm *svm) /* Only a few fields of int_ctl are written by the processor. */ mask = V_IRQ_MASK | V_TPR_MASK; if (!(svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) && - is_intercept(svm, INTERCEPT_VINTR)) { + svm_is_intercept(svm, INTERCEPT_VINTR)) { /* * In order to request an interrupt window, L0 is usurping * svm->vmcb->control.int_ctl and possibly setting V_IRQ @@ -310,6 +328,42 @@ static void nested_vmcb_save_pending_event(struct vcpu_svm *svm, nested_vmcb->control.exit_int_info = exit_int_info; } +static inline bool nested_npt_enabled(struct vcpu_svm *svm) +{ + return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE; +} + +/* + * Load guest's/host's cr3 on nested vmentry or vmexit. @nested_npt is true + * if we are emulating VM-Entry into a guest with NPT enabled. + */ +static int nested_svm_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, + bool nested_npt) +{ + if (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63)) + return -EINVAL; + + if (!nested_npt && is_pae_paging(vcpu) && + (cr3 != kvm_read_cr3(vcpu) || pdptrs_changed(vcpu))) { + if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) + return -EINVAL; + } + + /* + * TODO: optimize unconditional TLB flush/MMU sync here and in + * kvm_init_shadow_npt_mmu(). + */ + if (!nested_npt) + kvm_mmu_new_pgd(vcpu, cr3, false, false); + + vcpu->arch.cr3 = cr3; + kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); + + kvm_init_mmu(vcpu, false); + + return 0; +} + static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_vmcb) { /* Load the nested guest state */ @@ -323,8 +377,6 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_v svm_set_efer(&svm->vcpu, nested_vmcb->save.efer); svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0); svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4); - (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); - svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2; kvm_rax_write(&svm->vcpu, nested_vmcb->save.rax); kvm_rsp_write(&svm->vcpu, nested_vmcb->save.rsp); @@ -342,14 +394,10 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *nested_v static void nested_prepare_vmcb_control(struct vcpu_svm *svm) { const u32 mask = V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK; - if (svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) + + if (nested_npt_enabled(svm)) nested_svm_init_mmu_context(&svm->vcpu); - /* Guest paging mode is active - reset mmu */ - kvm_mmu_reset_context(&svm->vcpu); - - svm_flush_tlb(&svm->vcpu); - svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset + svm->nested.ctl.tsc_offset; @@ -375,18 +423,27 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm) */ recalc_intercepts(svm); - mark_all_dirty(svm->vmcb); + vmcb_mark_all_dirty(svm->vmcb); } -void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, +int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, struct vmcb *nested_vmcb) { + int ret; + svm->nested.vmcb = vmcb_gpa; load_nested_vmcb_control(svm, &nested_vmcb->control); nested_prepare_vmcb_save(svm, nested_vmcb); nested_prepare_vmcb_control(svm); + ret = nested_svm_load_cr3(&svm->vcpu, nested_vmcb->save.cr3, + nested_npt_enabled(svm)); + if (ret) + return ret; + svm_set_gif(svm, true); + + return 0; } int nested_svm_vmrun(struct vcpu_svm *svm) @@ -416,7 +473,7 @@ int nested_svm_vmrun(struct vcpu_svm *svm) nested_vmcb = map.hva; - if (!nested_vmcb_checks(nested_vmcb)) { + if (!nested_vmcb_checks(svm, nested_vmcb)) { nested_vmcb->control.exit_code = SVM_EXIT_ERR; nested_vmcb->control.exit_code_hi = 0; nested_vmcb->control.exit_info_1 = 0; @@ -464,16 +521,22 @@ int nested_svm_vmrun(struct vcpu_svm *svm) copy_vmcb_control_area(&hsave->control, &vmcb->control); svm->nested.nested_run_pending = 1; - enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb); - if (!nested_svm_vmrun_msrpm(svm)) { - svm->vmcb->control.exit_code = SVM_EXIT_ERR; - svm->vmcb->control.exit_code_hi = 0; - svm->vmcb->control.exit_info_1 = 0; - svm->vmcb->control.exit_info_2 = 0; + if (enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb)) + goto out_exit_err; - nested_svm_vmexit(svm); - } + if (nested_svm_vmrun_msrpm(svm)) + goto out; + +out_exit_err: + svm->nested.nested_run_pending = 0; + + svm->vmcb->control.exit_code = SVM_EXIT_ERR; + svm->vmcb->control.exit_code_hi = 0; + svm->vmcb->control.exit_info_1 = 0; + svm->vmcb->control.exit_info_2 = 0; + + nested_svm_vmexit(svm); out: kvm_vcpu_unmap(&svm->vcpu, &map, true); @@ -585,12 +648,6 @@ int nested_svm_vmexit(struct vcpu_svm *svm) svm_set_efer(&svm->vcpu, hsave->save.efer); svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE); svm_set_cr4(&svm->vcpu, hsave->save.cr4); - if (npt_enabled) { - svm->vmcb->save.cr3 = hsave->save.cr3; - svm->vcpu.arch.cr3 = hsave->save.cr3; - } else { - (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3); - } kvm_rax_write(&svm->vcpu, hsave->save.rax); kvm_rsp_write(&svm->vcpu, hsave->save.rsp); kvm_rip_write(&svm->vcpu, hsave->save.rip); @@ -598,7 +655,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) svm->vmcb->save.cpl = 0; svm->vmcb->control.exit_int_info = 0; - mark_all_dirty(svm->vmcb); + vmcb_mark_all_dirty(svm->vmcb); trace_kvm_nested_vmexit_inject(nested_vmcb->control.exit_code, nested_vmcb->control.exit_info_1, @@ -610,8 +667,13 @@ int nested_svm_vmexit(struct vcpu_svm *svm) kvm_vcpu_unmap(&svm->vcpu, &map, true); nested_svm_uninit_mmu_context(&svm->vcpu); - kvm_mmu_reset_context(&svm->vcpu); - kvm_mmu_load(&svm->vcpu); + + rc = nested_svm_load_cr3(&svm->vcpu, hsave->save.cr3, false); + if (rc) + return 1; + + if (npt_enabled) + svm->vmcb->save.cr3 = hsave->save.cr3; /* * Drop what we picked up for L2 via svm_complete_interrupts() so it diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 5573a97f1520..402dc4234e39 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -313,13 +313,15 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, int write) { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - unsigned long npages, npinned, size; + unsigned long npages, size; + int npinned; unsigned long locked, lock_limit; struct page **pages; unsigned long first, last; + int ret; if (ulen == 0 || uaddr + ulen < uaddr) - return NULL; + return ERR_PTR(-EINVAL); /* Calculate number of pages. */ first = (uaddr & PAGE_MASK) >> PAGE_SHIFT; @@ -330,9 +332,12 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; if (locked > lock_limit && !capable(CAP_IPC_LOCK)) { pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked, lock_limit); - return NULL; + return ERR_PTR(-ENOMEM); } + if (WARN_ON_ONCE(npages > INT_MAX)) + return ERR_PTR(-EINVAL); + /* Avoid using vmalloc for smaller buffers. */ size = npages * sizeof(struct page *); if (size > PAGE_SIZE) @@ -341,12 +346,13 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, pages = kmalloc(size, GFP_KERNEL_ACCOUNT); if (!pages) - return NULL; + return ERR_PTR(-ENOMEM); /* Pin the user virtual address. */ - npinned = get_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages); + npinned = pin_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages); if (npinned != npages) { pr_err("SEV: Failure locking %lu pages.\n", npages); + ret = -ENOMEM; goto err; } @@ -357,10 +363,10 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, err: if (npinned > 0) - release_pages(pages, npinned); + unpin_user_pages(pages, npinned); kvfree(pages); - return NULL; + return ERR_PTR(ret); } static void sev_unpin_memory(struct kvm *kvm, struct page **pages, @@ -368,7 +374,7 @@ static void sev_unpin_memory(struct kvm *kvm, struct page **pages, { struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; - release_pages(pages, npages); + unpin_user_pages(pages, npages); kvfree(pages); sev->pages_locked -= npages; } @@ -434,8 +440,8 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) /* Lock the user memory. */ inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1); - if (!inpages) { - ret = -ENOMEM; + if (IS_ERR(inpages)) { + ret = PTR_ERR(inpages); goto e_free; } @@ -789,13 +795,13 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) /* lock userspace source and destination page */ src_p = sev_pin_memory(kvm, vaddr & PAGE_MASK, PAGE_SIZE, &n, 0); - if (!src_p) - return -EFAULT; + if (IS_ERR(src_p)) + return PTR_ERR(src_p); dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, 1); - if (!dst_p) { + if (IS_ERR(dst_p)) { sev_unpin_memory(kvm, src_p, n); - return -EFAULT; + return PTR_ERR(dst_p); } /* @@ -860,8 +866,8 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) return -EFAULT; pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, 1); - if (!pages) - return -ENOMEM; + if (IS_ERR(pages)) + return PTR_ERR(pages); /* * The secret must be copied into contiguous memory region, lets verify @@ -987,8 +993,8 @@ int svm_register_enc_region(struct kvm *kvm, return -ENOMEM; region->pages = sev_pin_memory(kvm, range->addr, range->size, ®ion->npages, 1); - if (!region->pages) { - ret = -ENOMEM; + if (IS_ERR(region->pages)) { + ret = PTR_ERR(region->pages); goto e_free; } @@ -1180,11 +1186,10 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu) * 2) or this VMCB was executed on different host CPU in previous VMRUNs. */ if (sd->sev_vmcbs[asid] == svm->vmcb && - svm->last_cpu == cpu) + svm->vcpu.arch.last_vmentry_cpu == cpu) return; - svm->last_cpu = cpu; sd->sev_vmcbs[asid] = svm->vmcb; svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; - mark_dirty(svm->vmcb, VMCB_ASID); + vmcb_mark_dirty(svm->vmcb, VMCB_ASID); } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 5bbf76189afa..03dd7bac8034 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -254,7 +254,7 @@ static inline void invlpga(unsigned long addr, u32 asid) asm volatile (__ex("invlpga %1, %0") : : "c"(asid), "a"(addr)); } -static int get_npt_level(struct kvm_vcpu *vcpu) +static int get_max_npt_level(void) { #ifdef CONFIG_X86_64 return PT64_ROOT_4LEVEL; @@ -282,7 +282,7 @@ void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) } svm->vmcb->save.efer = efer | EFER_SVME; - mark_dirty(svm->vmcb, VMCB_CR); + vmcb_mark_dirty(svm->vmcb, VMCB_CR); } static int is_external_interrupt(u32 info) @@ -713,7 +713,7 @@ static void grow_ple_window(struct kvm_vcpu *vcpu) pause_filter_count_max); if (control->pause_filter_count != old) { - mark_dirty(svm->vmcb, VMCB_INTERCEPTS); + vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); trace_kvm_ple_window_update(vcpu->vcpu_id, control->pause_filter_count, old); } @@ -731,7 +731,7 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu) pause_filter_count_shrink, pause_filter_count); if (control->pause_filter_count != old) { - mark_dirty(svm->vmcb, VMCB_INTERCEPTS); + vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); trace_kvm_ple_window_update(vcpu->vcpu_id, control->pause_filter_count, old); } @@ -885,7 +885,7 @@ static __init int svm_hardware_setup(void) if (npt_enabled && !npt) npt_enabled = false; - kvm_configure_mmu(npt_enabled, PG_LEVEL_1G); + kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G); pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); if (nrips) { @@ -924,6 +924,21 @@ static __init int svm_hardware_setup(void) svm_set_cpu_caps(); + /* + * It seems that on AMD processors PTE's accessed bit is + * being set by the CPU hardware before the NPF vmexit. + * This is not expected behaviour and our tests fail because + * of it. + * A workaround here is to disable support for + * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled. + * In this case userspace can know if there is support using + * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle + * it + * If future AMD CPU models change the behaviour described above, + * this variable can be changed accordingly + */ + allow_smaller_maxphyaddr = !npt_enabled; + return 0; err: @@ -966,7 +981,7 @@ static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) svm->vmcb->control.tsc_offset = offset + g_tsc_offset; - mark_dirty(svm->vmcb, VMCB_INTERCEPTS); + vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS); return svm->vmcb->control.tsc_offset; } @@ -1002,38 +1017,38 @@ static void init_vmcb(struct vcpu_svm *svm) if (enable_vmware_backdoor) set_exception_intercept(svm, GP_VECTOR); - set_intercept(svm, INTERCEPT_INTR); - set_intercept(svm, INTERCEPT_NMI); - set_intercept(svm, INTERCEPT_SMI); - set_intercept(svm, INTERCEPT_SELECTIVE_CR0); - set_intercept(svm, INTERCEPT_RDPMC); - set_intercept(svm, INTERCEPT_CPUID); - set_intercept(svm, INTERCEPT_INVD); - set_intercept(svm, INTERCEPT_INVLPG); - set_intercept(svm, INTERCEPT_INVLPGA); - set_intercept(svm, INTERCEPT_IOIO_PROT); - set_intercept(svm, INTERCEPT_MSR_PROT); - set_intercept(svm, INTERCEPT_TASK_SWITCH); - set_intercept(svm, INTERCEPT_SHUTDOWN); - set_intercept(svm, INTERCEPT_VMRUN); - set_intercept(svm, INTERCEPT_VMMCALL); - set_intercept(svm, INTERCEPT_VMLOAD); - set_intercept(svm, INTERCEPT_VMSAVE); - set_intercept(svm, INTERCEPT_STGI); - set_intercept(svm, INTERCEPT_CLGI); - set_intercept(svm, INTERCEPT_SKINIT); - set_intercept(svm, INTERCEPT_WBINVD); - set_intercept(svm, INTERCEPT_XSETBV); - set_intercept(svm, INTERCEPT_RDPRU); - set_intercept(svm, INTERCEPT_RSM); + svm_set_intercept(svm, INTERCEPT_INTR); + svm_set_intercept(svm, INTERCEPT_NMI); + svm_set_intercept(svm, INTERCEPT_SMI); + svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0); + svm_set_intercept(svm, INTERCEPT_RDPMC); + svm_set_intercept(svm, INTERCEPT_CPUID); + svm_set_intercept(svm, INTERCEPT_INVD); + svm_set_intercept(svm, INTERCEPT_INVLPG); + svm_set_intercept(svm, INTERCEPT_INVLPGA); + svm_set_intercept(svm, INTERCEPT_IOIO_PROT); + svm_set_intercept(svm, INTERCEPT_MSR_PROT); + svm_set_intercept(svm, INTERCEPT_TASK_SWITCH); + svm_set_intercept(svm, INTERCEPT_SHUTDOWN); + svm_set_intercept(svm, INTERCEPT_VMRUN); + svm_set_intercept(svm, INTERCEPT_VMMCALL); + svm_set_intercept(svm, INTERCEPT_VMLOAD); + svm_set_intercept(svm, INTERCEPT_VMSAVE); + svm_set_intercept(svm, INTERCEPT_STGI); + svm_set_intercept(svm, INTERCEPT_CLGI); + svm_set_intercept(svm, INTERCEPT_SKINIT); + svm_set_intercept(svm, INTERCEPT_WBINVD); + svm_set_intercept(svm, INTERCEPT_XSETBV); + svm_set_intercept(svm, INTERCEPT_RDPRU); + svm_set_intercept(svm, INTERCEPT_RSM); if (!kvm_mwait_in_guest(svm->vcpu.kvm)) { - set_intercept(svm, INTERCEPT_MONITOR); - set_intercept(svm, INTERCEPT_MWAIT); + svm_set_intercept(svm, INTERCEPT_MONITOR); + svm_set_intercept(svm, INTERCEPT_MWAIT); } if (!kvm_hlt_in_guest(svm->vcpu.kvm)) - set_intercept(svm, INTERCEPT_HLT); + svm_set_intercept(svm, INTERCEPT_HLT); control->iopm_base_pa = __sme_set(iopm_base); control->msrpm_base_pa = __sme_set(__pa(svm->msrpm)); @@ -1077,7 +1092,7 @@ static void init_vmcb(struct vcpu_svm *svm) if (npt_enabled) { /* Setup VMCB for Nested Paging */ control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE; - clr_intercept(svm, INTERCEPT_INVLPG); + svm_clr_intercept(svm, INTERCEPT_INVLPG); clr_exception_intercept(svm, PF_VECTOR); clr_cr_intercept(svm, INTERCEPT_CR3_READ); clr_cr_intercept(svm, INTERCEPT_CR3_WRITE); @@ -1094,9 +1109,9 @@ static void init_vmcb(struct vcpu_svm *svm) control->pause_filter_count = pause_filter_count; if (pause_filter_thresh) control->pause_filter_thresh = pause_filter_thresh; - set_intercept(svm, INTERCEPT_PAUSE); + svm_set_intercept(svm, INTERCEPT_PAUSE); } else { - clr_intercept(svm, INTERCEPT_PAUSE); + svm_clr_intercept(svm, INTERCEPT_PAUSE); } if (kvm_vcpu_apicv_active(&svm->vcpu)) @@ -1107,14 +1122,14 @@ static void init_vmcb(struct vcpu_svm *svm) * in VMCB and clear intercepts to avoid #VMEXIT. */ if (vls) { - clr_intercept(svm, INTERCEPT_VMLOAD); - clr_intercept(svm, INTERCEPT_VMSAVE); + svm_clr_intercept(svm, INTERCEPT_VMLOAD); + svm_clr_intercept(svm, INTERCEPT_VMSAVE); svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK; } if (vgif) { - clr_intercept(svm, INTERCEPT_STGI); - clr_intercept(svm, INTERCEPT_CLGI); + svm_clr_intercept(svm, INTERCEPT_STGI); + svm_clr_intercept(svm, INTERCEPT_CLGI); svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK; } @@ -1123,7 +1138,7 @@ static void init_vmcb(struct vcpu_svm *svm) clr_exception_intercept(svm, UD_VECTOR); } - mark_all_dirty(svm->vmcb); + vmcb_mark_all_dirty(svm->vmcb); enable_gif(svm); @@ -1257,7 +1272,7 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (unlikely(cpu != vcpu->cpu)) { svm->asid_generation = 0; - mark_all_dirty(svm->vmcb); + vmcb_mark_all_dirty(svm->vmcb); } #ifdef CONFIG_X86_64 @@ -1356,7 +1371,7 @@ static void svm_set_vintr(struct vcpu_svm *svm) /* The following fields are ignored when AVIC is enabled */ WARN_ON(kvm_vcpu_apicv_active(&svm->vcpu)); - set_intercept(svm, INTERCEPT_VINTR); + svm_set_intercept(svm, INTERCEPT_VINTR); /* * This is just a dummy VINTR to actually cause a vmexit to happen. @@ -1367,13 +1382,13 @@ static void svm_set_vintr(struct vcpu_svm *svm) control->int_ctl &= ~V_INTR_PRIO_MASK; control->int_ctl |= V_IRQ_MASK | ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); - mark_dirty(svm->vmcb, VMCB_INTR); + vmcb_mark_dirty(svm->vmcb, VMCB_INTR); } static void svm_clear_vintr(struct vcpu_svm *svm) { const u32 mask = V_TPR_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK | V_INTR_MASKING_MASK; - clr_intercept(svm, INTERCEPT_VINTR); + svm_clr_intercept(svm, INTERCEPT_VINTR); /* Drop int_ctl fields related to VINTR injection. */ svm->vmcb->control.int_ctl &= mask; @@ -1385,7 +1400,7 @@ static void svm_clear_vintr(struct vcpu_svm *svm) svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl & ~mask; } - mark_dirty(svm->vmcb, VMCB_INTR); + vmcb_mark_dirty(svm->vmcb, VMCB_INTR); } static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) @@ -1503,7 +1518,7 @@ static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) svm->vmcb->save.idtr.limit = dt->size; svm->vmcb->save.idtr.base = dt->address ; - mark_dirty(svm->vmcb, VMCB_DT); + vmcb_mark_dirty(svm->vmcb, VMCB_DT); } static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) @@ -1520,7 +1535,7 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) svm->vmcb->save.gdtr.limit = dt->size; svm->vmcb->save.gdtr.base = dt->address ; - mark_dirty(svm->vmcb, VMCB_DT); + vmcb_mark_dirty(svm->vmcb, VMCB_DT); } static void update_cr0_intercept(struct vcpu_svm *svm) @@ -1531,7 +1546,7 @@ static void update_cr0_intercept(struct vcpu_svm *svm) *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) | (gcr0 & SVM_CR0_SELECTIVE_MASK); - mark_dirty(svm->vmcb, VMCB_CR); + vmcb_mark_dirty(svm->vmcb, VMCB_CR); if (gcr0 == *hcr0) { clr_cr_intercept(svm, INTERCEPT_CR0_READ); @@ -1572,7 +1587,7 @@ void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) cr0 &= ~(X86_CR0_CD | X86_CR0_NW); svm->vmcb->save.cr0 = cr0; - mark_dirty(svm->vmcb, VMCB_CR); + vmcb_mark_dirty(svm->vmcb, VMCB_CR); update_cr0_intercept(svm); } @@ -1592,7 +1607,7 @@ int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) cr4 |= X86_CR4_PAE; cr4 |= host_cr4_mce; to_svm(vcpu)->vmcb->save.cr4 = cr4; - mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); + vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); return 0; } @@ -1624,10 +1639,10 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, /* This is symmetric with svm_get_segment() */ svm->vmcb->save.cpl = (var->dpl & 3); - mark_dirty(svm->vmcb, VMCB_SEG); + vmcb_mark_dirty(svm->vmcb, VMCB_SEG); } -static void update_bp_intercept(struct kvm_vcpu *vcpu) +static void update_exception_bitmap(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1636,8 +1651,7 @@ static void update_bp_intercept(struct kvm_vcpu *vcpu) if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) set_exception_intercept(svm, BP_VECTOR); - } else - vcpu->guest_debug = 0; + } } static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) @@ -1651,7 +1665,7 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) svm->asid_generation = sd->asid_generation; svm->vmcb->control.asid = sd->next_asid++; - mark_dirty(svm->vmcb, VMCB_ASID); + vmcb_mark_dirty(svm->vmcb, VMCB_ASID); } static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value) @@ -1660,7 +1674,7 @@ static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value) if (unlikely(value != vmcb->save.dr6)) { vmcb->save.dr6 = value; - mark_dirty(vmcb, VMCB_DR); + vmcb_mark_dirty(vmcb, VMCB_DR); } } @@ -1687,7 +1701,7 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->save.dr7 = value; - mark_dirty(svm->vmcb, VMCB_DR); + vmcb_mark_dirty(svm->vmcb, VMCB_DR); } static int pf_interception(struct vcpu_svm *svm) @@ -2000,8 +2014,8 @@ void svm_set_gif(struct vcpu_svm *svm, bool value) * again while processing KVM_REQ_EVENT if needed. */ if (vgif_enabled(svm)) - clr_intercept(svm, INTERCEPT_STGI); - if (is_intercept(svm, INTERCEPT_VINTR)) + svm_clr_intercept(svm, INTERCEPT_STGI); + if (svm_is_intercept(svm, INTERCEPT_VINTR)) svm_clear_vintr(svm); enable_gif(svm); @@ -2162,7 +2176,7 @@ static int cpuid_interception(struct vcpu_svm *svm) static int iret_interception(struct vcpu_svm *svm) { ++svm->vcpu.stat.nmi_window_exits; - clr_intercept(svm, INTERCEPT_IRET); + svm_clr_intercept(svm, INTERCEPT_IRET); svm->vcpu.arch.hflags |= HF_IRET_MASK; svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu); kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); @@ -2358,8 +2372,10 @@ static int svm_get_msr_feature(struct kvm_msr_entry *msr) if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE; break; + case MSR_IA32_PERF_CAPABILITIES: + return 0; default: - return 1; + return KVM_MSR_RET_INVALID; } return 0; @@ -2512,7 +2528,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) return 1; vcpu->arch.pat = data; svm->vmcb->save.g_pat = data; - mark_dirty(svm->vmcb, VMCB_NPT); + vmcb_mark_dirty(svm->vmcb, VMCB_NPT); break; case MSR_IA32_SPEC_CTRL: if (!msr->host_initiated && @@ -2522,7 +2538,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) return 1; - if (data & ~kvm_spec_ctrl_valid_bits(vcpu)) + if (kvm_spec_ctrl_test_value(data)) return 1; svm->spec_ctrl = data; @@ -2617,7 +2633,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) return 1; svm->vmcb->save.dbgctl = data; - mark_dirty(svm->vmcb, VMCB_LBR); + vmcb_mark_dirty(svm->vmcb, VMCB_LBR); if (data & (1ULL<<0)) svm_enable_lbrv(svm); else @@ -2947,6 +2963,7 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; kvm_run->fail_entry.hardware_entry_failure_reason = svm->vmcb->control.exit_code; + kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; dump_vmcb(vcpu); return 0; } @@ -2970,8 +2987,9 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; - vcpu->run->internal.ndata = 1; + vcpu->run->internal.ndata = 2; vcpu->run->internal.data[0] = exit_code; + vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; return 0; } @@ -2992,21 +3010,18 @@ static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) static void reload_tss(struct kvm_vcpu *vcpu) { - int cpu = raw_smp_processor_id(); + struct svm_cpu_data *sd = per_cpu(svm_data, vcpu->cpu); - struct svm_cpu_data *sd = per_cpu(svm_data, cpu); sd->tss_desc->type = 9; /* available 32/64-bit TSS */ load_TR_desc(); } static void pre_svm_run(struct vcpu_svm *svm) { - int cpu = raw_smp_processor_id(); - - struct svm_cpu_data *sd = per_cpu(svm_data, cpu); + struct svm_cpu_data *sd = per_cpu(svm_data, svm->vcpu.cpu); if (sev_guest(svm->vcpu.kvm)) - return pre_sev_run(svm, cpu); + return pre_sev_run(svm, svm->vcpu.cpu); /* FIXME: handle wraparound of asid_generation */ if (svm->asid_generation != sd->asid_generation) @@ -3019,7 +3034,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu) svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; vcpu->arch.hflags |= HF_NMI_MASK; - set_intercept(svm, INTERCEPT_IRET); + svm_set_intercept(svm, INTERCEPT_IRET); ++vcpu->stat.nmi_injections; } @@ -3040,7 +3055,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) { struct vcpu_svm *svm = to_svm(vcpu); - if (svm_nested_virtualize_tpr(vcpu)) + if (nested_svm_virtualize_tpr(vcpu)) return; clr_cr_intercept(svm, INTERCEPT_CR8_WRITE); @@ -3096,10 +3111,10 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) if (masked) { svm->vcpu.arch.hflags |= HF_NMI_MASK; - set_intercept(svm, INTERCEPT_IRET); + svm_set_intercept(svm, INTERCEPT_IRET); } else { svm->vcpu.arch.hflags &= ~HF_NMI_MASK; - clr_intercept(svm, INTERCEPT_IRET); + svm_clr_intercept(svm, INTERCEPT_IRET); } } @@ -3179,7 +3194,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) if (!gif_set(svm)) { if (vgif_enabled(svm)) - set_intercept(svm, INTERCEPT_STGI); + svm_set_intercept(svm, INTERCEPT_STGI); return; /* STGI will cause a vm exit */ } @@ -3234,7 +3249,7 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - if (svm_nested_virtualize_tpr(vcpu)) + if (nested_svm_virtualize_tpr(vcpu)) return; if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) { @@ -3248,7 +3263,7 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); u64 cr8; - if (svm_nested_virtualize_tpr(vcpu) || + if (nested_svm_virtualize_tpr(vcpu) || kvm_vcpu_apicv_active(vcpu)) return; @@ -3344,6 +3359,60 @@ static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) void __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs); +static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, + struct vcpu_svm *svm) +{ + /* + * VMENTER enables interrupts (host state), but the kernel state is + * interrupts disabled when this is invoked. Also tell RCU about + * it. This is the same logic as for exit_to_user_mode(). + * + * This ensures that e.g. latency analysis on the host observes + * guest mode as interrupt enabled. + * + * guest_enter_irqoff() informs context tracking about the + * transition to guest mode and if enabled adjusts RCU state + * accordingly. + */ + instrumentation_begin(); + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(CALLER_ADDR0); + instrumentation_end(); + + guest_enter_irqoff(); + lockdep_hardirqs_on(CALLER_ADDR0); + + __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs); + +#ifdef CONFIG_X86_64 + native_wrmsrl(MSR_GS_BASE, svm->host.gs_base); +#else + loadsegment(fs, svm->host.fs); +#ifndef CONFIG_X86_32_LAZY_GS + loadsegment(gs, svm->host.gs); +#endif +#endif + + /* + * VMEXIT disables interrupts (host state), but tracing and lockdep + * have them in state 'on' as recorded before entering guest mode. + * Same as enter_from_user_mode(). + * + * guest_exit_irqoff() restores host context and reinstates RCU if + * enabled and required. + * + * This needs to be done before the below as native_read_msr() + * contains a tracepoint and x86_spec_ctrl_restore_host() calls + * into world and some more. + */ + lockdep_hardirqs_off(CALLER_ADDR0); + guest_exit_irqoff(); + + instrumentation_begin(); + trace_hardirqs_off_finish(); + instrumentation_end(); +} + static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) { fastpath_t exit_fastpath; @@ -3399,16 +3468,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) */ x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl); - __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs); - -#ifdef CONFIG_X86_64 - wrmsrl(MSR_GS_BASE, svm->host.gs_base); -#else - loadsegment(fs, svm->host.fs); -#ifndef CONFIG_X86_32_LAZY_GS - loadsegment(gs, svm->host.gs); -#endif -#endif + svm_vcpu_enter_exit(vcpu, svm); /* * We do not use IBRS in the kernel. If this vCPU has used the @@ -3477,11 +3537,12 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) SVM_EXIT_EXCP_BASE + MC_VECTOR)) svm_handle_mce(svm); - mark_all_clean(svm->vmcb); + vmcb_mark_all_clean(svm->vmcb); return exit_fastpath; } -static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root) +static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root, + int root_level) { struct vcpu_svm *svm = to_svm(vcpu); unsigned long cr3; @@ -3489,7 +3550,7 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root) cr3 = __sme_set(root); if (npt_enabled) { svm->vmcb->control.nested_cr3 = cr3; - mark_dirty(svm->vmcb, VMCB_NPT); + vmcb_mark_dirty(svm->vmcb, VMCB_NPT); /* Loading L2's CR3 is handled by enter_svm_guest_mode. */ if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail)) @@ -3498,7 +3559,7 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root) } svm->vmcb->save.cr3 = cr3; - mark_dirty(svm->vmcb, VMCB_CR); + vmcb_mark_dirty(svm->vmcb, VMCB_CR); } static int is_disabled(void) @@ -3551,7 +3612,7 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) return 0; } -static void svm_cpuid_update(struct kvm_vcpu *vcpu) +static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3843,6 +3904,7 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) struct kvm_host_map map; u64 guest; u64 vmcb; + int ret = 0; guest = GET_SMSTATE(u64, smstate, 0x7ed8); vmcb = GET_SMSTATE(u64, smstate, 0x7ee0); @@ -3851,10 +3913,11 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) if (kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb), &map) == -EINVAL) return 1; nested_vmcb = map.hva; - enter_svm_guest_mode(svm, vmcb, nested_vmcb); + ret = enter_svm_guest_mode(svm, vmcb, nested_vmcb); kvm_vcpu_unmap(&svm->vcpu, &map, true); } - return 0; + + return ret; } static void enable_smi_window(struct kvm_vcpu *vcpu) @@ -3863,7 +3926,7 @@ static void enable_smi_window(struct kvm_vcpu *vcpu) if (!gif_set(svm)) { if (vgif_enabled(svm)) - set_intercept(svm, INTERCEPT_STGI); + svm_set_intercept(svm, INTERCEPT_STGI); /* STGI will cause a vm exit */ } else { /* We must be in SMM; RSM will cause a vmexit anyway. */ @@ -3992,7 +4055,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .vcpu_blocking = svm_vcpu_blocking, .vcpu_unblocking = svm_vcpu_unblocking, - .update_bp_intercept = update_bp_intercept, + .update_exception_bitmap = update_exception_bitmap, .get_msr_feature = svm_get_msr_feature, .get_msr = svm_get_msr, .set_msr = svm_set_msr, @@ -4049,12 +4112,11 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_tss_addr = svm_set_tss_addr, .set_identity_map_addr = svm_set_identity_map_addr, - .get_tdp_level = get_npt_level, .get_mt_mask = svm_get_mt_mask, .get_exit_info = svm_get_exit_info, - .cpuid_update = svm_cpuid_update, + .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid, .has_wbinvd_exit = svm_has_wbinvd_exit, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 6ac4c00a5d82..a798e1731709 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -81,7 +81,7 @@ struct kvm_svm { struct kvm_vcpu; -struct nested_state { +struct svm_nested_state { struct vmcb *hsave; u64 hsave_msr; u64 vm_cr_msr; @@ -133,7 +133,7 @@ struct vcpu_svm { ulong nmi_iret_rip; - struct nested_state nested; + struct svm_nested_state nested; bool nmi_singlestep; u64 nmi_singlestep_guest_rflags; @@ -158,9 +158,6 @@ struct vcpu_svm { */ struct list_head ir_list; spinlock_t ir_list_lock; - - /* which host CPU was used for running this vcpu */ - unsigned int last_cpu; }; struct svm_cpu_data { @@ -188,18 +185,18 @@ static inline struct kvm_svm *to_kvm_svm(struct kvm *kvm) return container_of(kvm, struct kvm_svm, kvm); } -static inline void mark_all_dirty(struct vmcb *vmcb) +static inline void vmcb_mark_all_dirty(struct vmcb *vmcb) { vmcb->control.clean = 0; } -static inline void mark_all_clean(struct vmcb *vmcb) +static inline void vmcb_mark_all_clean(struct vmcb *vmcb) { vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1) & ~VMCB_ALWAYS_DIRTY_MASK; } -static inline void mark_dirty(struct vmcb *vmcb, int bit) +static inline void vmcb_mark_dirty(struct vmcb *vmcb, int bit) { vmcb->control.clean &= ~(1 << bit); } @@ -293,7 +290,7 @@ static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit) recalc_intercepts(svm); } -static inline void set_intercept(struct vcpu_svm *svm, int bit) +static inline void svm_set_intercept(struct vcpu_svm *svm, int bit) { struct vmcb *vmcb = get_host_vmcb(svm); @@ -302,7 +299,7 @@ static inline void set_intercept(struct vcpu_svm *svm, int bit) recalc_intercepts(svm); } -static inline void clr_intercept(struct vcpu_svm *svm, int bit) +static inline void svm_clr_intercept(struct vcpu_svm *svm, int bit) { struct vmcb *vmcb = get_host_vmcb(svm); @@ -311,7 +308,7 @@ static inline void clr_intercept(struct vcpu_svm *svm, int bit) recalc_intercepts(svm); } -static inline bool is_intercept(struct vcpu_svm *svm, int bit) +static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit) { return (svm->vmcb->control.intercept & (1ULL << bit)) != 0; } @@ -346,7 +343,10 @@ static inline bool gif_set(struct vcpu_svm *svm) } /* svm.c */ -#define MSR_INVALID 0xffffffffU +#define MSR_CR3_LEGACY_RESERVED_MASK 0xfe7U +#define MSR_CR3_LEGACY_PAE_RESERVED_MASK 0x7U +#define MSR_CR3_LONG_RESERVED_MASK 0xfff0000000000fe7U +#define MSR_INVALID 0xffffffffU u32 svm_msrpm_offset(u32 msr); void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer); @@ -365,7 +365,7 @@ void svm_set_gif(struct vcpu_svm *svm, bool value); #define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ #define NESTED_EXIT_CONTINUE 2 /* Further checks needed */ -static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu) +static inline bool nested_svm_virtualize_tpr(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -387,8 +387,8 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm) return (svm->nested.ctl.intercept & (1ULL << INTERCEPT_NMI)); } -void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, - struct vmcb *nested_vmcb); +int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, + struct vmcb *nested_vmcb); void svm_leave_nested(struct vcpu_svm *svm); int nested_svm_vmrun(struct vcpu_svm *svm); void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb); @@ -420,7 +420,7 @@ extern int avic; static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data) { svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK; - mark_dirty(svm->vmcb, VMCB_AVIC); + vmcb_mark_dirty(svm->vmcb, VMCB_AVIC); } static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index bf944334003a..1ec1ac40e328 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -27,7 +27,7 @@ #define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE #endif - .text +.section .noinstr.text, "ax" /** * __svm_vcpu_run - Run a vCPU via a transition to SVM guest mode diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 11e4df560018..23b58c28a1c9 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -171,15 +171,6 @@ static int nested_vmx_failInvalid(struct kvm_vcpu *vcpu) static int nested_vmx_failValid(struct kvm_vcpu *vcpu, u32 vm_instruction_error) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - - /* - * failValid writes the error number to the current VMCS, which - * can't be done if there isn't a current VMCS. - */ - if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs) - return nested_vmx_failInvalid(vcpu); - vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu) & ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_SF | X86_EFLAGS_OF)) @@ -192,6 +183,20 @@ static int nested_vmx_failValid(struct kvm_vcpu *vcpu, return kvm_skip_emulated_instruction(vcpu); } +static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* + * failValid writes the error number to the current VMCS, which + * can't be done if there isn't a current VMCS. + */ + if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs) + return nested_vmx_failInvalid(vcpu); + + return nested_vmx_failValid(vcpu, vm_instruction_error); +} + static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator) { /* TODO: not to reset guest simply here. */ @@ -2157,7 +2162,8 @@ static void prepare_vmcs02_constant_state(struct vcpu_vmx *vmx) * consistency checks. */ if (enable_ept && nested_early_check) - vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0)); + vmcs_write64(EPT_POINTER, + construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL)); /* All VMFUNCs are currently emulated through L0 vmexits. */ if (cpu_has_vmx_vmfunc()) @@ -2433,22 +2439,28 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) /* * Whether page-faults are trapped is determined by a combination of - * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. - * If enable_ept, L0 doesn't care about page faults and we should - * set all of these to L1's desires. However, if !enable_ept, L0 does - * care about (at least some) page faults, and because it is not easy - * (if at all possible?) to merge L0 and L1's desires, we simply ask - * to exit on each and every L2 page fault. This is done by setting - * MASK=MATCH=0 and (see below) EB.PF=1. + * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0 + * doesn't care about page faults then we should set all of these to + * L1's desires. However, if L0 does care about (some) page faults, it + * is not easy (if at all possible?) to merge L0 and L1's desires, we + * simply ask to exit on each and every L2 page fault. This is done by + * setting MASK=MATCH=0 and (see below) EB.PF=1. * Note that below we don't need special code to set EB.PF beyond the * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when * !enable_ept, EB.PF is 1, so the "or" will always be 1. */ - vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, - enable_ept ? vmcs12->page_fault_error_code_mask : 0); - vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, - enable_ept ? vmcs12->page_fault_error_code_match : 0); + if (vmx_need_pf_intercept(&vmx->vcpu)) { + /* + * TODO: if both L0 and L1 need the same MASK and MATCH, + * go ahead and use it? + */ + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0); + } else { + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match); + } if (cpu_has_vmx_apicv()) { vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); @@ -3205,6 +3217,43 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) return true; } +static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + struct vmcs12 *vmcs12; + struct vcpu_vmx *vmx = to_vmx(vcpu); + gpa_t dst; + + if (WARN_ON_ONCE(!is_guest_mode(vcpu))) + return 0; + + if (WARN_ON_ONCE(vmx->nested.pml_full)) + return 1; + + /* + * Check if PML is enabled for the nested guest. Whether eptp bit 6 is + * set is already checked as part of A/D emulation. + */ + vmcs12 = get_vmcs12(vcpu); + if (!nested_cpu_has_pml(vmcs12)) + return 0; + + if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { + vmx->nested.pml_full = true; + return 1; + } + + gpa &= ~0xFFFull; + dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; + + if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, + offset_in_page(dst), sizeof(gpa))) + return 0; + + vmcs12->guest_pml_index--; + + return 0; +} + /* * Intel's VMX Instruction Reference specifies a common set of prerequisites * for running VMX instructions (except VMXON, whose prerequisites are @@ -3456,19 +3505,18 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * when using the merged vmcs02. */ if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS) - return nested_vmx_failValid(vcpu, - VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); + return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS); if (vmcs12->launch_state == launch) - return nested_vmx_failValid(vcpu, + return nested_vmx_fail(vcpu, launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS : VMXERR_VMRESUME_NONLAUNCHED_VMCS); if (nested_vmx_check_controls(vcpu, vmcs12)) - return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); if (nested_vmx_check_host_state(vcpu, vmcs12)) - return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); + return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); /* * We're finally done with prerequisite checking, and can start with @@ -3517,7 +3565,7 @@ vmentry_failed: if (status == NVMX_VMENTRY_VMEXIT) return 1; WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL); - return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); } /* @@ -4460,7 +4508,7 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason, * flag and the VM-instruction error field of the VMCS * accordingly, and skip the emulated instruction. */ - (void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); /* * Restore L1's host state to KVM's software model. We're here @@ -4760,8 +4808,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) } if (vmx->nested.vmxon) - return nested_vmx_failValid(vcpu, - VMXERR_VMXON_IN_VMX_ROOT_OPERATION); + return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) != VMXON_NEEDED_FEATURES) { @@ -4852,12 +4899,10 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) return r; if (!page_address_valid(vcpu, vmptr)) - return nested_vmx_failValid(vcpu, - VMXERR_VMCLEAR_INVALID_ADDRESS); + return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); if (vmptr == vmx->nested.vmxon_ptr) - return nested_vmx_failValid(vcpu, - VMXERR_VMCLEAR_VMXON_POINTER); + return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); /* * When Enlightened VMEntry is enabled on the calling CPU we treat @@ -4927,8 +4972,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) offset = vmcs_field_to_offset(field); if (offset < 0) - return nested_vmx_failValid(vcpu, - VMXERR_UNSUPPORTED_VMCS_COMPONENT); + return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field)) copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12); @@ -5031,8 +5075,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) offset = vmcs_field_to_offset(field); if (offset < 0) - return nested_vmx_failValid(vcpu, - VMXERR_UNSUPPORTED_VMCS_COMPONENT); + return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); /* * If the vCPU supports "VMWRITE to any supported field in the @@ -5040,8 +5083,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) */ if (vmcs_field_readonly(field) && !nested_cpu_has_vmwrite_any_field(vcpu)) - return nested_vmx_failValid(vcpu, - VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); + return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); /* * Ensure vmcs12 is up-to-date before any VMWRITE that dirties @@ -5116,12 +5158,10 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) return r; if (!page_address_valid(vcpu, vmptr)) - return nested_vmx_failValid(vcpu, - VMXERR_VMPTRLD_INVALID_ADDRESS); + return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); if (vmptr == vmx->nested.vmxon_ptr) - return nested_vmx_failValid(vcpu, - VMXERR_VMPTRLD_VMXON_POINTER); + return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER); /* Forbid normal VMPTRLD if Enlightened version was used */ if (vmx->nested.hv_evmcs) @@ -5138,7 +5178,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) * given physical address won't match the required * VMCS12_REVISION identifier. */ - return nested_vmx_failValid(vcpu, + return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); } @@ -5148,7 +5188,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) (new_vmcs12->hdr.shadow_vmcs && !nested_cpu_has_vmx_shadow_vmcs(vcpu))) { kvm_vcpu_unmap(vcpu, &map, false); - return nested_vmx_failValid(vcpu, + return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); } @@ -5233,8 +5273,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; if (type >= 32 || !(types & (1 << type))) - return nested_vmx_failValid(vcpu, - VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); /* According to the Intel VMX instruction reference, the memory * operand is read even if it isn't needed (e.g., for type==global) @@ -5255,7 +5294,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) switch (type) { case VMX_EPT_EXTENT_CONTEXT: if (!nested_vmx_check_eptp(vcpu, operand.eptp)) - return nested_vmx_failValid(vcpu, + return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); roots_to_free = 0; @@ -5315,7 +5354,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; if (type >= 32 || !(types & (1 << type))) - return nested_vmx_failValid(vcpu, + return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); /* according to the intel vmx instruction reference, the memory @@ -5329,7 +5368,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) return vmx_handle_memory_failure(vcpu, r, &e); if (operand.vpid >> 16) - return nested_vmx_failValid(vcpu, + return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); vpid02 = nested_get_vpid02(vcpu); @@ -5337,14 +5376,14 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: if (!operand.vpid || is_noncanonical_address(operand.gla, vcpu)) - return nested_vmx_failValid(vcpu, + return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); vpid_sync_vcpu_addr(vpid02, operand.gla); break; case VMX_VPID_EXTENT_SINGLE_CONTEXT: case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: if (!operand.vpid) - return nested_vmx_failValid(vcpu, + return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); vpid_sync_context(vpid02); break; @@ -6333,7 +6372,8 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps) /* * secondary cpu-based controls. Do not include those that - * depend on CPUID bits, they are added later by vmx_cpuid_update. + * depend on CPUID bits, they are added later by + * vmx_vcpu_after_set_cpuid. */ if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, @@ -6514,6 +6554,7 @@ struct kvm_x86_nested_ops vmx_nested_ops = { .get_state = vmx_get_nested_state, .set_state = vmx_set_nested_state, .get_vmcs12_pages = nested_get_vmcs12_pages, + .write_log_dirty = nested_vmx_write_pml_buffer, .enable_evmcs = nested_enable_evmcs, .get_evmcs_version = nested_get_evmcs_version, }; diff --git a/arch/x86/kvm/vmx/ops.h b/arch/x86/kvm/vmx/ops.h index 5f1ac002b4b6..692b0c31c9c8 100644 --- a/arch/x86/kvm/vmx/ops.h +++ b/arch/x86/kvm/vmx/ops.h @@ -146,7 +146,9 @@ do { \ : : op1 : "cc" : error, fault); \ return; \ error: \ + instrumentation_begin(); \ insn##_error(error_args); \ + instrumentation_end(); \ return; \ fault: \ kvm_spurious_fault(); \ @@ -161,7 +163,9 @@ do { \ : : op1, op2 : "cc" : error, fault); \ return; \ error: \ + instrumentation_begin(); \ insn##_error(error_args); \ + instrumentation_end(); \ return; \ fault: \ kvm_spurious_fault(); \ diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index bdcce65c7a1d..a886a47daebd 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -180,9 +180,6 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) case MSR_CORE_PERF_GLOBAL_OVF_CTRL: ret = pmu->version > 1; break; - case MSR_IA32_PERF_CAPABILITIES: - ret = 1; - break; default: ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) || @@ -224,12 +221,6 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_CORE_PERF_GLOBAL_OVF_CTRL: msr_info->data = pmu->global_ovf_ctrl; return 0; - case MSR_IA32_PERF_CAPABILITIES: - if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) - return 1; - msr_info->data = vcpu->arch.perf_capabilities; - return 0; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { @@ -289,14 +280,6 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 0; } break; - case MSR_IA32_PERF_CAPABILITIES: - if (!msr_info->host_initiated) - return 1; - if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM) ? - (data & ~vmx_get_perf_capabilities()) : data) - return 1; - vcpu->arch.perf_capabilities = data; - return 0; default: if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index e0a182cb3cdd..799db084a336 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -27,7 +27,7 @@ #define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE #endif - .text +.section .noinstr.text, "ax" /** * vmx_vmenter - VM-Enter the current loaded VMCS @@ -234,6 +234,9 @@ SYM_FUNC_START(__vmx_vcpu_run) jmp 1b SYM_FUNC_END(__vmx_vcpu_run) + +.section .text, "ax" + /** * vmread_error_trampoline - Trampoline from inline asm to vmread_error() * @field: VMCS field encoding that failed diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 559634b59d2a..46ba2e03a892 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -781,7 +781,7 @@ void update_exception_bitmap(struct kvm_vcpu *vcpu) eb |= 1u << BP_VECTOR; if (to_vmx(vcpu)->rmode.vm86_active) eb = ~0; - if (enable_ept) + if (!vmx_need_pf_intercept(vcpu)) eb &= ~(1u << PF_VECTOR); /* When we are running a nested L2 guest and L1 specified for it a @@ -1816,7 +1816,7 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr) msr->data = vmx_get_perf_capabilities(); return 0; default: - return 1; + return KVM_MSR_RET_INVALID; } } @@ -2063,7 +2063,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) return 1; - if (data & ~kvm_spec_ctrl_valid_bits(vcpu)) + if (kvm_spec_ctrl_test_value(data)) return 1; vmx->spec_ctrl = data; @@ -2934,14 +2934,16 @@ static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu) static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu) { - u64 root_hpa = vcpu->arch.mmu->root_hpa; + struct kvm_mmu *mmu = vcpu->arch.mmu; + u64 root_hpa = mmu->root_hpa; /* No flush required if the current context is invalid. */ if (!VALID_PAGE(root_hpa)) return; if (enable_ept) - ept_sync_context(construct_eptp(vcpu, root_hpa)); + ept_sync_context(construct_eptp(vcpu, root_hpa, + mmu->shadow_root_level)); else if (!is_guest_mode(vcpu)) vpid_sync_context(to_vmx(vcpu)->vpid); else @@ -3064,26 +3066,19 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vmx->emulation_required = emulation_required(vcpu); } -static int vmx_get_tdp_level(struct kvm_vcpu *vcpu) +static int vmx_get_max_tdp_level(void) { - if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48)) + if (cpu_has_vmx_ept_5levels()) return 5; return 4; } -static int get_ept_level(struct kvm_vcpu *vcpu) -{ - if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu))) - return vmx_eptp_page_walk_level(nested_ept_get_eptp(vcpu)); - - return vmx_get_tdp_level(vcpu); -} - -u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) +u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa, + int root_level) { u64 eptp = VMX_EPTP_MT_WB; - eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; + eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4; if (enable_ept_ad_bits && (!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu))) @@ -3093,7 +3088,8 @@ u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa) return eptp; } -void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd) +static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd, + int pgd_level) { struct kvm *kvm = vcpu->kvm; bool update_guest_cr3 = true; @@ -3101,7 +3097,7 @@ void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd) u64 eptp; if (enable_ept) { - eptp = construct_eptp(vcpu, pgd); + eptp = construct_eptp(vcpu, pgd, pgd_level); vmcs_write64(EPT_POINTER, eptp); if (kvm_x86_ops.tlb_remote_flush) { @@ -4356,6 +4352,16 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmx->pt_desc.guest.output_mask = 0x7F; vmcs_write64(GUEST_IA32_RTIT_CTL, 0); } + + /* + * If EPT is enabled, #PF is only trapped if MAXPHYADDR is mismatched + * between guest and host. In that case we only care about present + * faults. + */ + if (enable_ept) { + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, PFERR_PRESENT_MASK); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, PFERR_PRESENT_MASK); + } } static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -4782,18 +4788,25 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; - vcpu->run->internal.ndata = 3; + vcpu->run->internal.ndata = 4; vcpu->run->internal.data[0] = vect_info; vcpu->run->internal.data[1] = intr_info; vcpu->run->internal.data[2] = error_code; + vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu; return 0; } if (is_page_fault(intr_info)) { cr2 = vmx_get_exit_qual(vcpu); - /* EPT won't cause page fault directly */ - WARN_ON_ONCE(!vcpu->arch.apf.host_apf_flags && enable_ept); - return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); + if (enable_ept && !vcpu->arch.apf.host_apf_flags) { + /* + * EPT will cause page fault only if we need to + * detect illegal GPAs. + */ + kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code); + return 1; + } else + return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); } ex_no = intr_info & INTR_INFO_VECTOR_MASK; @@ -5309,6 +5322,18 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK; vcpu->arch.exit_qualification = exit_qualification; + + /* + * Check that the GPA doesn't exceed physical memory limits, as that is + * a guest page fault. We have to emulate the instruction here, because + * if the illegal address is that of a paging structure, then + * EPT_VIOLATION_ACC_WRITE bit is set. Alternatively, if supported we + * would also use advanced VM-exit information for EPT violations to + * reconstruct the page fault error code. + */ + if (unlikely(kvm_mmu_is_illegal_gpa(vcpu, gpa))) + return kvm_emulate_instruction(vcpu, 0); + return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0); } @@ -6005,6 +6030,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason = exit_reason; + vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; return 0; } @@ -6013,6 +6039,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; vcpu->run->fail_entry.hardware_entry_failure_reason = vmcs_read32(VM_INSTRUCTION_ERROR); + vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; return 0; } @@ -6039,6 +6066,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) vcpu->run->internal.data[3] = vmcs_read64(GUEST_PHYSICAL_ADDRESS); } + vcpu->run->internal.data[vcpu->run->internal.ndata++] = + vcpu->arch.last_vmentry_cpu; return 0; } @@ -6094,8 +6123,9 @@ unexpected_vmexit: vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; - vcpu->run->internal.ndata = 1; + vcpu->run->internal.ndata = 2; vcpu->run->internal.data[0] = exit_reason; + vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; return 0; } @@ -6109,7 +6139,7 @@ unexpected_vmexit: * information but as all relevant affected CPUs have 32KiB L1D cache size * there is no point in doing so. */ -static void vmx_l1d_flush(struct kvm_vcpu *vcpu) +static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu) { int size = PAGE_SIZE << L1D_CACHE_ORDER; @@ -6142,7 +6172,7 @@ static void vmx_l1d_flush(struct kvm_vcpu *vcpu) vcpu->stat.l1d_flush++; if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) { - wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); + native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); return; } @@ -6628,7 +6658,7 @@ static void vmx_update_hv_timer(struct kvm_vcpu *vcpu) } } -void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) +void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) { if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) { vmx->loaded_vmcs->host_state.rsp = host_rsp; @@ -6650,6 +6680,63 @@ static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched); +static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, + struct vcpu_vmx *vmx) +{ + /* + * VMENTER enables interrupts (host state), but the kernel state is + * interrupts disabled when this is invoked. Also tell RCU about + * it. This is the same logic as for exit_to_user_mode(). + * + * This ensures that e.g. latency analysis on the host observes + * guest mode as interrupt enabled. + * + * guest_enter_irqoff() informs context tracking about the + * transition to guest mode and if enabled adjusts RCU state + * accordingly. + */ + instrumentation_begin(); + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(CALLER_ADDR0); + instrumentation_end(); + + guest_enter_irqoff(); + lockdep_hardirqs_on(CALLER_ADDR0); + + /* L1D Flush includes CPU buffer clear to mitigate MDS */ + if (static_branch_unlikely(&vmx_l1d_should_flush)) + vmx_l1d_flush(vcpu); + else if (static_branch_unlikely(&mds_user_clear)) + mds_clear_cpu_buffers(); + + if (vcpu->arch.cr2 != native_read_cr2()) + native_write_cr2(vcpu->arch.cr2); + + vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, + vmx->loaded_vmcs->launched); + + vcpu->arch.cr2 = native_read_cr2(); + + /* + * VMEXIT disables interrupts (host state), but tracing and lockdep + * have them in state 'on' as recorded before entering guest mode. + * Same as enter_from_user_mode(). + * + * guest_exit_irqoff() restores host context and reinstates RCU if + * enabled and required. + * + * This needs to be done before the below as native_read_msr() + * contains a tracepoint and x86_spec_ctrl_restore_host() calls + * into world and some more. + */ + lockdep_hardirqs_off(CALLER_ADDR0); + guest_exit_irqoff(); + + instrumentation_begin(); + trace_hardirqs_off_finish(); + instrumentation_end(); +} + static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) { fastpath_t exit_fastpath; @@ -6724,19 +6811,8 @@ reenter_guest: */ x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0); - /* L1D Flush includes CPU buffer clear to mitigate MDS */ - if (static_branch_unlikely(&vmx_l1d_should_flush)) - vmx_l1d_flush(vcpu); - else if (static_branch_unlikely(&mds_user_clear)) - mds_clear_cpu_buffers(); - - if (vcpu->arch.cr2 != read_cr2()) - write_cr2(vcpu->arch.cr2); - - vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs, - vmx->loaded_vmcs->launched); - - vcpu->arch.cr2 = read_cr2(); + /* The actual VMENTER/EXIT is in the .noinstr.text section. */ + vmx_vcpu_enter_exit(vcpu, vmx); /* * We do not use IBRS in the kernel. If this vCPU has used the @@ -7229,7 +7305,7 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4)); } -static void vmx_cpuid_update(struct kvm_vcpu *vcpu) +static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -7478,42 +7554,6 @@ static void vmx_flush_log_dirty(struct kvm *kvm) kvm_flush_pml_buffers(kvm); } -static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa) -{ - struct vmcs12 *vmcs12; - struct vcpu_vmx *vmx = to_vmx(vcpu); - gpa_t dst; - - if (is_guest_mode(vcpu)) { - WARN_ON_ONCE(vmx->nested.pml_full); - - /* - * Check if PML is enabled for the nested guest. - * Whether eptp bit 6 is set is already checked - * as part of A/D emulation. - */ - vmcs12 = get_vmcs12(vcpu); - if (!nested_cpu_has_pml(vmcs12)) - return 0; - - if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) { - vmx->nested.pml_full = true; - return 1; - } - - gpa &= ~0xFFFull; - dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index; - - if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa, - offset_in_page(dst), sizeof(gpa))) - return 0; - - vmcs12->guest_pml_index--; - } - - return 0; -} - static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *memslot, gfn_t offset, unsigned long mask) @@ -7858,7 +7898,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .vcpu_load = vmx_vcpu_load, .vcpu_put = vmx_vcpu_put, - .update_bp_intercept = update_exception_bitmap, + .update_exception_bitmap = update_exception_bitmap, .get_msr_feature = vmx_get_msr_feature, .get_msr = vmx_get_msr, .set_msr = vmx_set_msr, @@ -7918,12 +7958,11 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .set_tss_addr = vmx_set_tss_addr, .set_identity_map_addr = vmx_set_identity_map_addr, - .get_tdp_level = vmx_get_tdp_level, .get_mt_mask = vmx_get_mt_mask, .get_exit_info = vmx_get_exit_info, - .cpuid_update = vmx_cpuid_update, + .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid, .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, @@ -7942,7 +7981,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .slot_disable_log_dirty = vmx_slot_disable_log_dirty, .flush_log_dirty = vmx_flush_log_dirty, .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked, - .write_log_dirty = vmx_write_pml_buffer, .pre_block = vmx_pre_block, .post_block = vmx_post_block, @@ -8070,7 +8108,7 @@ static __init int hardware_setup(void) ept_lpage_level = PG_LEVEL_2M; else ept_lpage_level = PG_LEVEL_4K; - kvm_configure_mmu(enable_ept, ept_lpage_level); + kvm_configure_mmu(enable_ept, vmx_get_max_tdp_level(), ept_lpage_level); /* * Only enable PML when hardware supports PML feature, and both EPT @@ -8265,6 +8303,13 @@ static int __init vmx_init(void) #endif vmx_check_vmcs12_offsets(); + /* + * Intel processors don't have problems with + * GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable + * it for VMX by default + */ + allow_smaller_maxphyaddr = true; + return 0; } module_init(vmx_init); diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 639798e4a6ca..26175a4759fa 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -11,6 +11,7 @@ #include "kvm_cache_regs.h" #include "ops.h" #include "vmcs.h" +#include "cpuid.h" extern const u32 vmx_msr_index[]; @@ -337,11 +338,11 @@ void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer); void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); void set_cr4_guest_host_mask(struct vcpu_vmx *vmx); -void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long cr3); void ept_save_pdptrs(struct kvm_vcpu *vcpu); void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); -u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); +u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa, + int root_level); void update_exception_bitmap(struct kvm_vcpu *vcpu); void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); bool vmx_nmi_blocked(struct kvm_vcpu *vcpu); @@ -536,8 +537,6 @@ static inline struct vmcs *alloc_vmcs(bool shadow) GFP_KERNEL_ACCOUNT); } -u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); - static inline void decache_tsc_multiplier(struct vcpu_vmx *vmx) { vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio; @@ -550,6 +549,11 @@ static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx) SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE; } +static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu) +{ + return !enable_ept || cpuid_maxphyaddr(vcpu) < boot_cpu_data.x86_phys_bits; +} + void dump_vmcs(void); #endif /* __KVM_X86_VMX_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 21d5e7a7ffd0..12ea77f99ff3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -188,6 +188,9 @@ static struct kvm_shared_msrs __percpu *shared_msrs; u64 __read_mostly host_efer; EXPORT_SYMBOL_GPL(host_efer); +bool __read_mostly allow_smaller_maxphyaddr; +EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr); + static u64 __read_mostly host_xss; u64 __read_mostly supported_xss; EXPORT_SYMBOL_GPL(supported_xss); @@ -244,6 +247,29 @@ static struct kmem_cache *x86_fpu_cache; static struct kmem_cache *x86_emulator_cache; +/* + * When called, it means the previous get/set msr reached an invalid msr. + * Return 0 if we want to ignore/silent this failed msr access, or 1 if we want + * to fail the caller. + */ +static int kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr, + u64 data, bool write) +{ + const char *op = write ? "wrmsr" : "rdmsr"; + + if (ignore_msrs) { + if (report_ignored_msrs) + vcpu_unimpl(vcpu, "ignored %s: 0x%x data 0x%llx\n", + op, msr, data); + /* Mask the error */ + return 0; + } else { + vcpu_debug_ratelimited(vcpu, "unhandled %s: 0x%x data 0x%llx\n", + op, msr, data); + return 1; + } +} + static struct kmem_cache *kvm_alloc_emulator_cache(void) { unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src); @@ -380,7 +406,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } EXPORT_SYMBOL_GPL(kvm_set_apic_base); -asmlinkage __visible void kvm_spurious_fault(void) +asmlinkage __visible noinstr void kvm_spurious_fault(void) { /* Fault while not rebooting. We want the trace. */ BUG_ON(!kvm_rebooting); @@ -776,6 +802,7 @@ EXPORT_SYMBOL_GPL(pdptrs_changed); int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { unsigned long old_cr0 = kvm_read_cr0(vcpu); + unsigned long pdptr_bits = X86_CR0_CD | X86_CR0_NW | X86_CR0_PG; unsigned long update_bits = X86_CR0_PG | X86_CR0_WP; cr0 |= X86_CR0_ET; @@ -793,9 +820,9 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) return 1; - if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { + if (cr0 & X86_CR0_PG) { #ifdef CONFIG_X86_64 - if ((vcpu->arch.efer & EFER_LME)) { + if (!is_paging(vcpu) && (vcpu->arch.efer & EFER_LME)) { int cs_db, cs_l; if (!is_pae(vcpu)) @@ -805,8 +832,8 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) return 1; } else #endif - if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu, - kvm_read_cr3(vcpu))) + if (is_pae(vcpu) && ((cr0 ^ old_cr0) & pdptr_bits) && + !load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu))) return 1; } @@ -917,7 +944,7 @@ static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) vcpu->arch.xcr0 = xcr0; if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND) - kvm_update_cpuid(vcpu); + kvm_update_cpuid_runtime(vcpu); return 0; } @@ -932,37 +959,17 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) } EXPORT_SYMBOL_GPL(kvm_set_xcr); -#define __cr4_reserved_bits(__cpu_has, __c) \ -({ \ - u64 __reserved_bits = CR4_RESERVED_BITS; \ - \ - if (!__cpu_has(__c, X86_FEATURE_XSAVE)) \ - __reserved_bits |= X86_CR4_OSXSAVE; \ - if (!__cpu_has(__c, X86_FEATURE_SMEP)) \ - __reserved_bits |= X86_CR4_SMEP; \ - if (!__cpu_has(__c, X86_FEATURE_SMAP)) \ - __reserved_bits |= X86_CR4_SMAP; \ - if (!__cpu_has(__c, X86_FEATURE_FSGSBASE)) \ - __reserved_bits |= X86_CR4_FSGSBASE; \ - if (!__cpu_has(__c, X86_FEATURE_PKU)) \ - __reserved_bits |= X86_CR4_PKE; \ - if (!__cpu_has(__c, X86_FEATURE_LA57)) \ - __reserved_bits |= X86_CR4_LA57; \ - if (!__cpu_has(__c, X86_FEATURE_UMIP)) \ - __reserved_bits |= X86_CR4_UMIP; \ - __reserved_bits; \ -}) - -static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { if (cr4 & cr4_reserved_bits) return -EINVAL; - if (cr4 & __cr4_reserved_bits(guest_cpuid_has, vcpu)) + if (cr4 & vcpu->arch.cr4_guest_rsvd_bits) return -EINVAL; return 0; } +EXPORT_SYMBOL_GPL(kvm_valid_cr4); int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { @@ -1001,7 +1008,7 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) kvm_mmu_reset_context(vcpu); if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) - kvm_update_cpuid(vcpu); + kvm_update_cpuid_runtime(vcpu); return 0; } @@ -1111,7 +1118,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) case 4: /* fall through */ case 6: - if (val & 0xffffffff00000000ULL) + if (!kvm_dr6_valid(val)) return -1; /* #GP */ vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu); break; @@ -1390,8 +1397,7 @@ static int kvm_get_msr_feature(struct kvm_msr_entry *msr) rdmsrl_safe(msr->index, &msr->data); break; default: - if (kvm_x86_ops.get_msr_feature(msr)) - return 1; + return kvm_x86_ops.get_msr_feature(msr); } return 0; } @@ -1403,6 +1409,13 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) msr.index = index; r = kvm_get_msr_feature(&msr); + + if (r == KVM_MSR_RET_INVALID) { + /* Unconditionally clear the output for simplicity */ + *data = 0; + r = kvm_msr_ignored_check(vcpu, index, 0, false); + } + if (r) return r; @@ -1517,6 +1530,17 @@ static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, return kvm_x86_ops.set_msr(vcpu, &msr); } +static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, + u32 index, u64 data, bool host_initiated) +{ + int ret = __kvm_set_msr(vcpu, index, data, host_initiated); + + if (ret == KVM_MSR_RET_INVALID) + ret = kvm_msr_ignored_check(vcpu, index, data, true); + + return ret; +} + /* * Read the MSR specified by @index into @data. Select MSR specific fault * checks are bypassed if @host_initiated is %true. @@ -1538,15 +1562,29 @@ int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, return ret; } +static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, + u32 index, u64 *data, bool host_initiated) +{ + int ret = __kvm_get_msr(vcpu, index, data, host_initiated); + + if (ret == KVM_MSR_RET_INVALID) { + /* Unconditionally clear *data for simplicity */ + *data = 0; + ret = kvm_msr_ignored_check(vcpu, index, 0, false); + } + + return ret; +} + int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) { - return __kvm_get_msr(vcpu, index, data, false); + return kvm_get_msr_ignored_check(vcpu, index, data, false); } EXPORT_SYMBOL_GPL(kvm_get_msr); int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) { - return __kvm_set_msr(vcpu, index, data, false); + return kvm_set_msr_ignored_check(vcpu, index, data, false); } EXPORT_SYMBOL_GPL(kvm_set_msr); @@ -1666,12 +1704,12 @@ EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff); */ static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) { - return __kvm_get_msr(vcpu, index, data, true); + return kvm_get_msr_ignored_check(vcpu, index, data, true); } static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) { - return __kvm_set_msr(vcpu, index, *data, true); + return kvm_set_msr_ignored_check(vcpu, index, *data, true); } #ifdef CONFIG_X86_64 @@ -2823,6 +2861,20 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.arch_capabilities = data; break; + case MSR_IA32_PERF_CAPABILITIES: { + struct kvm_msr_entry msr_ent = {.index = msr, .data = 0}; + + if (!msr_info->host_initiated) + return 1; + if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM) && kvm_get_msr_feature(&msr_ent)) + return 1; + if (data & ~msr_ent.data) + return 1; + + vcpu->arch.perf_capabilities = data; + + return 0; + } case MSR_EFER: return set_efer(vcpu, msr_info); case MSR_K7_HWCR: @@ -2882,7 +2934,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3)) return 1; vcpu->arch.ia32_misc_enable_msr = data; - kvm_update_cpuid(vcpu); + kvm_update_cpuid_runtime(vcpu); } else { vcpu->arch.ia32_misc_enable_msr = data; } @@ -3067,17 +3119,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return xen_hvm_config(vcpu, data); if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); - if (!ignore_msrs) { - vcpu_debug_ratelimited(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n", - msr, data); - return 1; - } else { - if (report_ignored_msrs) - vcpu_unimpl(vcpu, - "ignored wrmsr: 0x%x data 0x%llx\n", - msr, data); - break; - } + return KVM_MSR_RET_INVALID; } return 0; } @@ -3173,6 +3215,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; msr_info->data = vcpu->arch.arch_capabilities; break; + case MSR_IA32_PERF_CAPABILITIES: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) + return 1; + msr_info->data = vcpu->arch.perf_capabilities; + break; case MSR_IA32_POWER_CTL: msr_info->data = vcpu->arch.msr_ia32_power_ctl; break; @@ -3332,17 +3380,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) default: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info); - if (!ignore_msrs) { - vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n", - msr_info->index); - return 1; - } else { - if (report_ignored_msrs) - vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", - msr_info->index); - msr_info->data = 0; - } - break; + return KVM_MSR_RET_INVALID; } return 0; } @@ -3477,6 +3515,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_MSR_PLATFORM_INFO: case KVM_CAP_EXCEPTION_PAYLOAD: case KVM_CAP_SET_GUEST_DEBUG: + case KVM_CAP_LAST_CPU: r = 1; break; case KVM_CAP_SYNC_REGS: @@ -3539,6 +3578,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: r = kvm_x86_ops.nested_ops->enable_evmcs != NULL; break; + case KVM_CAP_SMALLER_MAXPHYADDR: + r = (int) allow_smaller_maxphyaddr; + break; default: break; } @@ -8155,7 +8197,7 @@ static void enter_smm(struct kvm_vcpu *vcpu) kvm_x86_ops.set_efer(vcpu, 0); #endif - kvm_update_cpuid(vcpu); + kvm_update_cpuid_runtime(vcpu); kvm_mmu_reset_context(vcpu); } @@ -8507,7 +8549,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } trace_kvm_entry(vcpu->vcpu_id); - guest_enter_irqoff(); fpregs_assert_state_consistent(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) @@ -8549,6 +8590,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (hw_breakpoint_active()) hw_breakpoint_restore(); + vcpu->arch.last_vmentry_cpu = vcpu->cpu; vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); vcpu->mode = OUTSIDE_GUEST_MODE; @@ -8569,7 +8611,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) local_irq_disable(); kvm_after_interrupt(vcpu); - guest_exit_irqoff(); if (lapic_in_kernel(vcpu)) { s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta; if (delta != S64_MIN) { @@ -9174,7 +9215,7 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) (X86_CR4_OSXSAVE | X86_CR4_PKE)); kvm_x86_ops.set_cr4(vcpu, sregs->cr4); if (cpuid_update_needed) - kvm_update_cpuid(vcpu); + kvm_update_cpuid_runtime(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); if (is_pae_paging(vcpu)) { @@ -9278,7 +9319,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, */ kvm_set_rflags(vcpu, rflags); - kvm_x86_ops.update_bp_intercept(vcpu); + kvm_x86_ops.update_exception_bitmap(vcpu); r = 0; @@ -9476,7 +9517,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) fx_init(vcpu); vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); - vcpu->arch.tdp_level = kvm_x86_ops.get_tdp_level(vcpu); vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; @@ -10673,28 +10713,53 @@ bool kvm_arch_no_poll(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_arch_no_poll); -u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu) + +int kvm_spec_ctrl_test_value(u64 value) { - uint64_t bits = SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD; + /* + * test that setting IA32_SPEC_CTRL to given value + * is allowed by the host processor + */ - /* The STIBP bit doesn't fault even if it's not advertised */ - if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS)) - bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP); - if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL) && - !boot_cpu_has(X86_FEATURE_AMD_IBRS)) - bits &= ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP); + u64 saved_value; + unsigned long flags; + int ret = 0; - if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL_SSBD) && - !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) - bits &= ~SPEC_CTRL_SSBD; - if (!boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) && - !boot_cpu_has(X86_FEATURE_AMD_SSBD)) - bits &= ~SPEC_CTRL_SSBD; + local_irq_save(flags); - return bits; + if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &saved_value)) + ret = 1; + else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, value)) + ret = 1; + else + wrmsrl(MSR_IA32_SPEC_CTRL, saved_value); + + local_irq_restore(flags); + + return ret; } -EXPORT_SYMBOL_GPL(kvm_spec_ctrl_valid_bits); +EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value); + +void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code) +{ + struct x86_exception fault; + + if (!(error_code & PFERR_PRESENT_MASK) || + vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, error_code, &fault) != UNMAPPED_GVA) { + /* + * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page + * tables probably do not match the TLB. Just proceed + * with the error code that the processor gave. + */ + fault.vector = PF_VECTOR; + fault.error_code_valid = true; + fault.error_code = error_code; + fault.nested_page_fault = false; + fault.address = gva; + } + vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault); +} +EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 6eb62e97e59f..995ab696dcf0 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -272,6 +272,7 @@ int kvm_mtrr_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int page_num); bool kvm_vector_hashing_enabled(void); +void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code); int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len); fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); @@ -360,10 +361,41 @@ static inline bool kvm_dr7_valid(u64 data) /* Bits [63:32] are reserved */ return !(data >> 32); } +static inline bool kvm_dr6_valid(u64 data) +{ + /* Bits [63:32] are reserved */ + return !(data >> 32); +} void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu); void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu); -u64 kvm_spec_ctrl_valid_bits(struct kvm_vcpu *vcpu); +int kvm_spec_ctrl_test_value(u64 value); +int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu); +#define KVM_MSR_RET_INVALID 2 + +#define __cr4_reserved_bits(__cpu_has, __c) \ +({ \ + u64 __reserved_bits = CR4_RESERVED_BITS; \ + \ + if (!__cpu_has(__c, X86_FEATURE_XSAVE)) \ + __reserved_bits |= X86_CR4_OSXSAVE; \ + if (!__cpu_has(__c, X86_FEATURE_SMEP)) \ + __reserved_bits |= X86_CR4_SMEP; \ + if (!__cpu_has(__c, X86_FEATURE_SMAP)) \ + __reserved_bits |= X86_CR4_SMAP; \ + if (!__cpu_has(__c, X86_FEATURE_FSGSBASE)) \ + __reserved_bits |= X86_CR4_FSGSBASE; \ + if (!__cpu_has(__c, X86_FEATURE_PKU)) \ + __reserved_bits |= X86_CR4_PKE; \ + if (!__cpu_has(__c, X86_FEATURE_LA57)) \ + __reserved_bits |= X86_CR4_LA57; \ + if (!__cpu_has(__c, X86_FEATURE_UMIP)) \ + __reserved_bits |= X86_CR4_UMIP; \ + if (!__cpu_has(__c, X86_FEATURE_VMX)) \ + __reserved_bits |= X86_CR4_VMXE; \ + __reserved_bits; \ +}) + #endif diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 6deb49094c60..799f4eba0a62 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -114,9 +114,8 @@ PV_CALLEE_SAVE_REGS_THUNK(xen_vcpu_stolen); */ void __init xen_init_spinlocks(void) { - /* Don't need to use pvqspinlock code if there is only 1 vCPU. */ - if (num_possible_cpus() == 1) + if (num_possible_cpus() == 1 || nopvspin) xen_pvspin = false; if (!xen_pvspin) { @@ -137,6 +136,7 @@ void __init xen_init_spinlocks(void) static __init int xen_parse_nopvspin(char *arg) { + pr_notice("\"xen_nopvspin\" is deprecated, please use \"nopvspin\" instead\n"); xen_pvspin = false; return 0; } diff --git a/include/asm-generic/kvm_types.h b/include/asm-generic/kvm_types.h new file mode 100644 index 000000000000..2a82daf110f1 --- /dev/null +++ b/include/asm-generic/kvm_types.h @@ -0,0 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_GENERIC_KVM_TYPES_H +#define _ASM_GENERIC_KVM_TYPES_H + +#endif diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ac83e9c1d82c..a23076765b4c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -211,8 +211,8 @@ struct kvm_async_pf { void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu); void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu); -int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - unsigned long hva, struct kvm_arch_async_pf *arch); +bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + unsigned long hva, struct kvm_arch_async_pf *arch); int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); #endif @@ -774,6 +774,7 @@ int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn); +bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn); void mark_page_dirty(struct kvm *kvm, gfn_t gfn); @@ -816,6 +817,13 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu, bool usermode_vcpu_not_eligible); void kvm_flush_remote_tlbs(struct kvm *kvm); void kvm_reload_remote_mmus(struct kvm *kvm); +#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE +int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min); +int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc); +void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc); +void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); +#endif + bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, struct kvm_vcpu *except, unsigned long *vcpu_bitmap, cpumask_var_t tmp); diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index 68e84cf42a3f..a7580f69dda0 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -20,6 +20,8 @@ enum kvm_mr_change; #include +#include + /* * Address types: * @@ -58,4 +60,21 @@ struct gfn_to_pfn_cache { bool dirty; }; +#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE +/* + * Memory caches are used to preallocate memory ahead of various MMU flows, + * e.g. page fault handlers. Gracefully handling allocation failures deep in + * MMU flows is problematic, as is triggering reclaim, I/O, etc... while + * holding MMU locks. Note, these caches act more like prefetch buffers than + * classical caches, i.e. objects are not returned to the cache on being freed. + */ +struct kvm_mmu_memory_cache { + int nobjs; + gfp_t gfp_zero; + struct kmem_cache *kmem_cache; + void *objects[KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE]; +}; +#endif + + #endif /* __KVM_TYPES_H__ */ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 4fdf30316582..f6d86033c4fa 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -289,6 +289,7 @@ struct kvm_run { /* KVM_EXIT_FAIL_ENTRY */ struct { __u64 hardware_entry_failure_reason; + __u32 cpu; } fail_entry; /* KVM_EXIT_EXCEPTION */ struct { @@ -1031,6 +1032,9 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_PPC_SECURE_GUEST 181 #define KVM_CAP_HALT_POLL 182 #define KVM_CAP_ASYNC_PF_INT 183 +#define KVM_CAP_LAST_CPU 184 +#define KVM_CAP_SMALLER_MAXPHYADDR 185 +#define KVM_CAP_S390_DIAG318 186 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index b9515fcc9b29..cbff6ba53d56 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -581,4 +581,11 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath); #include "qspinlock_paravirt.h" #include "qspinlock.c" +bool nopvspin __initdata; +static __init int parse_nopvspin(char *arg) +{ + nopvspin = true; + return 0; +} +early_param("nopvspin", parse_nopvspin); #endif diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 45799606bb3e..390f758d5a27 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -156,17 +156,21 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu) } } -int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - unsigned long hva, struct kvm_arch_async_pf *arch) +/* + * Try to schedule a job to handle page fault asynchronously. Returns 'true' on + * success, 'false' on failure (page fault has to be handled synchronously). + */ +bool kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, + unsigned long hva, struct kvm_arch_async_pf *arch) { struct kvm_async_pf *work; if (vcpu->async_pf.queued >= ASYNC_PF_PER_VCPU) - return 0; + return false; /* Arch specific code should not do async PF in this case */ if (unlikely(kvm_is_error_hva(hva))) - return 0; + return false; /* * do alloc nowait since if we are going to sleep anyway we @@ -174,7 +178,7 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, */ work = kmem_cache_zalloc(async_pf_cache, GFP_NOWAIT | __GFP_NOWARN); if (!work) - return 0; + return false; work->wakeup_all = false; work->vcpu = vcpu; @@ -193,7 +197,7 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, schedule_work(&work->work); - return 1; + return true; } int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0a68c9d3d3ab..2c2c0254c2d8 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -340,6 +340,61 @@ void kvm_reload_remote_mmus(struct kvm *kvm) kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); } +#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE +static inline void *mmu_memory_cache_alloc_obj(struct kvm_mmu_memory_cache *mc, + gfp_t gfp_flags) +{ + gfp_flags |= mc->gfp_zero; + + if (mc->kmem_cache) + return kmem_cache_alloc(mc->kmem_cache, gfp_flags); + else + return (void *)__get_free_page(gfp_flags); +} + +int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache *mc, int min) +{ + void *obj; + + if (mc->nobjs >= min) + return 0; + while (mc->nobjs < ARRAY_SIZE(mc->objects)) { + obj = mmu_memory_cache_alloc_obj(mc, GFP_KERNEL_ACCOUNT); + if (!obj) + return mc->nobjs >= min ? 0 : -ENOMEM; + mc->objects[mc->nobjs++] = obj; + } + return 0; +} + +int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc) +{ + return mc->nobjs; +} + +void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) +{ + while (mc->nobjs) { + if (mc->kmem_cache) + kmem_cache_free(mc->kmem_cache, mc->objects[--mc->nobjs]); + else + free_page((unsigned long)mc->objects[--mc->nobjs]); + } +} + +void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc) +{ + void *p; + + if (WARN_ON(!mc->nobjs)) + p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC | __GFP_ACCOUNT); + else + p = mc->objects[--mc->nobjs]; + BUG_ON(!p); + return p; +} +#endif + static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) { mutex_init(&vcpu->mutex); @@ -1626,6 +1681,14 @@ bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(kvm_is_visible_gfn); +bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + + return kvm_is_visible_memslot(memslot); +} +EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn); + unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn) { struct vm_area_struct *vma;