Second round of changes for KVM for arm/arm64 for v3.19; fixes reboot

problems, clarifies VCPU init, and fixes a regression concerning the VGIC init flow. -----BEGIN PGP SIGNATURE----- Version: GnuPG v1 iQEcBAABAgAGBQJUjsVhAAoJEEtpOizt6ddy5rIH/1V/YVwhprC55YqdHelU9Qu2 Muzsx+7F71NxC7xgMGFqPD1YrPR+hxvoPhy+ADOBlvcqlolrkDnV9I+8e3geaYNc nZ/yEnoGTtbAggiS1smx7usBv34Z88Sd5txNjmj1cmHBy+VOWlyidWMkGBTsfBRe mVc61BDUfyC47udgRHXhwS80sbHLJHElmADisFOVmQNBYwwiHiTdx0hMBMnHcC3Y /3T0tKxHdeTISnmA+J+n7TcChtTIM4xqC6kwf3rw3b7XX8gdtTKylDHX2GLAg646 RdebAG2twmGpIc6SxXZbo38f3oY9OFo1Le5xZGa6iUjD56VDw/e4wg4iA2juo0Y= =J2Ut -----END PGP SIGNATURE----- Merge tag 'kvm-arm-for-3.19-take2' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into HEAD Second round of changes for KVM for arm/arm64 for v3.19; fixes reboot problems, clarifies VCPU init, and fixes a regression concerning the VGIC init flow. Conflicts: arch/ia64/kvm/kvm-ia64.c [deleted in HEAD and modified in kvmarm]
2014-12-15 13:06:40 +01:00 · 2014-12-15 13:06:40 +01:00 · 333bce5aac
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@ -2478,9 +2478,15 @@ return ENOEXEC for that vcpu.
 Note that because some registers reflect machine topology, all vcpus
 should be created before this ioctl is invoked.

+Userspace can call this function multiple times for a given vcpu, including
+after the vcpu has been run. This will reset the vcpu to its initial
+state. All calls to this function after the initial call must use the same
+target and same set of feature flags, otherwise EINVAL will be returned.
+
 Possible features:
 	- KVM_ARM_VCPU_POWER_OFF: Starts the CPU in a power-off state.
-	  Depends on KVM_CAP_ARM_PSCI.
+	  Depends on KVM_CAP_ARM_PSCI.  If not set, the CPU will be powered on
+	  and execute guest code when KVM_RUN is called.
 	- KVM_ARM_VCPU_EL1_32BIT: Starts the CPU in a 32bit mode.
 	  Depends on KVM_CAP_ARM_EL1_32BIT (arm64 only).
 	- KVM_ARM_VCPU_PSCI_0_2: Emulate PSCI v0.2 for the CPU.
@ -2976,6 +2982,15 @@ HVC instruction based PSCI call from the vcpu. The 'type' field describes
 the system-level event type. The 'flags' field describes architecture
 specific flags for the system-level event.

+Valid values for 'type' are:
+  KVM_SYSTEM_EVENT_SHUTDOWN -- the guest has requested a shutdown of the
+   VM. Userspace is not obliged to honour this, and if it does honour
+   this does not need to destroy the VM synchronously (ie it may call
+   KVM_RUN again before shutdown finally occurs).
+  KVM_SYSTEM_EVENT_RESET -- the guest has requested a reset of the VM.
+   As with SHUTDOWN, userspace can choose to ignore the request, or
+   to schedule the reset to occur in the future and may call KVM_RUN again.
+
 		/* Fix the size of the union. */
 		char padding[256];
 	};
--- a/arch/arm/include/asm/kvm_emulate.h
+++ b/arch/arm/include/asm/kvm_emulate.h
@ -33,6 +33,11 @@ void kvm_inject_undefined(struct kvm_vcpu *vcpu);
 void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr);
 void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr);

+static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.hcr = HCR_GUEST_MASK;
+}
+
 static inline bool vcpu_mode_is_32bit(struct kvm_vcpu *vcpu)
 {
 	return 1;
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@ -150,8 +150,6 @@ struct kvm_vcpu_stat {
 	u32 halt_wakeup;
 };

-int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
-			const struct kvm_vcpu_init *init);
 int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init);
 unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu);
 int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices);
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@ -52,6 +52,7 @@ int create_hyp_io_mappings(void *from, void *to, phys_addr_t);
 void free_boot_hyp_pgd(void);
 void free_hyp_pgds(void);

+void stage2_unmap_vm(struct kvm *kvm);
 int kvm_alloc_stage2_pgd(struct kvm *kvm);
 void kvm_free_stage2_pgd(struct kvm *kvm);
 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
@ -161,9 +162,10 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
 }

 static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva,
-					     unsigned long size)
+					     unsigned long size,
+					     bool ipa_uncached)
 {
-	if (!vcpu_has_cache_enabled(vcpu))
+	if (!vcpu_has_cache_enabled(vcpu) || ipa_uncached)
 		kvm_flush_dcache_to_poc((void *)hva, size);
 	
 	/*
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@ -213,6 +213,11 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
 	int err;
 	struct kvm_vcpu *vcpu;

+	if (irqchip_in_kernel(kvm) && vgic_initialized(kvm)) {
+		err = -EBUSY;
+		goto out;
+	}
+
 	vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
 	if (!vcpu) {
 		err = -ENOMEM;
@ -263,6 +268,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 {
 	/* Force users to call KVM_ARM_VCPU_INIT */
 	vcpu->arch.target = -1;
+	bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);

 	/* Set up the timer */
 	kvm_timer_vcpu_init(vcpu);
@ -419,6 +425,7 @@ static void update_vttbr(struct kvm *kvm)

 static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
 {
+	struct kvm *kvm = vcpu->kvm;
 	int ret;

 	if (likely(vcpu->arch.has_run_once))
@ -427,15 +434,23 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
 	vcpu->arch.has_run_once = true;

 	/*
-	 * Initialize the VGIC before running a vcpu the first time on
-	 * this VM.
+	 * Map the VGIC hardware resources before running a vcpu the first
+	 * time on this VM.
 	 */
-	if (unlikely(!vgic_initialized(vcpu->kvm))) {
-		ret = kvm_vgic_init(vcpu->kvm);
+	if (unlikely(!vgic_ready(kvm))) {
+		ret = kvm_vgic_map_resources(kvm);
 		if (ret)
 			return ret;
 	}

+	/*
+	 * Enable the arch timers only if we have an in-kernel VGIC
+	 * and it has been properly initialized, since we cannot handle
+	 * interrupts from the virtual timer with a userspace gic.
+	 */
+	if (irqchip_in_kernel(kvm) && vgic_initialized(kvm))
+		kvm_timer_enable(kvm);
+
 	return 0;
 }

@ -649,6 +664,48 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
 	return -EINVAL;
 }

+static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
+			       const struct kvm_vcpu_init *init)
+{
+	unsigned int i;
+	int phys_target = kvm_target_cpu();
+
+	if (init->target != phys_target)
+		return -EINVAL;
+
+	/*
+	 * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must
+	 * use the same target.
+	 */
+	if (vcpu->arch.target != -1 && vcpu->arch.target != init->target)
+		return -EINVAL;
+
+	/* -ENOENT for unknown features, -EINVAL for invalid combinations. */
+	for (i = 0; i < sizeof(init->features) * 8; i++) {
+		bool set = (init->features[i / 32] & (1 << (i % 32)));
+
+		if (set && i >= KVM_VCPU_MAX_FEATURES)
+			return -ENOENT;
+
+		/*
+		 * Secondary and subsequent calls to KVM_ARM_VCPU_INIT must
+		 * use the same feature set.
+		 */
+		if (vcpu->arch.target != -1 && i < KVM_VCPU_MAX_FEATURES &&
+		    test_bit(i, vcpu->arch.features) != set)
+			return -EINVAL;
+
+		if (set)
+			set_bit(i, vcpu->arch.features);
+	}
+
+	vcpu->arch.target = phys_target;
+
+	/* Now we know what it is, we can reset it. */
+	return kvm_reset_vcpu(vcpu);
+}
+
+
 static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
 					 struct kvm_vcpu_init *init)
 {
@ -658,11 +715,22 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
 	if (ret)
 		return ret;

+	/*
+	 * Ensure a rebooted VM will fault in RAM pages and detect if the
+	 * guest MMU is turned off and flush the caches as needed.
+	 */
+	if (vcpu->arch.has_run_once)
+		stage2_unmap_vm(vcpu->kvm);
+
+	vcpu_reset_hcr(vcpu);
+
 	/*
 	 * Handle the "start in power-off" case by marking the VCPU as paused.
 	 */
-	if (__test_and_clear_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features))
+	if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features))
 		vcpu->arch.pause = true;
+	else
+		vcpu->arch.pause = false;

 	return 0;
 }
--- a/arch/arm/kvm/guest.c
+++ b/arch/arm/kvm/guest.c
@ -38,7 +38,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {

 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
-	vcpu->arch.hcr = HCR_GUEST_MASK;
 	return 0;
 }

@ -274,31 +273,6 @@ int __attribute_const__ kvm_target_cpu(void)
 	}
 }

-int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
-			const struct kvm_vcpu_init *init)
-{
-	unsigned int i;
-
-	/* We can only cope with guest==host and only on A15/A7 (for now). */
-	if (init->target != kvm_target_cpu())
-		return -EINVAL;
-
-	vcpu->arch.target = init->target;
-	bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);
-
-	/* -ENOENT for unknown features, -EINVAL for invalid combinations. */
-	for (i = 0; i < sizeof(init->features) * 8; i++) {
-		if (test_bit(i, (void *)init->features)) {
-			if (i >= KVM_VCPU_MAX_FEATURES)
-				return -ENOENT;
-			set_bit(i, vcpu->arch.features);
-		}
-	}
-
-	/* Now we know what it is, we can reset it. */
-	return kvm_reset_vcpu(vcpu);
-}
-
 int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init)
 {
 	int target = kvm_target_cpu();
--- a/arch/arm/kvm/mmio.c
+++ b/arch/arm/kvm/mmio.c
@ -187,15 +187,18 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run,
 	}

 	rt = vcpu->arch.mmio_decode.rt;
-	data = vcpu_data_guest_to_host(vcpu, *vcpu_reg(vcpu, rt), mmio.len);

-	trace_kvm_mmio((mmio.is_write) ? KVM_TRACE_MMIO_WRITE :
-					 KVM_TRACE_MMIO_READ_UNSATISFIED,
-			mmio.len, fault_ipa,
-			(mmio.is_write) ? data : 0);
+	if (mmio.is_write) {
+		data = vcpu_data_guest_to_host(vcpu, *vcpu_reg(vcpu, rt),
+					       mmio.len);

-	if (mmio.is_write)
+		trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, mmio.len,
+			       fault_ipa, data);
 		mmio_write_buf(mmio.data, mmio.len, data);
+	} else {
+		trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, mmio.len,
+			       fault_ipa, 0);
+	}

 	if (vgic_handle_mmio(vcpu, run, &mmio))
 		return 1;
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@ -611,6 +611,71 @@ static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
 	unmap_range(kvm, kvm->arch.pgd, start, size);
 }

+static void stage2_unmap_memslot(struct kvm *kvm,
+				 struct kvm_memory_slot *memslot)
+{
+	hva_t hva = memslot->userspace_addr;
+	phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
+	phys_addr_t size = PAGE_SIZE * memslot->npages;
+	hva_t reg_end = hva + size;
+
+	/*
+	 * A memory region could potentially cover multiple VMAs, and any holes
+	 * between them, so iterate over all of them to find out if we should
+	 * unmap any of them.
+	 *
+	 *     +--------------------------------------------+
+	 * +---------------+----------------+   +----------------+
+	 * |   : VMA 1     |      VMA 2     |   |    VMA 3  :    |
+	 * +---------------+----------------+   +----------------+
+	 *     |               memory region                |
+	 *     +--------------------------------------------+
+	 */
+	do {
+		struct vm_area_struct *vma = find_vma(current->mm, hva);
+		hva_t vm_start, vm_end;
+
+		if (!vma || vma->vm_start >= reg_end)
+			break;
+
+		/*
+		 * Take the intersection of this VMA with the memory region
+		 */
+		vm_start = max(hva, vma->vm_start);
+		vm_end = min(reg_end, vma->vm_end);
+
+		if (!(vma->vm_flags & VM_PFNMAP)) {
+			gpa_t gpa = addr + (vm_start - memslot->userspace_addr);
+			unmap_stage2_range(kvm, gpa, vm_end - vm_start);
+		}
+		hva = vm_end;
+	} while (hva < reg_end);
+}
+
+/**
+ * stage2_unmap_vm - Unmap Stage-2 RAM mappings
+ * @kvm: The struct kvm pointer
+ *
+ * Go through the memregions and unmap any reguler RAM
+ * backing memory already mapped to the VM.
+ */
+void stage2_unmap_vm(struct kvm *kvm)
+{
+	struct kvm_memslots *slots;
+	struct kvm_memory_slot *memslot;
+	int idx;
+
+	idx = srcu_read_lock(&kvm->srcu);
+	spin_lock(&kvm->mmu_lock);
+
+	slots = kvm_memslots(kvm);
+	kvm_for_each_memslot(memslot, slots)
+		stage2_unmap_memslot(kvm, memslot);
+
+	spin_unlock(&kvm->mmu_lock);
+	srcu_read_unlock(&kvm->srcu, idx);
+}
+
 /**
 * kvm_free_stage2_pgd - free all stage-2 tables
 * @kvm:	The KVM struct pointer for the VM.
@ -834,6 +899,11 @@ static bool kvm_is_write_fault(struct kvm_vcpu *vcpu)
 	return kvm_vcpu_dabt_iswrite(vcpu);
 }

+static bool kvm_is_device_pfn(unsigned long pfn)
+{
+	return !pfn_valid(pfn);
+}
+
 static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			  struct kvm_memory_slot *memslot, unsigned long hva,
 			  unsigned long fault_status)
@ -847,6 +917,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	struct vm_area_struct *vma;
 	pfn_t pfn;
 	pgprot_t mem_type = PAGE_S2;
+	bool fault_ipa_uncached;

 	write_fault = kvm_is_write_fault(vcpu);
 	if (fault_status == FSC_PERM && !write_fault) {
@ -904,7 +975,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (is_error_pfn(pfn))
 		return -EFAULT;

-	if (kvm_is_mmio_pfn(pfn))
+	if (kvm_is_device_pfn(pfn))
 		mem_type = PAGE_S2_DEVICE;

 	spin_lock(&kvm->mmu_lock);
@ -913,6 +984,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	if (!hugetlb && !force_pte)
 		hugetlb = transparent_hugepage_adjust(&pfn, &fault_ipa);

+	fault_ipa_uncached = memslot->flags & KVM_MEMSLOT_INCOHERENT;
+
 	if (hugetlb) {
 		pmd_t new_pmd = pfn_pmd(pfn, mem_type);
 		new_pmd = pmd_mkhuge(new_pmd);
@ -920,7 +993,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			kvm_set_s2pmd_writable(&new_pmd);
 			kvm_set_pfn_dirty(pfn);
 		}
-		coherent_cache_guest_page(vcpu, hva & PMD_MASK, PMD_SIZE);
+		coherent_cache_guest_page(vcpu, hva & PMD_MASK, PMD_SIZE,
+					  fault_ipa_uncached);
 		ret = stage2_set_pmd_huge(kvm, memcache, fault_ipa, &new_pmd);
 	} else {
 		pte_t new_pte = pfn_pte(pfn, mem_type);
@ -928,7 +1002,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 			kvm_set_s2pte_writable(&new_pte);
 			kvm_set_pfn_dirty(pfn);
 		}
-		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE);
+		coherent_cache_guest_page(vcpu, hva, PAGE_SIZE,
+					  fault_ipa_uncached);
 		ret = stage2_set_pte(kvm, memcache, fault_ipa, &new_pte,
 			pgprot_val(mem_type) == pgprot_val(PAGE_S2_DEVICE));
 	}
@ -1288,11 +1363,12 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 		hva = vm_end;
 	} while (hva < reg_end);

-	if (ret) {
-		spin_lock(&kvm->mmu_lock);
+	spin_lock(&kvm->mmu_lock);
+	if (ret)
 		unmap_stage2_range(kvm, mem->guest_phys_addr, mem->memory_size);
-		spin_unlock(&kvm->mmu_lock);
-	}
+	else
+		stage2_flush_memslot(kvm, memslot);
+	spin_unlock(&kvm->mmu_lock);
 	return ret;
 }

@ -1304,6 +1380,15 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
 int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 			    unsigned long npages)
 {
+	/*
+	 * Readonly memslots are not incoherent with the caches by definition,
+	 * but in practice, they are used mostly to emulate ROMs or NOR flashes
+	 * that the guest may consider devices and hence map as uncached.
+	 * To prevent incoherency issues in these cases, tag all readonly
+	 * regions as incoherent.
+	 */
+	if (slot->flags & KVM_MEM_READONLY)
+		slot->flags |= KVM_MEMSLOT_INCOHERENT;
 	return 0;
 }

--- a/arch/arm/kvm/psci.c
+++ b/arch/arm/kvm/psci.c
@ -15,6 +15,7 @@
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

+#include <linux/preempt.h>
 #include <linux/kvm_host.h>
 #include <linux/wait.h>

@ -166,6 +167,23 @@ static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu)

 static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type)
 {
+	int i;
+	struct kvm_vcpu *tmp;
+
+	/*
+	 * The KVM ABI specifies that a system event exit may call KVM_RUN
+	 * again and may perform shutdown/reboot at a later time that when the
+	 * actual request is made.  Since we are implementing PSCI and a
+	 * caller of PSCI reboot and shutdown expects that the system shuts
+	 * down or reboots immediately, let's make sure that VCPUs are not run
+	 * after this call is handled and before the VCPUs have been
+	 * re-initialized.
+	 */
+	kvm_for_each_vcpu(i, tmp, vcpu->kvm) {
+		tmp->arch.pause = true;
+		kvm_vcpu_kick(tmp);
+	}
+
 	memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
 	vcpu->run->system_event.type = type;
 	vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
--- a/arch/arm64/include/asm/kvm_emulate.h
+++ b/arch/arm64/include/asm/kvm_emulate.h
@ -38,6 +38,11 @@ void kvm_inject_undefined(struct kvm_vcpu *vcpu);
 void kvm_inject_dabt(struct kvm_vcpu *vcpu, unsigned long addr);
 void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr);

+static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS;
+}
+
 static inline unsigned long *vcpu_pc(const struct kvm_vcpu *vcpu)
 {
 	return (unsigned long *)&vcpu_gp_regs(vcpu)->regs.pc;
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@ -165,8 +165,6 @@ struct kvm_vcpu_stat {
 	u32 halt_wakeup;
 };

-int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
-			const struct kvm_vcpu_init *init);
 int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init);
 unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu);
 int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices);
@ -200,6 +198,7 @@ struct kvm_vcpu *kvm_arm_get_running_vcpu(void);
 struct kvm_vcpu * __percpu *kvm_get_running_vcpus(void);

 u64 kvm_call_hyp(void *hypfn, ...);
+void force_vm_exit(const cpumask_t *mask);

 int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
 		int exception_index);
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@ -83,6 +83,7 @@ int create_hyp_io_mappings(void *from, void *to, phys_addr_t);
 void free_boot_hyp_pgd(void);
 void free_hyp_pgds(void);

+void stage2_unmap_vm(struct kvm *kvm);
 int kvm_alloc_stage2_pgd(struct kvm *kvm);
 void kvm_free_stage2_pgd(struct kvm *kvm);
 int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
@ -243,9 +244,10 @@ static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
 }

 static inline void coherent_cache_guest_page(struct kvm_vcpu *vcpu, hva_t hva,
-					     unsigned long size)
+					     unsigned long size,
+					     bool ipa_uncached)
 {
-	if (!vcpu_has_cache_enabled(vcpu))
+	if (!vcpu_has_cache_enabled(vcpu) || ipa_uncached)
 		kvm_flush_dcache_to_poc((void *)hva, size);

 	if (!icache_is_aliasing()) {		/* PIPT */
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@ -38,7 +38,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {

 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
-	vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS;
 	return 0;
 }

@ -297,31 +296,6 @@ int __attribute_const__ kvm_target_cpu(void)
 	return -EINVAL;
 }

-int kvm_vcpu_set_target(struct kvm_vcpu *vcpu,
-			const struct kvm_vcpu_init *init)
-{
-	unsigned int i;
-	int phys_target = kvm_target_cpu();
-
-	if (init->target != phys_target)
-		return -EINVAL;
-
-	vcpu->arch.target = phys_target;
-	bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);
-
-	/* -ENOENT for unknown features, -EINVAL for invalid combinations. */
-	for (i = 0; i < sizeof(init->features) * 8; i++) {
-		if (init->features[i / 32] & (1 << (i % 32))) {
-			if (i >= KVM_VCPU_MAX_FEATURES)
-				return -ENOENT;
-			set_bit(i, vcpu->arch.features);
-		}
-	}
-
-	/* Now we know what it is, we can reset it. */
-	return kvm_reset_vcpu(vcpu);
-}
-
 int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init)
 {
 	int target = kvm_target_cpu();
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@ -629,7 +629,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
 	 * kvm mmu, before reclaiming the page, we should
 	 * unmap it from mmu first.
 	 */
-	WARN_ON(!kvm_is_mmio_pfn(pfn) && !page_count(pfn_to_page(pfn)));
+	WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));

 	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
 		kvm_set_pfn_accessed(pfn);
@ -2460,7 +2460,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		spte |= PT_PAGE_SIZE_MASK;
 	if (tdp_enabled)
 		spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
-			kvm_is_mmio_pfn(pfn));
+			kvm_is_reserved_pfn(pfn));

 	if (host_writable)
 		spte |= SPTE_HOST_WRITEABLE;
@ -2736,7 +2736,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
 	 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
 	 * here.
 	 */
-	if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
+	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
 	    level == PT_PAGE_TABLE_LEVEL &&
 	    PageTransCompound(pfn_to_page(pfn)) &&
 	    !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
--- a/include/kvm/arm_arch_timer.h
+++ b/include/kvm/arm_arch_timer.h
@ -60,7 +60,8 @@ struct arch_timer_cpu {

 #ifdef CONFIG_KVM_ARM_TIMER
 int kvm_timer_hyp_init(void);
-int kvm_timer_init(struct kvm *kvm);
+void kvm_timer_enable(struct kvm *kvm);
+void kvm_timer_init(struct kvm *kvm);
 void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
 			  const struct kvm_irq_level *irq);
 void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu);
@ -77,11 +78,8 @@ static inline int kvm_timer_hyp_init(void)
 	return 0;
 };

-static inline int kvm_timer_init(struct kvm *kvm)
-{
-	return 0;
-}
-
+static inline void kvm_timer_enable(struct kvm *kvm) {}
+static inline void kvm_timer_init(struct kvm *kvm) {}
 static inline void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
 					const struct kvm_irq_level *irq) {}
 static inline void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu) {}
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@ -274,7 +274,7 @@ struct kvm_exit_mmio;
 #ifdef CONFIG_KVM_ARM_VGIC
 int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write);
 int kvm_vgic_hyp_init(void);
-int kvm_vgic_init(struct kvm *kvm);
+int kvm_vgic_map_resources(struct kvm *kvm);
 int kvm_vgic_create(struct kvm *kvm);
 void kvm_vgic_destroy(struct kvm *kvm);
 void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu);
@ -287,7 +287,8 @@ bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
 		      struct kvm_exit_mmio *mmio);

 #define irqchip_in_kernel(k)	(!!((k)->arch.vgic.in_kernel))
-#define vgic_initialized(k)	((k)->arch.vgic.ready)
+#define vgic_initialized(k)	(!!((k)->arch.vgic.nr_cpus))
+#define vgic_ready(k)		((k)->arch.vgic.ready)

 int vgic_v2_probe(struct device_node *vgic_node,
 		  const struct vgic_ops **ops,
@ -321,7 +322,7 @@ static inline int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr,
 	return -ENXIO;
 }

-static inline int kvm_vgic_init(struct kvm *kvm)
+static inline int kvm_vgic_map_resources(struct kvm *kvm)
 {
 	return 0;
 }
@ -373,6 +374,11 @@ static inline bool vgic_initialized(struct kvm *kvm)
 {
 	return true;
 }
+
+static inline bool vgic_ready(struct kvm *kvm)
+{
+	return true;
+}
 #endif

 #endif
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@ -43,6 +43,7 @@
 * include/linux/kvm_h.
 */
 #define KVM_MEMSLOT_INVALID	(1UL << 16)
+#define KVM_MEMSLOT_INCOHERENT	(1UL << 17)

 /* Two fragments for cross MMIO pages. */
 #define KVM_MAX_MMIO_FRAGMENTS	2
@ -712,7 +713,7 @@ void kvm_arch_sync_events(struct kvm *kvm);
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu);
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu);

-bool kvm_is_mmio_pfn(pfn_t pfn);
+bool kvm_is_reserved_pfn(pfn_t pfn);

 struct kvm_irq_ack_notifier {
 	struct hlist_node link;
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@ -61,12 +61,14 @@ static void timer_disarm(struct arch_timer_cpu *timer)

 static void kvm_timer_inject_irq(struct kvm_vcpu *vcpu)
 {
+	int ret;
 	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;

 	timer->cntv_ctl |= ARCH_TIMER_CTRL_IT_MASK;
-	kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
-			    timer->irq->irq,
-			    timer->irq->level);
+	ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
+				  timer->irq->irq,
+				  timer->irq->level);
+	WARN_ON(ret);
 }

 static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
@ -307,12 +309,24 @@ void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu)
 	timer_disarm(timer);
 }

-int kvm_timer_init(struct kvm *kvm)
+void kvm_timer_enable(struct kvm *kvm)
 {
-	if (timecounter && wqueue) {
-		kvm->arch.timer.cntvoff = kvm_phys_timer_read();
-		kvm->arch.timer.enabled = 1;
-	}
+	if (kvm->arch.timer.enabled)
+		return;

-	return 0;
+	/*
+	 * There is a potential race here between VCPUs starting for the first
+	 * time, which may be enabling the timer multiple times.  That doesn't
+	 * hurt though, because we're just setting a variable to the same
+	 * variable that it already was.  The important thing is that all
+	 * VCPUs have the enabled variable set, before entering the guest, if
+	 * the arch timers are enabled.
+	 */
+	if (timecounter && wqueue)
+		kvm->arch.timer.enabled = 1;
+}
+
+void kvm_timer_init(struct kvm *kvm)
+{
+	kvm->arch.timer.cntvoff = kvm_phys_timer_read();
 }
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@ -91,6 +91,7 @@
 #define ACCESS_WRITE_VALUE	(3 << 1)
 #define ACCESS_WRITE_MASK(x)	((x) & (3 << 1))

+static int vgic_init(struct kvm *kvm);
 static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu);
 static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu);
 static void vgic_update_state(struct kvm *kvm);
@ -1607,7 +1608,7 @@ static int vgic_validate_injection(struct kvm_vcpu *vcpu, int irq, int level)
 	}
 }

-static bool vgic_update_irq_pending(struct kvm *kvm, int cpuid,
+static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
 				  unsigned int irq_num, bool level)
 {
 	struct vgic_dist *dist = &kvm->arch.vgic;
@ -1643,9 +1644,10 @@ static bool vgic_update_irq_pending(struct kvm *kvm, int cpuid,
 			vgic_dist_irq_clear_level(vcpu, irq_num);
 			if (!vgic_dist_irq_soft_pend(vcpu, irq_num))
 				vgic_dist_irq_clear_pending(vcpu, irq_num);
-		} else {
-			vgic_dist_irq_clear_pending(vcpu, irq_num);
 		}
+
+		ret = false;
+		goto out;
 	}

 	enabled = vgic_irq_is_enabled(vcpu, irq_num);
@ -1672,7 +1674,7 @@ static bool vgic_update_irq_pending(struct kvm *kvm, int cpuid,
 out:
 	spin_unlock(&dist->lock);

-	return ret;
+	return ret ? cpuid : -EINVAL;
 }

 /**
@ -1692,11 +1694,26 @@ out:
 int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
 			bool level)
 {
-	if (likely(vgic_initialized(kvm)) &&
-	    vgic_update_irq_pending(kvm, cpuid, irq_num, level))
-		vgic_kick_vcpus(kvm);
+	int ret = 0;
+	int vcpu_id;

-	return 0;
+	if (unlikely(!vgic_initialized(kvm))) {
+		mutex_lock(&kvm->lock);
+		ret = vgic_init(kvm);
+		mutex_unlock(&kvm->lock);
+
+		if (ret)
+			goto out;
+	}
+
+	vcpu_id = vgic_update_irq_pending(kvm, cpuid, irq_num, level);
+	if (vcpu_id >= 0) {
+		/* kick the specified vcpu */
+		kvm_vcpu_kick(kvm_get_vcpu(kvm, vcpu_id));
+	}
+
+out:
+	return ret;
 }

 static irqreturn_t vgic_maintenance_handler(int irq, void *data)
@ -1726,39 +1743,14 @@ static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs)

 	int sz = (nr_irqs - VGIC_NR_PRIVATE_IRQS) / 8;
 	vgic_cpu->pending_shared = kzalloc(sz, GFP_KERNEL);
-	vgic_cpu->vgic_irq_lr_map = kzalloc(nr_irqs, GFP_KERNEL);
+	vgic_cpu->vgic_irq_lr_map = kmalloc(nr_irqs, GFP_KERNEL);

 	if (!vgic_cpu->pending_shared || !vgic_cpu->vgic_irq_lr_map) {
 		kvm_vgic_vcpu_destroy(vcpu);
 		return -ENOMEM;
 	}

-	return 0;
-}
-
-/**
- * kvm_vgic_vcpu_init - Initialize per-vcpu VGIC state
- * @vcpu: pointer to the vcpu struct
- *
- * Initialize the vgic_cpu struct and vgic_dist struct fields pertaining to
- * this vcpu and enable the VGIC for this VCPU
- */
-static void kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
-{
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-	int i;
-
-	for (i = 0; i < dist->nr_irqs; i++) {
-		if (i < VGIC_NR_PPIS)
-			vgic_bitmap_set_irq_val(&dist->irq_enabled,
-						vcpu->vcpu_id, i, 1);
-		if (i < VGIC_NR_PRIVATE_IRQS)
-			vgic_bitmap_set_irq_val(&dist->irq_cfg,
-						vcpu->vcpu_id, i, VGIC_CFG_EDGE);
-
-		vgic_cpu->vgic_irq_lr_map[i] = LR_EMPTY;
-	}
+	memset(vgic_cpu->vgic_irq_lr_map, LR_EMPTY, nr_irqs);

 	/*
 	 * Store the number of LRs per vcpu, so we don't have to go
@ -1767,7 +1759,7 @@ static void kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
 	 */
 	vgic_cpu->nr_lr = vgic->nr_lr;

-	vgic_enable(vcpu);
+	return 0;
 }

 void kvm_vgic_destroy(struct kvm *kvm)
@ -1798,20 +1790,21 @@ void kvm_vgic_destroy(struct kvm *kvm)
 	dist->irq_spi_cpu = NULL;
 	dist->irq_spi_target = NULL;
 	dist->irq_pending_on_cpu = NULL;
+	dist->nr_cpus = 0;
 }

 /*
 * Allocate and initialize the various data structures. Must be called
 * with kvm->lock held!
 */
-static int vgic_init_maps(struct kvm *kvm)
+static int vgic_init(struct kvm *kvm)
 {
 	struct vgic_dist *dist = &kvm->arch.vgic;
 	struct kvm_vcpu *vcpu;
 	int nr_cpus, nr_irqs;
-	int ret, i;
+	int ret, i, vcpu_id;

-	if (dist->nr_cpus)	/* Already allocated */
+	if (vgic_initialized(kvm))
 		return 0;

 	nr_cpus = dist->nr_cpus = atomic_read(&kvm->online_vcpus);
@ -1859,16 +1852,28 @@ static int vgic_init_maps(struct kvm *kvm)
 	if (ret)
 		goto out;

-	kvm_for_each_vcpu(i, vcpu, kvm) {
+	for (i = VGIC_NR_PRIVATE_IRQS; i < dist->nr_irqs; i += 4)
+		vgic_set_target_reg(kvm, 0, i);
+
+	kvm_for_each_vcpu(vcpu_id, vcpu, kvm) {
 		ret = vgic_vcpu_init_maps(vcpu, nr_irqs);
 		if (ret) {
 			kvm_err("VGIC: Failed to allocate vcpu memory\n");
 			break;
 		}
-	}

-	for (i = VGIC_NR_PRIVATE_IRQS; i < dist->nr_irqs; i += 4)
-		vgic_set_target_reg(kvm, 0, i);
+		for (i = 0; i < dist->nr_irqs; i++) {
+			if (i < VGIC_NR_PPIS)
+				vgic_bitmap_set_irq_val(&dist->irq_enabled,
+							vcpu->vcpu_id, i, 1);
+			if (i < VGIC_NR_PRIVATE_IRQS)
+				vgic_bitmap_set_irq_val(&dist->irq_cfg,
+							vcpu->vcpu_id, i,
+							VGIC_CFG_EDGE);
+		}
+
+		vgic_enable(vcpu);
+	}

 out:
 	if (ret)
@ -1878,25 +1883,23 @@ out:
 }

 /**
- * kvm_vgic_init - Initialize global VGIC state before running any VCPUs
+ * kvm_vgic_map_resources - Configure global VGIC state before running any VCPUs
 * @kvm: pointer to the kvm struct
 *
 * Map the virtual CPU interface into the VM before running any VCPUs.  We
 * can't do this at creation time, because user space must first set the
- * virtual CPU interface address in the guest physical address space.  Also
- * initialize the ITARGETSRn regs to 0 on the emulated distributor.
+ * virtual CPU interface address in the guest physical address space.
 */
-int kvm_vgic_init(struct kvm *kvm)
+int kvm_vgic_map_resources(struct kvm *kvm)
 {
-	struct kvm_vcpu *vcpu;
-	int ret = 0, i;
+	int ret = 0;

 	if (!irqchip_in_kernel(kvm))
 		return 0;

 	mutex_lock(&kvm->lock);

-	if (vgic_initialized(kvm))
+	if (vgic_ready(kvm))
 		goto out;

 	if (IS_VGIC_ADDR_UNDEF(kvm->arch.vgic.vgic_dist_base) ||
@ -1906,7 +1909,11 @@ int kvm_vgic_init(struct kvm *kvm)
 		goto out;
 	}

-	ret = vgic_init_maps(kvm);
+	/*
+	 * Initialize the vgic if this hasn't already been done on demand by
+	 * accessing the vgic state from userspace.
+	 */
+	ret = vgic_init(kvm);
 	if (ret) {
 		kvm_err("Unable to allocate maps\n");
 		goto out;
@ -1920,9 +1927,6 @@ int kvm_vgic_init(struct kvm *kvm)
 		goto out;
 	}

-	kvm_for_each_vcpu(i, vcpu, kvm)
-		kvm_vgic_vcpu_init(vcpu);
-
 	kvm->arch.vgic.ready = true;
 out:
 	if (ret)
@ -2167,7 +2171,7 @@ static int vgic_attr_regs_access(struct kvm_device *dev,

 	mutex_lock(&dev->kvm->lock);

-	ret = vgic_init_maps(dev->kvm);
+	ret = vgic_init(dev->kvm);
 	if (ret)
 		goto out;

@ -2289,7 +2293,7 @@ static int vgic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)

 		mutex_lock(&dev->kvm->lock);

-		if (vgic_initialized(dev->kvm) || dev->kvm->arch.vgic.nr_irqs)
+		if (vgic_ready(dev->kvm) || dev->kvm->arch.vgic.nr_irqs)
 			ret = -EBUSY;
 		else
 			dev->kvm->arch.vgic.nr_irqs = val;
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@ -107,10 +107,10 @@ EXPORT_SYMBOL_GPL(kvm_rebooting);

 static bool largepages_enabled = true;

-bool kvm_is_mmio_pfn(pfn_t pfn)
+bool kvm_is_reserved_pfn(pfn_t pfn)
 {
 	if (pfn_valid(pfn))
-		return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn));
+		return PageReserved(pfn_to_page(pfn));

 	return true;
 }
@ -1301,7 +1301,7 @@ static pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
 	else if ((vma->vm_flags & VM_PFNMAP)) {
 		pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
 			vma->vm_pgoff;
-		BUG_ON(!kvm_is_mmio_pfn(pfn));
+		BUG_ON(!kvm_is_reserved_pfn(pfn));
 	} else {
 		if (async && vma_is_valid(vma, write_fault))
 			*async = true;
@ -1407,7 +1407,7 @@ static struct page *kvm_pfn_to_page(pfn_t pfn)
 	if (is_error_noslot_pfn(pfn))
 		return KVM_ERR_PTR_BAD_PAGE;

-	if (kvm_is_mmio_pfn(pfn)) {
+	if (kvm_is_reserved_pfn(pfn)) {
 		WARN_ON(1);
 		return KVM_ERR_PTR_BAD_PAGE;
 	}
@ -1436,7 +1436,7 @@ EXPORT_SYMBOL_GPL(kvm_release_page_clean);

 void kvm_release_pfn_clean(pfn_t pfn)
 {
-	if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn))
+	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn))
 		put_page(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
@ -1457,7 +1457,7 @@ static void kvm_release_pfn_dirty(pfn_t pfn)

 void kvm_set_pfn_dirty(pfn_t pfn)
 {
-	if (!kvm_is_mmio_pfn(pfn)) {
+	if (!kvm_is_reserved_pfn(pfn)) {
 		struct page *page = pfn_to_page(pfn);
 		if (!PageReserved(page))
 			SetPageDirty(page);
@ -1467,14 +1467,14 @@ EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);

 void kvm_set_pfn_accessed(pfn_t pfn)
 {
-	if (!kvm_is_mmio_pfn(pfn))
+	if (!kvm_is_reserved_pfn(pfn))
 		mark_page_accessed(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);

 void kvm_get_pfn(pfn_t pfn)
 {
-	if (!kvm_is_mmio_pfn(pfn))
+	if (!kvm_is_reserved_pfn(pfn))
 		get_page(pfn_to_page(pfn));
 }
 EXPORT_SYMBOL_GPL(kvm_get_pfn);