Fixes for PPC and s390.

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v2.0.22 (GNU/Linux)

iQEcBAABAgAGBQJc85ibAAoJEL/70l94x66D72gH/iaXjRF9uqGSnd1/JLHIawfb
oH0VQS24tBzRlFREBTA68IxThgjTmSS+yHcAXSO7JmxztjGq3ZWiNaidQIvC1reu
t4MJMvf7ZZa7Yq0OAy2jwVAkZMKk5P8hBjjI5N7pEBb4ApJHzsCHV+KEIe5loc+q
f5LYLR53keImJ40wxh/qFftNNlYJUMv6tWa8y0mrlBrKABOvdRYFswhqcnEPibi9
cPoHDS6Ep/34eAVQzqHzfDbjezpa342SSw6s66Vpb/qYJyxoUh1Mw+9YCmAWanS8
vuvXz4qjCFvLRrmc9ctASUTEVydqx8IdcKQGiteWgpSrl4kgy6nLMZDY5sbq8UM=
=Bgfn
-----END PGP SIGNATURE-----

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM fixes from Paolo Bonzini:
 "Fixes for PPC and s390"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  KVM: PPC: Book3S HV: Restore SPRG3 in kvmhv_p9_guest_entry()
  KVM: PPC: Book3S HV: Fix lockdep warning when entering guest on POWER9
  KVM: PPC: Book3S HV: XIVE: Fix page offset when clearing ESB pages
  KVM: PPC: Book3S HV: XIVE: Take the srcu read lock when accessing memslots
  KVM: PPC: Book3S HV: XIVE: Do not clear IRQ data of passthrough interrupts
  KVM: PPC: Book3S HV: XIVE: Introduce a new mutex for the XIVE device
  KVM: PPC: Book3S HV: XIVE: Fix the enforced limit on the vCPU identifier
  KVM: PPC: Book3S HV: XIVE: Do not test the EQ flag validity when resetting
  KVM: PPC: Book3S HV: XIVE: Clear file mapping when device is released
  KVM: PPC: Book3S HV: Don't take kvm->lock around kvm_for_each_vcpu
  KVM: PPC: Book3S: Use new mutex to synchronize access to rtas token list
  KVM: PPC: Book3S HV: Use new mutex to synchronize MMU setup
  KVM: PPC: Book3S HV: Avoid touching arch.mmu_ready in XIVE release functions
  KVM: s390: Do not report unusabled IDs via KVM_CAP_MAX_VCPU_ID
  kvm: fix compile on s390 part 2
2019-06-02 10:19:39 -07:00 · 2019-06-02 10:19:39 -07:00 · b44a1dd3f6
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@ -1122,6 +1122,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_MAX_VCPUS:
 		r = KVM_MAX_VCPUS;
 		break;
+	case KVM_CAP_MAX_VCPU_ID:
+		r = KVM_MAX_VCPU_ID;
+		break;
 	case KVM_CAP_MIPS_FPU:
 		/* We don't handle systems with inconsistent cpu_has_fpu */
 		r = !!raw_cpu_has_fpu;
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@ -309,6 +309,7 @@ struct kvm_arch {
 #ifdef CONFIG_PPC_BOOK3S_64
 	struct list_head spapr_tce_tables;
 	struct list_head rtas_tokens;
+	struct mutex rtas_token_lock;
 	DECLARE_BITMAP(enabled_hcalls, MAX_HCALL_OPCODE/4 + 1);
 #endif
 #ifdef CONFIG_KVM_MPIC
@ -325,6 +326,7 @@ struct kvm_arch {
 #endif
 	struct kvmppc_ops *kvm_ops;
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	struct mutex mmu_setup_lock;	/* nests inside vcpu mutexes */
 	u64 l1_ptcr;
 	int max_nested_lpid;
 	struct kvm_nested_guest *nested_guests[KVM_MAX_NESTED_GUESTS];
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@ -902,6 +902,7 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 #ifdef CONFIG_PPC64
 	INIT_LIST_HEAD_RCU(&kvm->arch.spapr_tce_tables);
 	INIT_LIST_HEAD(&kvm->arch.rtas_tokens);
+	mutex_init(&kvm->arch.rtas_token_lock);
 #endif

 	return kvm->arch.kvm_ops->init_vm(kvm);
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@ -63,7 +63,7 @@ struct kvm_resize_hpt {
 	struct work_struct work;
 	u32 order;

-	/* These fields protected by kvm->lock */
+	/* These fields protected by kvm->arch.mmu_setup_lock */

 	/* Possible values and their usage:
 	 *  <0     an error occurred during allocation,
@ -73,7 +73,7 @@ struct kvm_resize_hpt {
 	int error;

 	/* Private to the work thread, until error != -EBUSY,
-	 * then protected by kvm->lock.
+	 * then protected by kvm->arch.mmu_setup_lock.
 	 */
 	struct kvm_hpt_info hpt;
 };
@ -139,7 +139,7 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order)
 	long err = -EBUSY;
 	struct kvm_hpt_info info;

-	mutex_lock(&kvm->lock);
+	mutex_lock(&kvm->arch.mmu_setup_lock);
 	if (kvm->arch.mmu_ready) {
 		kvm->arch.mmu_ready = 0;
 		/* order mmu_ready vs. vcpus_running */
@ -183,7 +183,7 @@ out:
 		/* Ensure that each vcpu will flush its TLB on next entry. */
 		cpumask_setall(&kvm->arch.need_tlb_flush);

-	mutex_unlock(&kvm->lock);
+	mutex_unlock(&kvm->arch.mmu_setup_lock);
 	return err;
 }

@ -1447,7 +1447,7 @@ static void resize_hpt_pivot(struct kvm_resize_hpt *resize)

 static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize)
 {
-	if (WARN_ON(!mutex_is_locked(&kvm->lock)))
+	if (WARN_ON(!mutex_is_locked(&kvm->arch.mmu_setup_lock)))
 		return;

 	if (!resize)
@ -1474,14 +1474,14 @@ static void resize_hpt_prepare_work(struct work_struct *work)
 	if (WARN_ON(resize->error != -EBUSY))
 		return;

-	mutex_lock(&kvm->lock);
+	mutex_lock(&kvm->arch.mmu_setup_lock);

 	/* Request is still current? */
 	if (kvm->arch.resize_hpt == resize) {
 		/* We may request large allocations here:
-		 * do not sleep with kvm->lock held for a while.
+		 * do not sleep with kvm->arch.mmu_setup_lock held for a while.
 		 */
-		mutex_unlock(&kvm->lock);
+		mutex_unlock(&kvm->arch.mmu_setup_lock);

 		resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n",
 				 resize->order);
@ -1494,9 +1494,9 @@ static void resize_hpt_prepare_work(struct work_struct *work)
 		if (WARN_ON(err == -EBUSY))
 			err = -EINPROGRESS;

-		mutex_lock(&kvm->lock);
+		mutex_lock(&kvm->arch.mmu_setup_lock);
 		/* It is possible that kvm->arch.resize_hpt != resize
-		 * after we grab kvm->lock again.
+		 * after we grab kvm->arch.mmu_setup_lock again.
 		 */
 	}

@ -1505,7 +1505,7 @@ static void resize_hpt_prepare_work(struct work_struct *work)
 	if (kvm->arch.resize_hpt != resize)
 		resize_hpt_release(kvm, resize);

-	mutex_unlock(&kvm->lock);
+	mutex_unlock(&kvm->arch.mmu_setup_lock);
 }

 long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm,
@ -1522,7 +1522,7 @@ long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm,
 	if (shift && ((shift < 18) || (shift > 46)))
 		return -EINVAL;

-	mutex_lock(&kvm->lock);
+	mutex_lock(&kvm->arch.mmu_setup_lock);

 	resize = kvm->arch.resize_hpt;

@ -1565,7 +1565,7 @@ long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm,
 	ret = 100; /* estimated time in ms */

 out:
-	mutex_unlock(&kvm->lock);
+	mutex_unlock(&kvm->arch.mmu_setup_lock);
 	return ret;
 }

@ -1588,7 +1588,7 @@ long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm,
 	if (shift && ((shift < 18) || (shift > 46)))
 		return -EINVAL;

-	mutex_lock(&kvm->lock);
+	mutex_lock(&kvm->arch.mmu_setup_lock);

 	resize = kvm->arch.resize_hpt;

@ -1625,7 +1625,7 @@ out:
 	smp_mb();
 out_no_hpt:
 	resize_hpt_release(kvm, resize);
-	mutex_unlock(&kvm->lock);
+	mutex_unlock(&kvm->arch.mmu_setup_lock);
 	return ret;
 }

@ -1868,7 +1868,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
 		return -EINVAL;

 	/* lock out vcpus from running while we're doing this */
-	mutex_lock(&kvm->lock);
+	mutex_lock(&kvm->arch.mmu_setup_lock);
 	mmu_ready = kvm->arch.mmu_ready;
 	if (mmu_ready) {
 		kvm->arch.mmu_ready = 0;	/* temporarily */
@ -1876,7 +1876,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
 		smp_mb();
 		if (atomic_read(&kvm->arch.vcpus_running)) {
 			kvm->arch.mmu_ready = 1;
-			mutex_unlock(&kvm->lock);
+			mutex_unlock(&kvm->arch.mmu_setup_lock);
 			return -EBUSY;
 		}
 	}
@ -1963,7 +1963,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
 	/* Order HPTE updates vs. mmu_ready */
 	smp_wmb();
 	kvm->arch.mmu_ready = mmu_ready;
-	mutex_unlock(&kvm->lock);
+	mutex_unlock(&kvm->arch.mmu_setup_lock);

 	if (err)
 		return err;
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@ -446,12 +446,7 @@ static void kvmppc_dump_regs(struct kvm_vcpu *vcpu)

 static struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id)
 {
-	struct kvm_vcpu *ret;
-
-	mutex_lock(&kvm->lock);
-	ret = kvm_get_vcpu_by_id(kvm, id);
-	mutex_unlock(&kvm->lock);
-	return ret;
+	return kvm_get_vcpu_by_id(kvm, id);
 }

 static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa)
@ -1583,7 +1578,6 @@ static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr,
 	struct kvmppc_vcore *vc = vcpu->arch.vcore;
 	u64 mask;

-	mutex_lock(&kvm->lock);
 	spin_lock(&vc->lock);
 	/*
 	 * If ILE (interrupt little-endian) has changed, update the
@ -1623,7 +1617,6 @@ static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr,
 		mask &= 0xFFFFFFFF;
 	vc->lpcr = (vc->lpcr & ~mask) | (new_lpcr & mask);
 	spin_unlock(&vc->lock);
-	mutex_unlock(&kvm->lock);
 }

 static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
@ -2338,11 +2331,17 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
 			pr_devel("KVM: collision on id %u", id);
 			vcore = NULL;
 		} else if (!vcore) {
+			/*
+			 * Take mmu_setup_lock for mutual exclusion
+			 * with kvmppc_update_lpcr().
+			 */
 			err = -ENOMEM;
 			vcore = kvmppc_vcore_create(kvm,
 					id & ~(kvm->arch.smt_mode - 1));
+			mutex_lock(&kvm->arch.mmu_setup_lock);
 			kvm->arch.vcores[core] = vcore;
 			kvm->arch.online_vcores++;
+			mutex_unlock(&kvm->arch.mmu_setup_lock);
 		}
 	}
 	mutex_unlock(&kvm->lock);
@ -3663,6 +3662,7 @@ int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
 	vc->in_guest = 0;

 	mtspr(SPRN_DEC, local_paca->kvm_hstate.dec_expires - mftb());
+	mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso);

 	kvmhv_load_host_pmu();

@ -3859,7 +3859,7 @@ static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu)
 	int r = 0;
 	struct kvm *kvm = vcpu->kvm;

-	mutex_lock(&kvm->lock);
+	mutex_lock(&kvm->arch.mmu_setup_lock);
 	if (!kvm->arch.mmu_ready) {
 		if (!kvm_is_radix(kvm))
 			r = kvmppc_hv_setup_htab_rma(vcpu);
@ -3869,7 +3869,7 @@ static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu)
 			kvm->arch.mmu_ready = 1;
 		}
 	}
-	mutex_unlock(&kvm->lock);
+	mutex_unlock(&kvm->arch.mmu_setup_lock);
 	return r;
 }

@ -4091,16 +4091,20 @@ int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
 		kvmppc_check_need_tlb_flush(kvm, pcpu, nested);
 	}

-	trace_hardirqs_on();
 	guest_enter_irqoff();

 	srcu_idx = srcu_read_lock(&kvm->srcu);

 	this_cpu_disable_ftrace();

+	/* Tell lockdep that we're about to enable interrupts */
+	trace_hardirqs_on();
+
 	trap = kvmhv_p9_guest_entry(vcpu, time_limit, lpcr);
 	vcpu->arch.trap = trap;

+	trace_hardirqs_off();
+
 	this_cpu_enable_ftrace();

 	srcu_read_unlock(&kvm->srcu, srcu_idx);
@ -4110,7 +4114,6 @@ int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
 		isync();
 	}

-	trace_hardirqs_off();
 	set_irq_happened(trap);

 	kvmppc_set_host_core(pcpu);
@ -4478,7 +4481,8 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,

 /*
 * Update LPCR values in kvm->arch and in vcores.
- * Caller must hold kvm->lock.
+ * Caller must hold kvm->arch.mmu_setup_lock (for mutual exclusion
+ * of kvm->arch.lpcr update).
 */
 void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, unsigned long mask)
 {
@ -4530,7 +4534,7 @@ void kvmppc_setup_partition_table(struct kvm *kvm)

 /*
 * Set up HPT (hashed page table) and RMA (real-mode area).
- * Must be called with kvm->lock held.
+ * Must be called with kvm->arch.mmu_setup_lock held.
 */
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 {
@ -4618,7 +4622,10 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 	goto out_srcu;
 }

-/* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */
+/*
+ * Must be called with kvm->arch.mmu_setup_lock held and
+ * mmu_ready = 0 and no vcpus running.
+ */
 int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
 {
 	if (nesting_enabled(kvm))
@ -4635,7 +4642,10 @@ int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
 	return 0;
 }

-/* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */
+/*
+ * Must be called with kvm->arch.mmu_setup_lock held and
+ * mmu_ready = 0 and no vcpus running.
+ */
 int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
 {
 	int err;
@ -4740,6 +4750,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 	char buf[32];
 	int ret;

+	mutex_init(&kvm->arch.mmu_setup_lock);
+
 	/* Allocate the guest's logical partition ID */

 	lpid = kvmppc_alloc_lpid();
@ -5265,7 +5277,7 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
 	if (kvmhv_on_pseries() && !radix)
 		return -EINVAL;

-	mutex_lock(&kvm->lock);
+	mutex_lock(&kvm->arch.mmu_setup_lock);
 	if (radix != kvm_is_radix(kvm)) {
 		if (kvm->arch.mmu_ready) {
 			kvm->arch.mmu_ready = 0;
@ -5293,7 +5305,7 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
 	err = 0;

 out_unlock:
-	mutex_unlock(&kvm->lock);
+	mutex_unlock(&kvm->arch.mmu_setup_lock);
 	return err;
 }

--- a/arch/powerpc/kvm/book3s_rtas.c
+++ b/arch/powerpc/kvm/book3s_rtas.c
@ -146,7 +146,7 @@ static int rtas_token_undefine(struct kvm *kvm, char *name)
 {
 	struct rtas_token_definition *d, *tmp;

-	lockdep_assert_held(&kvm->lock);
+	lockdep_assert_held(&kvm->arch.rtas_token_lock);

 	list_for_each_entry_safe(d, tmp, &kvm->arch.rtas_tokens, list) {
 		if (rtas_name_matches(d->handler->name, name)) {
@ -167,7 +167,7 @@ static int rtas_token_define(struct kvm *kvm, char *name, u64 token)
 	bool found;
 	int i;

-	lockdep_assert_held(&kvm->lock);
+	lockdep_assert_held(&kvm->arch.rtas_token_lock);

 	list_for_each_entry(d, &kvm->arch.rtas_tokens, list) {
 		if (d->token == token)
@ -206,14 +206,14 @@ int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user *argp)
 	if (copy_from_user(&args, argp, sizeof(args)))
 		return -EFAULT;

-	mutex_lock(&kvm->lock);
+	mutex_lock(&kvm->arch.rtas_token_lock);

 	if (args.token)
 		rc = rtas_token_define(kvm, args.name, args.token);
 	else
 		rc = rtas_token_undefine(kvm, args.name);

-	mutex_unlock(&kvm->lock);
+	mutex_unlock(&kvm->arch.rtas_token_lock);

 	return rc;
 }
@ -245,7 +245,7 @@ int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu)
 	orig_rets = args.rets;
 	args.rets = &args.args[be32_to_cpu(args.nargs)];

-	mutex_lock(&vcpu->kvm->lock);
+	mutex_lock(&vcpu->kvm->arch.rtas_token_lock);

 	rc = -ENOENT;
 	list_for_each_entry(d, &vcpu->kvm->arch.rtas_tokens, list) {
@ -256,7 +256,7 @@ int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu)
 		}
 	}

-	mutex_unlock(&vcpu->kvm->lock);
+	mutex_unlock(&vcpu->kvm->arch.rtas_token_lock);

 	if (rc == 0) {
 		args.rets = orig_rets;
@ -282,8 +282,6 @@ void kvmppc_rtas_tokens_free(struct kvm *kvm)
 {
 	struct rtas_token_definition *d, *tmp;

-	lockdep_assert_held(&kvm->lock);
-
 	list_for_each_entry_safe(d, tmp, &kvm->arch.rtas_tokens, list) {
 		list_del(&d->list);
 		kfree(d);
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@ -271,14 +271,14 @@ static int xive_provision_queue(struct kvm_vcpu *vcpu, u8 prio)
 	return rc;
 }

-/* Called with kvm_lock held */
+/* Called with xive->lock held */
 static int xive_check_provisioning(struct kvm *kvm, u8 prio)
 {
 	struct kvmppc_xive *xive = kvm->arch.xive;
 	struct kvm_vcpu *vcpu;
 	int i, rc;

-	lockdep_assert_held(&kvm->lock);
+	lockdep_assert_held(&xive->lock);

 	/* Already provisioned ? */
 	if (xive->qmap & (1 << prio))
@ -621,9 +621,12 @@ int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server,
 		 irq, server, priority);

 	/* First, check provisioning of queues */
-	if (priority != MASKED)
+	if (priority != MASKED) {
+		mutex_lock(&xive->lock);
 		rc = xive_check_provisioning(xive->kvm,
 			      xive_prio_from_guest(priority));
+		mutex_unlock(&xive->lock);
+	}
 	if (rc) {
 		pr_devel("  provisioning failure %d !\n", rc);
 		return rc;
@ -1199,7 +1202,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
 		return -ENOMEM;

 	/* We need to synchronize with queue provisioning */
-	mutex_lock(&vcpu->kvm->lock);
+	mutex_lock(&xive->lock);
 	vcpu->arch.xive_vcpu = xc;
 	xc->xive = xive;
 	xc->vcpu = vcpu;
@ -1283,7 +1286,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
 		xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_00);

 bail:
-	mutex_unlock(&vcpu->kvm->lock);
+	mutex_unlock(&xive->lock);
 	if (r) {
 		kvmppc_xive_cleanup_vcpu(vcpu);
 		return r;
@ -1527,13 +1530,12 @@ static int xive_get_source(struct kvmppc_xive *xive, long irq, u64 addr)
 struct kvmppc_xive_src_block *kvmppc_xive_create_src_block(
 	struct kvmppc_xive *xive, int irq)
 {
-	struct kvm *kvm = xive->kvm;
 	struct kvmppc_xive_src_block *sb;
 	int i, bid;

 	bid = irq >> KVMPPC_XICS_ICS_SHIFT;

-	mutex_lock(&kvm->lock);
+	mutex_lock(&xive->lock);

 	/* block already exists - somebody else got here first */
 	if (xive->src_blocks[bid])
@ -1560,7 +1562,7 @@ struct kvmppc_xive_src_block *kvmppc_xive_create_src_block(
 		xive->max_sbid = bid;

 out:
-	mutex_unlock(&kvm->lock);
+	mutex_unlock(&xive->lock);
 	return xive->src_blocks[bid];
 }

@ -1670,9 +1672,9 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr)
 	/* If we have a priority target the interrupt */
 	if (act_prio != MASKED) {
 		/* First, check provisioning of queues */
-		mutex_lock(&xive->kvm->lock);
+		mutex_lock(&xive->lock);
 		rc = xive_check_provisioning(xive->kvm, act_prio);
-		mutex_unlock(&xive->kvm->lock);
+		mutex_unlock(&xive->lock);

 		/* Target interrupt */
 		if (rc == 0)
@ -1826,7 +1828,6 @@ static void kvmppc_xive_cleanup_irq(u32 hw_num, struct xive_irq_data *xd)
 {
 	xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_01);
 	xive_native_configure_irq(hw_num, 0, MASKED, 0);
-	xive_cleanup_irq_data(xd);
 }

 void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb)
@ -1840,9 +1841,10 @@ void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb)
 			continue;

 		kvmppc_xive_cleanup_irq(state->ipi_number, &state->ipi_data);
+		xive_cleanup_irq_data(&state->ipi_data);
 		xive_native_free_irq(state->ipi_number);

-		/* Pass-through, cleanup too */
+		/* Pass-through, cleanup too but keep IRQ hw data */
 		if (state->pt_number)
 			kvmppc_xive_cleanup_irq(state->pt_number, state->pt_data);

@ -1859,21 +1861,10 @@ static void kvmppc_xive_release(struct kvm_device *dev)
 	struct kvm *kvm = xive->kvm;
 	struct kvm_vcpu *vcpu;
 	int i;
-	int was_ready;

 	pr_devel("Releasing xive device\n");

-	debugfs_remove(xive->dentry);
-
 	/*
-	 * Clearing mmu_ready temporarily while holding kvm->lock
-	 * is a way of ensuring that no vcpus can enter the guest
-	 * until we drop kvm->lock.  Doing kick_all_cpus_sync()
-	 * ensures that any vcpu executing inside the guest has
-	 * exited the guest.  Once kick_all_cpus_sync() has finished,
-	 * we know that no vcpu can be executing the XIVE push or
-	 * pull code, or executing a XICS hcall.
-	 *
 	 * Since this is the device release function, we know that
 	 * userspace does not have any open fd referring to the
 	 * device.  Therefore there can not be any of the device
@ -1881,9 +1872,8 @@ static void kvmppc_xive_release(struct kvm_device *dev)
 	 * and similarly, the connect_vcpu and set/clr_mapped
 	 * functions also cannot be being executed.
 	 */
-	was_ready = kvm->arch.mmu_ready;
-	kvm->arch.mmu_ready = 0;
-	kick_all_cpus_sync();
+
+	debugfs_remove(xive->dentry);

 	/*
 	 * We should clean up the vCPU interrupt presenters first.
@ -1892,12 +1882,22 @@ static void kvmppc_xive_release(struct kvm_device *dev)
 		/*
 		 * Take vcpu->mutex to ensure that no one_reg get/set ioctl
 		 * (i.e. kvmppc_xive_[gs]et_icp) can be done concurrently.
+		 * Holding the vcpu->mutex also means that the vcpu cannot
+		 * be executing the KVM_RUN ioctl, and therefore it cannot
+		 * be executing the XIVE push or pull code or accessing
+		 * the XIVE MMIO regions.
 		 */
 		mutex_lock(&vcpu->mutex);
 		kvmppc_xive_cleanup_vcpu(vcpu);
 		mutex_unlock(&vcpu->mutex);
 	}

+	/*
+	 * Now that we have cleared vcpu->arch.xive_vcpu, vcpu->arch.irq_type
+	 * and vcpu->arch.xive_esc_[vr]addr on each vcpu, we are safe
+	 * against xive code getting called during vcpu execution or
+	 * set/get one_reg operations.
+	 */
 	kvm->arch.xive = NULL;

 	/* Mask and free interrupts */
@ -1911,8 +1911,6 @@ static void kvmppc_xive_release(struct kvm_device *dev)
 	if (xive->vp_base != XIVE_INVALID_VP)
 		xive_native_free_vp_block(xive->vp_base);

-	kvm->arch.mmu_ready = was_ready;
-
 	/*
 	 * A reference of the kvmppc_xive pointer is now kept under
 	 * the xive_devices struct of the machine for reuse. It is
@ -1967,6 +1965,7 @@ static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
 	dev->private = xive;
 	xive->dev = dev;
 	xive->kvm = kvm;
+	mutex_init(&xive->lock);

 	/* Already there ? */
 	if (kvm->arch.xive)
--- a/arch/powerpc/kvm/book3s_xive.h
+++ b/arch/powerpc/kvm/book3s_xive.h
@ -141,6 +141,7 @@ struct kvmppc_xive {
 	struct kvmppc_xive_ops *ops;
 	struct address_space   *mapping;
 	struct mutex mapping_lock;
+	struct mutex lock;
 };

 #define KVMPPC_XIVE_Q_COUNT	8
--- a/arch/powerpc/kvm/book3s_xive_native.c
+++ b/arch/powerpc/kvm/book3s_xive_native.c
@ -109,12 +109,12 @@ int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
 		return -EPERM;
 	if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
 		return -EBUSY;
-	if (server_num >= KVM_MAX_VCPUS) {
+	if (server_num >= (KVM_MAX_VCPUS * vcpu->kvm->arch.emul_smt_mode)) {
 		pr_devel("Out of bounds !\n");
 		return -EINVAL;
 	}

-	mutex_lock(&vcpu->kvm->lock);
+	mutex_lock(&xive->lock);

 	if (kvmppc_xive_find_server(vcpu->kvm, server_num)) {
 		pr_devel("Duplicate !\n");
@ -159,7 +159,7 @@ int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,

 	/* TODO: reset all queues to a clean state ? */
 bail:
-	mutex_unlock(&vcpu->kvm->lock);
+	mutex_unlock(&xive->lock);
 	if (rc)
 		kvmppc_xive_native_cleanup_vcpu(vcpu);

@ -172,6 +172,7 @@ bail:
 static int kvmppc_xive_native_reset_mapped(struct kvm *kvm, unsigned long irq)
 {
 	struct kvmppc_xive *xive = kvm->arch.xive;
+	pgoff_t esb_pgoff = KVM_XIVE_ESB_PAGE_OFFSET + irq * 2;

 	if (irq >= KVMPPC_XIVE_NR_IRQS)
 		return -EINVAL;
@ -185,7 +186,7 @@ static int kvmppc_xive_native_reset_mapped(struct kvm *kvm, unsigned long irq)
 	mutex_lock(&xive->mapping_lock);
 	if (xive->mapping)
 		unmap_mapping_range(xive->mapping,
-				    irq * (2ull << PAGE_SHIFT),
+				    esb_pgoff << PAGE_SHIFT,
 				    2ull << PAGE_SHIFT, 1);
 	mutex_unlock(&xive->mapping_lock);
 	return 0;
@ -535,6 +536,7 @@ static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive,
 	struct xive_q *q;
 	gfn_t gfn;
 	unsigned long page_size;
+	int srcu_idx;

 	/*
 	 * Demangle priority/server tuple from the EQ identifier
@ -565,24 +567,6 @@ static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive,
 		 __func__, server, priority, kvm_eq.flags,
 		 kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, kvm_eq.qindex);

-	/*
-	 * sPAPR specifies a "Unconditional Notify (n) flag" for the
-	 * H_INT_SET_QUEUE_CONFIG hcall which forces notification
-	 * without using the coalescing mechanisms provided by the
-	 * XIVE END ESBs. This is required on KVM as notification
-	 * using the END ESBs is not supported.
-	 */
-	if (kvm_eq.flags != KVM_XIVE_EQ_ALWAYS_NOTIFY) {
-		pr_err("invalid flags %d\n", kvm_eq.flags);
-		return -EINVAL;
-	}
-
-	rc = xive_native_validate_queue_size(kvm_eq.qshift);
-	if (rc) {
-		pr_err("invalid queue size %d\n", kvm_eq.qshift);
-		return rc;
-	}
-
 	/* reset queue and disable queueing */
 	if (!kvm_eq.qshift) {
 		q->guest_qaddr  = 0;
@ -604,26 +588,48 @@ static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive,
 		return 0;
 	}

+	/*
+	 * sPAPR specifies a "Unconditional Notify (n) flag" for the
+	 * H_INT_SET_QUEUE_CONFIG hcall which forces notification
+	 * without using the coalescing mechanisms provided by the
+	 * XIVE END ESBs. This is required on KVM as notification
+	 * using the END ESBs is not supported.
+	 */
+	if (kvm_eq.flags != KVM_XIVE_EQ_ALWAYS_NOTIFY) {
+		pr_err("invalid flags %d\n", kvm_eq.flags);
+		return -EINVAL;
+	}
+
+	rc = xive_native_validate_queue_size(kvm_eq.qshift);
+	if (rc) {
+		pr_err("invalid queue size %d\n", kvm_eq.qshift);
+		return rc;
+	}
+
 	if (kvm_eq.qaddr & ((1ull << kvm_eq.qshift) - 1)) {
 		pr_err("queue page is not aligned %llx/%llx\n", kvm_eq.qaddr,
 		       1ull << kvm_eq.qshift);
 		return -EINVAL;
 	}

+	srcu_idx = srcu_read_lock(&kvm->srcu);
 	gfn = gpa_to_gfn(kvm_eq.qaddr);
 	page = gfn_to_page(kvm, gfn);
 	if (is_error_page(page)) {
+		srcu_read_unlock(&kvm->srcu, srcu_idx);
 		pr_err("Couldn't get queue page %llx!\n", kvm_eq.qaddr);
 		return -EINVAL;
 	}

 	page_size = kvm_host_page_size(kvm, gfn);
 	if (1ull << kvm_eq.qshift > page_size) {
+		srcu_read_unlock(&kvm->srcu, srcu_idx);
 		pr_warn("Incompatible host page size %lx!\n", page_size);
 		return -EINVAL;
 	}

 	qaddr = page_to_virt(page) + (kvm_eq.qaddr & ~PAGE_MASK);
+	srcu_read_unlock(&kvm->srcu, srcu_idx);

 	/*
 	 * Backup the queue page guest address to the mark EQ page
@ -772,7 +778,7 @@ static int kvmppc_xive_reset(struct kvmppc_xive *xive)

 	pr_devel("%s\n", __func__);

-	mutex_lock(&kvm->lock);
+	mutex_lock(&xive->lock);

 	kvm_for_each_vcpu(i, vcpu, kvm) {
 		struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
@ -810,7 +816,7 @@ static int kvmppc_xive_reset(struct kvmppc_xive *xive)
 		}
 	}

-	mutex_unlock(&kvm->lock);
+	mutex_unlock(&xive->lock);

 	return 0;
 }
@ -854,6 +860,7 @@ static int kvmppc_xive_native_vcpu_eq_sync(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
 	unsigned int prio;
+	int srcu_idx;

 	if (!xc)
 		return -ENOENT;
@ -865,7 +872,9 @@ static int kvmppc_xive_native_vcpu_eq_sync(struct kvm_vcpu *vcpu)
 			continue;

 		/* Mark EQ page dirty for migration */
+		srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 		mark_page_dirty(vcpu->kvm, gpa_to_gfn(q->guest_qaddr));
+		srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
 	}
 	return 0;
 }
@ -878,7 +887,7 @@ static int kvmppc_xive_native_eq_sync(struct kvmppc_xive *xive)

 	pr_devel("%s\n", __func__);

-	mutex_lock(&kvm->lock);
+	mutex_lock(&xive->lock);
 	for (i = 0; i <= xive->max_sbid; i++) {
 		struct kvmppc_xive_src_block *sb = xive->src_blocks[i];

@ -892,7 +901,7 @@ static int kvmppc_xive_native_eq_sync(struct kvmppc_xive *xive)
 	kvm_for_each_vcpu(i, vcpu, kvm) {
 		kvmppc_xive_native_vcpu_eq_sync(vcpu);
 	}
-	mutex_unlock(&kvm->lock);
+	mutex_unlock(&xive->lock);

 	return 0;
 }
@ -965,7 +974,7 @@ static int kvmppc_xive_native_has_attr(struct kvm_device *dev,
 }

 /*
- * Called when device fd is closed
+ * Called when device fd is closed.  kvm->lock is held.
 */
 static void kvmppc_xive_native_release(struct kvm_device *dev)
 {
@ -973,21 +982,18 @@ static void kvmppc_xive_native_release(struct kvm_device *dev)
 	struct kvm *kvm = xive->kvm;
 	struct kvm_vcpu *vcpu;
 	int i;
-	int was_ready;
-
-	debugfs_remove(xive->dentry);

 	pr_devel("Releasing xive native device\n");

 	/*
-	 * Clearing mmu_ready temporarily while holding kvm->lock
-	 * is a way of ensuring that no vcpus can enter the guest
-	 * until we drop kvm->lock.  Doing kick_all_cpus_sync()
-	 * ensures that any vcpu executing inside the guest has
-	 * exited the guest.  Once kick_all_cpus_sync() has finished,
-	 * we know that no vcpu can be executing the XIVE push or
-	 * pull code or accessing the XIVE MMIO regions.
-	 *
+	 * Clear the KVM device file address_space which is used to
+	 * unmap the ESB pages when a device is passed-through.
+	 */
+	mutex_lock(&xive->mapping_lock);
+	xive->mapping = NULL;
+	mutex_unlock(&xive->mapping_lock);
+
+	/*
 	 * Since this is the device release function, we know that
 	 * userspace does not have any open fd or mmap referring to
 	 * the device.  Therefore there can not be any of the
@ -996,9 +1002,8 @@ static void kvmppc_xive_native_release(struct kvm_device *dev)
 	 * connect_vcpu and set/clr_mapped functions also cannot
 	 * be being executed.
 	 */
-	was_ready = kvm->arch.mmu_ready;
-	kvm->arch.mmu_ready = 0;
-	kick_all_cpus_sync();
+
+	debugfs_remove(xive->dentry);

 	/*
 	 * We should clean up the vCPU interrupt presenters first.
@ -1007,12 +1012,22 @@ static void kvmppc_xive_native_release(struct kvm_device *dev)
 		/*
 		 * Take vcpu->mutex to ensure that no one_reg get/set ioctl
 		 * (i.e. kvmppc_xive_native_[gs]et_vp) can be being done.
+		 * Holding the vcpu->mutex also means that the vcpu cannot
+		 * be executing the KVM_RUN ioctl, and therefore it cannot
+		 * be executing the XIVE push or pull code or accessing
+		 * the XIVE MMIO regions.
 		 */
 		mutex_lock(&vcpu->mutex);
 		kvmppc_xive_native_cleanup_vcpu(vcpu);
 		mutex_unlock(&vcpu->mutex);
 	}

+	/*
+	 * Now that we have cleared vcpu->arch.xive_vcpu, vcpu->arch.irq_type
+	 * and vcpu->arch.xive_esc_[vr]addr on each vcpu, we are safe
+	 * against xive code getting called during vcpu execution or
+	 * set/get one_reg operations.
+	 */
 	kvm->arch.xive = NULL;

 	for (i = 0; i <= xive->max_sbid; i++) {
@ -1025,8 +1040,6 @@ static void kvmppc_xive_native_release(struct kvm_device *dev)
 	if (xive->vp_base != XIVE_INVALID_VP)
 		xive_native_free_vp_block(xive->vp_base);

-	kvm->arch.mmu_ready = was_ready;
-
 	/*
 	 * A reference of the kvmppc_xive pointer is now kept under
 	 * the xive_devices struct of the machine for reuse. It is
@ -1060,6 +1073,7 @@ static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type)
 	xive->kvm = kvm;
 	kvm->arch.xive = xive;
 	mutex_init(&xive->mapping_lock);
+	mutex_init(&xive->lock);

 	/*
 	 * Allocate a bunch of VPs. KVM_MAX_VCPUS is a large value for
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@ -657,6 +657,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_MAX_VCPUS:
 		r = KVM_MAX_VCPUS;
 		break;
+	case KVM_CAP_MAX_VCPU_ID:
+		r = KVM_MAX_VCPU_ID;
+		break;
 #ifdef CONFIG_PPC_BOOK3S_64
 	case KVM_CAP_PPC_GET_SMMU_INFO:
 		r = 1;
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@ -539,6 +539,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		break;
 	case KVM_CAP_NR_VCPUS:
 	case KVM_CAP_MAX_VCPUS:
+	case KVM_CAP_MAX_VCPU_ID:
 		r = KVM_S390_BSCA_CPU_SLOTS;
 		if (!kvm_s390_use_sca_entries())
 			r = KVM_MAX_VCPUS;
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@ -3122,6 +3122,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_MAX_VCPUS:
 		r = KVM_MAX_VCPUS;
 		break;
+	case KVM_CAP_MAX_VCPU_ID:
+		r = KVM_MAX_VCPU_ID;
+		break;
 	case KVM_CAP_PV_MMU:	/* obsolete */
 		r = 0;
 		break;
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@ -224,6 +224,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_MAX_VCPUS:
 		r = KVM_MAX_VCPUS;
 		break;
+	case KVM_CAP_MAX_VCPU_ID:
+		r = KVM_MAX_VCPU_ID;
+		break;
 	case KVM_CAP_MSI_DEVID:
 		if (!kvm)
 			r = -EINVAL;
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@ -1795,8 +1795,10 @@ void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map,

 	if (map->page)
 		kunmap(map->page);
+#ifdef CONFIG_HAS_IOMEM
 	else
 		memunmap(map->hva);
+#endif

 	if (dirty) {
 		kvm_vcpu_mark_page_dirty(vcpu, map->gfn);
@ -3149,8 +3151,6 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 	case KVM_CAP_MULTI_ADDRESS_SPACE:
 		return KVM_ADDRESS_SPACE_NUM;
 #endif
-	case KVM_CAP_MAX_VCPU_ID:
-		return KVM_MAX_VCPU_ID;
 	case KVM_CAP_NR_MEMSLOTS:
 		return KVM_USER_MEM_SLOTS;
 	default: