From e43a028752fed049e4bd94ef895542f96d79fa74 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Sat, 6 Oct 2012 03:56:35 +0200 Subject: [PATCH 01/13] KVM: PPC: 44x: fix DCR read/write When remembering the direction of a DCR transaction, we should write to the same variable that we later interpret when vcpu_run is executed again. Signed-off-by: Alexander Graf Cc: stable@vger.kernel.org --- arch/powerpc/kvm/44x_emulate.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/kvm/44x_emulate.c b/arch/powerpc/kvm/44x_emulate.c index 1a793c4c4a67..35ec0a8547da 100644 --- a/arch/powerpc/kvm/44x_emulate.c +++ b/arch/powerpc/kvm/44x_emulate.c @@ -46,6 +46,7 @@ static int emulate_mtdcr(struct kvm_vcpu *vcpu, int rs, int dcrn) vcpu->run->dcr.dcrn = dcrn; vcpu->run->dcr.data = kvmppc_get_gpr(vcpu, rs); vcpu->run->dcr.is_write = 1; + vcpu->arch.dcr_is_write = 1; vcpu->arch.dcr_needed = 1; kvmppc_account_exit(vcpu, DCR_EXITS); return EMULATE_DO_DCR; @@ -80,6 +81,7 @@ static int emulate_mfdcr(struct kvm_vcpu *vcpu, int rt, int dcrn) vcpu->run->dcr.dcrn = dcrn; vcpu->run->dcr.data = 0; vcpu->run->dcr.is_write = 0; + vcpu->arch.dcr_is_write = 0; vcpu->arch.io_gpr = rt; vcpu->arch.dcr_needed = 1; kvmppc_account_exit(vcpu, DCR_EXITS); From 686de182a217d315e50fef74ba5e7bcd7ea6c246 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Sun, 7 Oct 2012 15:22:59 +0200 Subject: [PATCH 02/13] KVM: Documentation: Fix reentry-to-be-consistent paragraph All instruction emulation that is offloaded to user space needs to reenter KVM to produce consistent guest state again. Fix this section of the documentation to mention all of these exit types. Signed-off-by: Alexander Graf --- Documentation/virtual/kvm/api.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 4258180b1ecd..6671fdc0afb1 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2183,7 +2183,8 @@ executed a memory-mapped I/O instruction which could not be satisfied by kvm. The 'data' member contains the written data if 'is_write' is true, and should be filled by application code otherwise. -NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO and KVM_EXIT_OSI, the corresponding +NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_DCR + and KVM_EXIT_PAPR the corresponding operations are complete (and guest state is consistent) only after userspace has re-entered the kernel with KVM_RUN. The kernel side will first finish incomplete operations and then check for pending signals. Userspace From 388cf9ee3c751c3a4cf8776987143354d6d8c797 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Sat, 6 Oct 2012 23:19:01 +0200 Subject: [PATCH 03/13] KVM: PPC: Move mtspr/mfspr emulation into own functions The mtspr/mfspr emulation code became quite big over time. Move it into its own functions so things stay more readable.
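As a rough sketch of the resulting shape (distilled from the hunks below, not a complete listing), the outer instruction switch is reduced to two thin dispatch arms:

	case OP_31_XOP_MFSPR:
		emulated = kvmppc_emulate_mfspr(vcpu, sprn, rt);
		break;

	case OP_31_XOP_MTSPR:
		emulated = kvmppc_emulate_mtspr(vcpu, sprn, rs);
		break;

Each helper does its own kvmppc_set_exit_type() accounting, and kvmppc_emulate_mfspr() writes the target GPR only when emulation succeeded (EMULATE_DONE), so the dispatch arms carry no SPR-specific logic.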
Signed-off-by: Alexander Graf --- arch/powerpc/kvm/emulate.c | 221 ++++++++++++++++++++----------------- 1 file changed, 121 insertions(+), 100 deletions(-) diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index ee04abaefe23..b0855e5d8905 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -131,6 +131,125 @@ u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb) return vcpu->arch.dec - jd; } +static int kvmppc_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) +{ + enum emulation_result emulated = EMULATE_DONE; + ulong spr_val = kvmppc_get_gpr(vcpu, rs); + + switch (sprn) { + case SPRN_SRR0: + vcpu->arch.shared->srr0 = spr_val; + break; + case SPRN_SRR1: + vcpu->arch.shared->srr1 = spr_val; + break; + + /* XXX We need to context-switch the timebase for + * watchdog and FIT. */ + case SPRN_TBWL: break; + case SPRN_TBWU: break; + + case SPRN_MSSSR0: break; + + case SPRN_DEC: + vcpu->arch.dec = spr_val; + kvmppc_emulate_dec(vcpu); + break; + + case SPRN_SPRG0: + vcpu->arch.shared->sprg0 = spr_val; + break; + case SPRN_SPRG1: + vcpu->arch.shared->sprg1 = spr_val; + break; + case SPRN_SPRG2: + vcpu->arch.shared->sprg2 = spr_val; + break; + case SPRN_SPRG3: + vcpu->arch.shared->sprg3 = spr_val; + break; + + default: + emulated = kvmppc_core_emulate_mtspr(vcpu, sprn, + spr_val); + if (emulated == EMULATE_FAIL) + printk(KERN_INFO "mtspr: unknown spr " + "0x%x\n", sprn); + break; + } + + kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS); + + return emulated; +} + +static int kvmppc_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) +{ + enum emulation_result emulated = EMULATE_DONE; + ulong spr_val = 0; + + switch (sprn) { + case SPRN_SRR0: + spr_val = vcpu->arch.shared->srr0; + break; + case SPRN_SRR1: + spr_val = vcpu->arch.shared->srr1; + break; + case SPRN_PVR: + spr_val = vcpu->arch.pvr; + break; + case SPRN_PIR: + spr_val = vcpu->vcpu_id; + break; + case SPRN_MSSSR0: + spr_val = 0; + break; + + /* Note: mftb and TBRL/TBWL are user-accessible, so + * the guest can always access the real TB anyways. + * In fact, we probably will never see these traps. */ + case SPRN_TBWL: + spr_val = get_tb() >> 32; + break; + case SPRN_TBWU: + spr_val = get_tb(); + break; + + case SPRN_SPRG0: + spr_val = vcpu->arch.shared->sprg0; + break; + case SPRN_SPRG1: + spr_val = vcpu->arch.shared->sprg1; + break; + case SPRN_SPRG2: + spr_val = vcpu->arch.shared->sprg2; + break; + case SPRN_SPRG3: + spr_val = vcpu->arch.shared->sprg3; + break; + /* Note: SPRG4-7 are user-readable, so we don't get + * a trap. 
*/ + + case SPRN_DEC: + spr_val = kvmppc_get_dec(vcpu, get_tb()); + break; + default: + emulated = kvmppc_core_emulate_mfspr(vcpu, sprn, + &spr_val); + if (unlikely(emulated == EMULATE_FAIL)) { + printk(KERN_INFO "mfspr: unknown spr " + "0x%x\n", sprn); + } + break; + } + + if (emulated == EMULATE_DONE) + kvmppc_set_gpr(vcpu, rt, spr_val); + kvmppc_set_exit_type(vcpu, EMULATED_MFSPR_EXITS); + + return emulated; +} + /* XXX to do: * lhax * lhaux @@ -156,7 +275,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) int sprn = get_sprn(inst); enum emulation_result emulated = EMULATE_DONE; int advance = 1; - ulong spr_val = 0; /* this default type might be overwritten by subcategories */ kvmppc_set_exit_type(vcpu, EMULATED_INST_EXITS); @@ -236,62 +354,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) break; case OP_31_XOP_MFSPR: - switch (sprn) { - case SPRN_SRR0: - spr_val = vcpu->arch.shared->srr0; - break; - case SPRN_SRR1: - spr_val = vcpu->arch.shared->srr1; - break; - case SPRN_PVR: - spr_val = vcpu->arch.pvr; - break; - case SPRN_PIR: - spr_val = vcpu->vcpu_id; - break; - case SPRN_MSSSR0: - spr_val = 0; - break; - - /* Note: mftb and TBRL/TBWL are user-accessible, so - * the guest can always access the real TB anyways. - * In fact, we probably will never see these traps. */ - case SPRN_TBWL: - spr_val = get_tb() >> 32; - break; - case SPRN_TBWU: - spr_val = get_tb(); - break; - - case SPRN_SPRG0: - spr_val = vcpu->arch.shared->sprg0; - break; - case SPRN_SPRG1: - spr_val = vcpu->arch.shared->sprg1; - break; - case SPRN_SPRG2: - spr_val = vcpu->arch.shared->sprg2; - break; - case SPRN_SPRG3: - spr_val = vcpu->arch.shared->sprg3; - break; - /* Note: SPRG4-7 are user-readable, so we don't get - * a trap. */ - - case SPRN_DEC: - spr_val = kvmppc_get_dec(vcpu, get_tb()); - break; - default: - emulated = kvmppc_core_emulate_mfspr(vcpu, sprn, - &spr_val); - if (unlikely(emulated == EMULATE_FAIL)) { - printk(KERN_INFO "mfspr: unknown spr " - "0x%x\n", sprn); - } - break; - } - kvmppc_set_gpr(vcpu, rt, spr_val); - kvmppc_set_exit_type(vcpu, EMULATED_MFSPR_EXITS); + emulated = kvmppc_emulate_mfspr(vcpu, sprn, rt); break; case OP_31_XOP_STHX: @@ -308,49 +371,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) break; case OP_31_XOP_MTSPR: - spr_val = kvmppc_get_gpr(vcpu, rs); - switch (sprn) { - case SPRN_SRR0: - vcpu->arch.shared->srr0 = spr_val; - break; - case SPRN_SRR1: - vcpu->arch.shared->srr1 = spr_val; - break; - - /* XXX We need to context-switch the timebase for - * watchdog and FIT. 
*/ - case SPRN_TBWL: break; - case SPRN_TBWU: break; - - case SPRN_MSSSR0: break; - - case SPRN_DEC: - vcpu->arch.dec = spr_val; - kvmppc_emulate_dec(vcpu); - break; - - case SPRN_SPRG0: - vcpu->arch.shared->sprg0 = spr_val; - break; - case SPRN_SPRG1: - vcpu->arch.shared->sprg1 = spr_val; - break; - case SPRN_SPRG2: - vcpu->arch.shared->sprg2 = spr_val; - break; - case SPRN_SPRG3: - vcpu->arch.shared->sprg3 = spr_val; - break; - - default: - emulated = kvmppc_core_emulate_mtspr(vcpu, sprn, - spr_val); - if (emulated == EMULATE_FAIL) - printk(KERN_INFO "mtspr: unknown spr " - "0x%x\n", sprn); - break; - } - kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS); + emulated = kvmppc_emulate_mtspr(vcpu, sprn, rs); break; case OP_31_XOP_DCBI: From c99ec973a63e2249020d6d93a46d7572432da6a2 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Sat, 27 Oct 2012 19:26:14 +0200 Subject: [PATCH 04/13] PPC: ePAPR: Convert header to uapi The new uapi framework splits kernel internal and user space exported bits of header files more cleanly. Adjust the ePAPR header accordingly. Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/Kbuild | 1 - arch/powerpc/include/asm/epapr_hcalls.h | 55 +---------- arch/powerpc/include/uapi/asm/Kbuild | 1 + arch/powerpc/include/uapi/asm/epapr_hcalls.h | 98 ++++++++++++++++++++ 4 files changed, 100 insertions(+), 55 deletions(-) create mode 100644 arch/powerpc/include/uapi/asm/epapr_hcalls.h diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild index 13d6b7bf3b69..7e313f1ed183 100644 --- a/arch/powerpc/include/asm/Kbuild +++ b/arch/powerpc/include/asm/Kbuild @@ -34,6 +34,5 @@ header-y += termios.h header-y += types.h header-y += ucontext.h header-y += unistd.h -header-y += epapr_hcalls.h generic-y += rwsem.h diff --git a/arch/powerpc/include/asm/epapr_hcalls.h b/arch/powerpc/include/asm/epapr_hcalls.h index b8d94459a929..58997afcd085 100644 --- a/arch/powerpc/include/asm/epapr_hcalls.h +++ b/arch/powerpc/include/asm/epapr_hcalls.h @@ -50,60 +50,7 @@ #ifndef _EPAPR_HCALLS_H #define _EPAPR_HCALLS_H -#define EV_BYTE_CHANNEL_SEND 1 -#define EV_BYTE_CHANNEL_RECEIVE 2 -#define EV_BYTE_CHANNEL_POLL 3 -#define EV_INT_SET_CONFIG 4 -#define EV_INT_GET_CONFIG 5 -#define EV_INT_SET_MASK 6 -#define EV_INT_GET_MASK 7 -#define EV_INT_IACK 9 -#define EV_INT_EOI 10 -#define EV_INT_SEND_IPI 11 -#define EV_INT_SET_TASK_PRIORITY 12 -#define EV_INT_GET_TASK_PRIORITY 13 -#define EV_DOORBELL_SEND 14 -#define EV_MSGSND 15 -#define EV_IDLE 16 - -/* vendor ID: epapr */ -#define EV_LOCAL_VENDOR_ID 0 /* for private use */ -#define EV_EPAPR_VENDOR_ID 1 -#define EV_FSL_VENDOR_ID 2 /* Freescale Semiconductor */ -#define EV_IBM_VENDOR_ID 3 /* IBM */ -#define EV_GHS_VENDOR_ID 4 /* Green Hills Software */ -#define EV_ENEA_VENDOR_ID 5 /* Enea */ -#define EV_WR_VENDOR_ID 6 /* Wind River Systems */ -#define EV_AMCC_VENDOR_ID 7 /* Applied Micro Circuits */ -#define EV_KVM_VENDOR_ID 42 /* KVM */ - -/* The max number of bytes that a byte channel can send or receive per call */ -#define EV_BYTE_CHANNEL_MAX_BYTES 16 - - -#define _EV_HCALL_TOKEN(id, num) (((id) << 16) | (num)) -#define EV_HCALL_TOKEN(hcall_num) _EV_HCALL_TOKEN(EV_EPAPR_VENDOR_ID, hcall_num) - -/* epapr return codes */ -#define EV_SUCCESS 0 -#define EV_EPERM 1 /* Operation not permitted */ -#define EV_ENOENT 2 /* Entry Not Found */ -#define EV_EIO 3 /* I/O error occured */ -#define EV_EAGAIN 4 /* The operation had insufficient - * resources to complete and should be - * retried - */ -#define EV_ENOMEM 5 /* There was 
insufficient memory to - * complete the operation */ -#define EV_EFAULT 6 /* Bad guest address */ -#define EV_ENODEV 7 /* No such device */ -#define EV_EINVAL 8 /* An argument supplied to the hcall - was out of range or invalid */ -#define EV_INTERNAL 9 /* An internal error occured */ -#define EV_CONFIG 10 /* A configuration error was detected */ -#define EV_INVALID_STATE 11 /* The object is in an invalid state */ -#define EV_UNIMPLEMENTED 12 /* Unimplemented hypercall */ -#define EV_BUFFER_OVERFLOW 13 /* Caller-supplied buffer too small */ +#include #ifndef __ASSEMBLY__ #include diff --git a/arch/powerpc/include/uapi/asm/Kbuild b/arch/powerpc/include/uapi/asm/Kbuild index baebb3da1d44..e6b5be86e4fa 100644 --- a/arch/powerpc/include/uapi/asm/Kbuild +++ b/arch/powerpc/include/uapi/asm/Kbuild @@ -1,3 +1,4 @@ # UAPI Header export list include include/uapi/asm-generic/Kbuild.asm +header-y += epapr_hcalls.h diff --git a/arch/powerpc/include/uapi/asm/epapr_hcalls.h b/arch/powerpc/include/uapi/asm/epapr_hcalls.h new file mode 100644 index 000000000000..046c79364f83 --- /dev/null +++ b/arch/powerpc/include/uapi/asm/epapr_hcalls.h @@ -0,0 +1,98 @@ +/* + * ePAPR hcall interface + * + * Copyright 2008-2011 Freescale Semiconductor, Inc. + * + * Author: Timur Tabi + * + * This file is provided under a dual BSD/GPL license. When using or + * redistributing this file, you may do so under either license. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Freescale Semiconductor nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") as published by the Free Software + * Foundation, either version 2 of that License or (at your option) any + * later version. + * + * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _UAPI__EPAPR_HCALLS_H +#define _UAPI__EPAPR_HCALLS_H + +#define EV_BYTE_CHANNEL_SEND 1 +#define EV_BYTE_CHANNEL_RECEIVE 2 +#define EV_BYTE_CHANNEL_POLL 3 +#define EV_INT_SET_CONFIG 4 +#define EV_INT_GET_CONFIG 5 +#define EV_INT_SET_MASK 6 +#define EV_INT_GET_MASK 7 +#define EV_INT_IACK 9 +#define EV_INT_EOI 10 +#define EV_INT_SEND_IPI 11 +#define EV_INT_SET_TASK_PRIORITY 12 +#define EV_INT_GET_TASK_PRIORITY 13 +#define EV_DOORBELL_SEND 14 +#define EV_MSGSND 15 +#define EV_IDLE 16 + +/* vendor ID: epapr */ +#define EV_LOCAL_VENDOR_ID 0 /* for private use */ +#define EV_EPAPR_VENDOR_ID 1 +#define EV_FSL_VENDOR_ID 2 /* Freescale Semiconductor */ +#define EV_IBM_VENDOR_ID 3 /* IBM */ +#define EV_GHS_VENDOR_ID 4 /* Green Hills Software */ +#define EV_ENEA_VENDOR_ID 5 /* Enea */ +#define EV_WR_VENDOR_ID 6 /* Wind River Systems */ +#define EV_AMCC_VENDOR_ID 7 /* Applied Micro Circuits */ +#define EV_KVM_VENDOR_ID 42 /* KVM */ + +/* The max number of bytes that a byte channel can send or receive per call */ +#define EV_BYTE_CHANNEL_MAX_BYTES 16 + + +#define _EV_HCALL_TOKEN(id, num) (((id) << 16) | (num)) +#define EV_HCALL_TOKEN(hcall_num) _EV_HCALL_TOKEN(EV_EPAPR_VENDOR_ID, hcall_num) + +/* epapr return codes */ +#define EV_SUCCESS 0 +#define EV_EPERM 1 /* Operation not permitted */ +#define EV_ENOENT 2 /* Entry Not Found */ +#define EV_EIO 3 /* I/O error occured */ +#define EV_EAGAIN 4 /* The operation had insufficient + * resources to complete and should be + * retried + */ +#define EV_ENOMEM 5 /* There was insufficient memory to + * complete the operation */ +#define EV_EFAULT 6 /* Bad guest address */ +#define EV_ENODEV 7 /* No such device */ +#define EV_EINVAL 8 /* An argument supplied to the hcall + was out of range or invalid */ +#define EV_INTERNAL 9 /* An internal error occured */ +#define EV_CONFIG 10 /* A configuration error was detected */ +#define EV_INVALID_STATE 11 /* The object is in an invalid state */ +#define EV_UNIMPLEMENTED 12 /* Unimplemented hypercall */ +#define EV_BUFFER_OVERFLOW 13 /* Caller-supplied buffer too small */ + +#endif /* _UAPI__EPAPR_HCALLS_H */ From 512691d4907d7cf4b8d05c6f8572d1fa60ccec20 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 15 Oct 2012 01:15:41 +0000 Subject: [PATCH 05/13] KVM: PPC: Book3S HV: Allow KVM guests to stop secondary threads coming online When a Book3S HV KVM guest is running, we need the host to be in single-thread mode, that is, all of the cores (or at least all of the cores where the KVM guest could run) to be running only one active hardware thread. This is because of the hardware restriction in POWER processors that all of the hardware threads in the core must be in the same logical partition. Complying with this restriction is much easier if, from the host kernel's point of view, only one hardware thread is active. This adds two hooks in the SMP hotplug code to allow the KVM code to make sure that secondary threads (i.e. hardware threads other than thread 0) cannot come online while any KVM guest exists. The KVM code still has to check that any core where it runs a guest has the secondary threads offline, but having done that check it can now be sure that they will not come online while the guest is running. 
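A condensed sketch of how the pieces fit together (assembled from the hunks below; threads_per_core == 4 is only an illustrative value for POWER7-style cores): while any VM holds the inhibit count, __cpu_up() refuses to online any CPU that is not thread 0 of its core:

	/* smp.c: cpus 0, 4, 8, ... are thread 0 when threads_per_core == 4 */
	if (threads_per_core > 1 && secondaries_inhibited() &&
	    cpu % threads_per_core != 0)
		return -EBUSY;

The count itself is an atomic_t that is only changed under get_online_cpus()/put_online_cpus(), so it cannot change in the middle of an online or offline operation. KVM then simply pairs the hooks around VM lifetime: inhibit_secondary_onlining() in kvmppc_core_init_vm() and uninhibit_secondary_onlining() in kvmppc_core_destroy_vm().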
Signed-off-by: Paul Mackerras Acked-by: Benjamin Herrenschmidt Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/smp.h | 8 ++++++ arch/powerpc/kernel/smp.c | 46 ++++++++++++++++++++++++++++++++++ arch/powerpc/kvm/book3s_hv.c | 12 +++++++-- 3 files changed, 64 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h index ebc24dc5b1a1..b625a1a9ad16 100644 --- a/arch/powerpc/include/asm/smp.h +++ b/arch/powerpc/include/asm/smp.h @@ -66,6 +66,14 @@ void generic_cpu_die(unsigned int cpu); void generic_mach_cpu_die(void); void generic_set_cpu_dead(unsigned int cpu); int generic_check_cpu_restart(unsigned int cpu); + +extern void inhibit_secondary_onlining(void); +extern void uninhibit_secondary_onlining(void); + +#else /* HOTPLUG_CPU */ +static inline void inhibit_secondary_onlining(void) {} +static inline void uninhibit_secondary_onlining(void) {} + #endif #ifdef CONFIG_PPC64 diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 8d4214afc21d..c4f420c5fc1b 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -417,6 +417,45 @@ int generic_check_cpu_restart(unsigned int cpu) { return per_cpu(cpu_state, cpu) == CPU_UP_PREPARE; } + +static atomic_t secondary_inhibit_count; + +/* + * Don't allow secondary CPU threads to come online + */ +void inhibit_secondary_onlining(void) +{ + /* + * This makes secondary_inhibit_count stable during cpu + * online/offline operations. + */ + get_online_cpus(); + + atomic_inc(&secondary_inhibit_count); + put_online_cpus(); +} +EXPORT_SYMBOL_GPL(inhibit_secondary_onlining); + +/* + * Allow secondary CPU threads to come online again + */ +void uninhibit_secondary_onlining(void) +{ + get_online_cpus(); + atomic_dec(&secondary_inhibit_count); + put_online_cpus(); +} +EXPORT_SYMBOL_GPL(uninhibit_secondary_onlining); + +static int secondaries_inhibited(void) +{ + return atomic_read(&secondary_inhibit_count); +} + +#else /* HOTPLUG_CPU */ + +#define secondaries_inhibited() 0 + #endif static void cpu_idle_thread_init(unsigned int cpu, struct task_struct *idle) @@ -435,6 +474,13 @@ int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle) { int rc, c; + /* + * Don't allow secondary threads to come online if inhibited + */ + if (threads_per_core > 1 && secondaries_inhibited() && + cpu % threads_per_core != 0) + return -EBUSY; + if (smp_ops == NULL || (smp_ops->cpu_bootable && !smp_ops->cpu_bootable(cpu))) return -EINVAL; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 9a15da76e56b..c5ddf048e19e 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -1016,8 +1017,6 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) /* * Make sure we are running on thread 0, and that * secondary threads are offline. - * XXX we should also block attempts to bring any - * secondary threads online. */ if (threads_per_core > 1 && !on_primary_thread()) { list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) @@ -1730,11 +1729,20 @@ int kvmppc_core_init_vm(struct kvm *kvm) kvm->arch.using_mmu_notifiers = !!cpu_has_feature(CPU_FTR_ARCH_206); spin_lock_init(&kvm->arch.slot_phys_lock); + + /* + * Don't allow secondary CPU threads to come online + * while any KVM VMs exist. 
+ */ + inhibit_secondary_onlining(); + return 0; } void kvmppc_core_destroy_vm(struct kvm *kvm) { + uninhibit_secondary_onlining(); + if (kvm->arch.rma) { kvm_release_rma(kvm->arch.rma); kvm->arch.rma = NULL; From 7b444c6710c6c4994e31eb19216ce055836e65c4 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 15 Oct 2012 01:16:14 +0000 Subject: [PATCH 06/13] KVM: PPC: Book3S HV: Fix some races in starting secondary threads Subsequent patches implementing in-kernel XICS emulation will make it possible for IPIs to arrive at secondary threads at arbitrary times. This fixes some races in how we start the secondary threads, which if not fixed could lead to occasional crashes of the host kernel. This makes sure that (a) we have grabbed all the secondary threads, and verified that they are no longer in the kernel, before we start any thread, (b) that the secondary thread loads its vcpu pointer after clearing the IPI that woke it up (so we don't miss a wakeup), and (c) that the secondary thread clears its vcpu pointer before incrementing the nap count. It also removes unnecessary setting of the vcpu and vcore pointers in the paca in kvmppc_core_vcpu_load. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/kvm/book3s_hv.c | 41 +++++++++++++++---------- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 11 +++++-- 2 files changed, 32 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index c5ddf048e19e..77dec0f8a030 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -64,8 +64,6 @@ void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct kvmppc_vcore *vc = vcpu->arch.vcore; - local_paca->kvm_hstate.kvm_vcpu = vcpu; - local_paca->kvm_hstate.kvm_vcore = vc; if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE) vc->stolen_tb += mftb() - vc->preempt_tb; } @@ -880,6 +878,7 @@ static int kvmppc_grab_hwthread(int cpu) /* Ensure the thread won't go into the kernel if it wakes */ tpaca->kvm_hstate.hwthread_req = 1; + tpaca->kvm_hstate.kvm_vcpu = NULL; /* * If the thread is already executing in the kernel (e.g. handling @@ -929,7 +928,6 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu) smp_wmb(); #if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP) if (vcpu->arch.ptid) { - kvmppc_grab_hwthread(cpu); xics_wake_cpu(cpu); ++vc->n_woken; } @@ -955,7 +953,8 @@ static void kvmppc_wait_for_nap(struct kvmppc_vcore *vc) /* * Check that we are on thread 0 and that any other threads in - * this core are off-line. + * this core are off-line. Then grab the threads so they can't + * enter the kernel. */ static int on_primary_thread(void) { @@ -967,6 +966,17 @@ static int on_primary_thread(void) while (++thr < threads_per_core) if (cpu_online(cpu + thr)) return 0; + + /* Grab all hw threads so they can't go into the kernel */ + for (thr = 1; thr < threads_per_core; ++thr) { + if (kvmppc_grab_hwthread(cpu + thr)) { + /* Couldn't grab one; let the others go */ + do { + kvmppc_release_hwthread(cpu + thr); + } while (--thr > 0); + return 0; + } + } return 1; } @@ -1014,16 +1024,6 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) spin_lock(&vc->lock); } - /* - * Make sure we are running on thread 0, and that - * secondary threads are offline. - */ - if (threads_per_core > 1 && !on_primary_thread()) { - list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) - vcpu->arch.ret = -EBUSY; - goto out; - } - /* * Assign physical thread IDs, first to non-ceded vcpus * and then to ceded ones. 
@@ -1043,15 +1043,22 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) if (vcpu->arch.ceded) vcpu->arch.ptid = ptid++; + /* + * Make sure we are running on thread 0, and that + * secondary threads are offline. + */ + if (threads_per_core > 1 && !on_primary_thread()) { + list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) + vcpu->arch.ret = -EBUSY; + goto out; + } + vc->stolen_tb += mftb() - vc->preempt_tb; vc->pcpu = smp_processor_id(); list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { kvmppc_start_thread(vcpu); kvmppc_create_dtl_entry(vcpu, vc); } - /* Grab any remaining hw threads so they can't go into the kernel */ - for (i = ptid; i < threads_per_core; ++i) - kvmppc_grab_hwthread(vc->pcpu + i); preempt_disable(); spin_unlock(&vc->lock); diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 44b72feaff7d..1e90ef6191a3 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -134,8 +134,11 @@ kvm_start_guest: 27: /* XXX should handle hypervisor maintenance interrupts etc. here */ + /* reload vcpu pointer after clearing the IPI */ + ld r4,HSTATE_KVM_VCPU(r13) + cmpdi r4,0 /* if we have no vcpu to run, go back to sleep */ - beq cr1,kvm_no_guest + beq kvm_no_guest /* were we napping due to cede? */ lbz r0,HSTATE_NAPPING(r13) @@ -1587,6 +1590,10 @@ secondary_too_late: .endr secondary_nap: + /* Clear our vcpu pointer so we don't come back in early */ + li r0, 0 + std r0, HSTATE_KVM_VCPU(r13) + lwsync /* Clear any pending IPI - assume we're a secondary thread */ ld r5, HSTATE_XICS_PHYS(r13) li r7, XICS_XIRR @@ -1612,8 +1619,6 @@ secondary_nap: kvm_no_guest: li r0, KVM_HWTHREAD_IN_NAP stb r0, HSTATE_HWTHREAD_STATE(r13) - li r0, 0 - std r0, HSTATE_KVM_VCPU(r13) li r3, LPCR_PECE0 mfspr r4, SPRN_LPCR From 913d3ff9a3c3a13c3115eb4b3265aa35a9e0a7ad Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 15 Oct 2012 01:16:48 +0000 Subject: [PATCH 07/13] KVM: PPC: Book3s HV: Don't access runnable threads list without vcore lock There were a few places where we were traversing the list of runnable threads in a virtual core, i.e. vc->runnable_threads, without holding the vcore spinlock. This extends the places where we hold the vcore spinlock to cover everywhere that we traverse that list. Since we possibly need to sleep inside kvmppc_book3s_hv_page_fault, this moves the call of it from kvmppc_handle_exit out to kvmppc_vcpu_run, where we don't hold the vcore lock. In kvmppc_vcore_blocked, we don't actually need to check whether all vcpus are ceded and don't have any pending exceptions, since the caller has already done that. The caller (kvmppc_run_vcpu) wasn't actually checking for pending exceptions, so we add that. The change of if to while in kvmppc_run_vcpu is to make sure that we never call kvmppc_remove_runnable() when the vcore state is RUNNING or EXITING. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/kvm_asm.h | 1 + arch/powerpc/kvm/book3s_hv.c | 67 +++++++++++++++--------------- 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h index 76fdcfef0889..aabcdba8f6b0 100644 --- a/arch/powerpc/include/asm/kvm_asm.h +++ b/arch/powerpc/include/asm/kvm_asm.h @@ -118,6 +118,7 @@ #define RESUME_FLAG_NV (1<<0) /* Reload guest nonvolatile state? */ #define RESUME_FLAG_HOST (1<<1) /* Resume host? 
*/ +#define RESUME_FLAG_ARCH1 (1<<2) #define RESUME_GUEST 0 #define RESUME_GUEST_NV RESUME_FLAG_NV diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 77dec0f8a030..3a737a4bb8bf 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -57,6 +57,9 @@ /* #define EXIT_DEBUG_SIMPLE */ /* #define EXIT_DEBUG_INT */ +/* Used to indicate that a guest page fault needs to be handled */ +#define RESUME_PAGE_FAULT (RESUME_GUEST | RESUME_FLAG_ARCH1) + static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); @@ -431,7 +434,6 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, struct task_struct *tsk) { int r = RESUME_HOST; - int srcu_idx; vcpu->stat.sum_exits++; @@ -491,16 +493,12 @@ static int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, * have been handled already. */ case BOOK3S_INTERRUPT_H_DATA_STORAGE: - srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - r = kvmppc_book3s_hv_page_fault(run, vcpu, - vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); - srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); + r = RESUME_PAGE_FAULT; break; case BOOK3S_INTERRUPT_H_INST_STORAGE: - srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - r = kvmppc_book3s_hv_page_fault(run, vcpu, - kvmppc_get_pc(vcpu), 0); - srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); + vcpu->arch.fault_dar = kvmppc_get_pc(vcpu); + vcpu->arch.fault_dsisr = 0; + r = RESUME_PAGE_FAULT; break; /* * This occurs if the guest executes an illegal instruction. @@ -984,22 +982,24 @@ static int on_primary_thread(void) * Run a set of guest threads on a physical core. * Called with vc->lock held. */ -static int kvmppc_run_core(struct kvmppc_vcore *vc) +static void kvmppc_run_core(struct kvmppc_vcore *vc) { struct kvm_vcpu *vcpu, *vcpu0, *vnext; long ret; u64 now; int ptid, i, need_vpa_update; int srcu_idx; + struct kvm_vcpu *vcpus_to_update[threads_per_core]; /* don't start if any threads have a signal pending */ need_vpa_update = 0; list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { if (signal_pending(vcpu->arch.run_task)) - return 0; - need_vpa_update |= vcpu->arch.vpa.update_pending | - vcpu->arch.slb_shadow.update_pending | - vcpu->arch.dtl.update_pending; + return; + if (vcpu->arch.vpa.update_pending || + vcpu->arch.slb_shadow.update_pending || + vcpu->arch.dtl.update_pending) + vcpus_to_update[need_vpa_update++] = vcpu; } /* @@ -1019,8 +1019,8 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) */ if (need_vpa_update) { spin_unlock(&vc->lock); - list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) - kvmppc_update_vpas(vcpu); + for (i = 0; i < need_vpa_update; ++i) + kvmppc_update_vpas(vcpus_to_update[i]); spin_lock(&vc->lock); } @@ -1037,8 +1037,10 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) vcpu->arch.ptid = ptid++; } } - if (!vcpu0) - return 0; /* nothing to run */ + if (!vcpu0) { + vc->vcore_state = VCORE_INACTIVE; + return; /* nothing to run; should never happen */ + } list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) if (vcpu->arch.ceded) vcpu->arch.ptid = ptid++; @@ -1091,6 +1093,7 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) preempt_enable(); kvm_resched(vcpu); + spin_lock(&vc->lock); now = get_tb(); list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { /* cancel pending dec exception if dec is positive */ @@ -1114,7 +1117,6 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) } } - spin_lock(&vc->lock); out: vc->vcore_state = 
VCORE_INACTIVE; vc->preempt_tb = mftb(); @@ -1125,8 +1127,6 @@ static int kvmppc_run_core(struct kvmppc_vcore *vc) wake_up(&vcpu->arch.cpu_run); } } - - return 1; } /* @@ -1150,20 +1150,11 @@ static void kvmppc_wait_for_exec(struct kvm_vcpu *vcpu, int wait_state) static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) { DEFINE_WAIT(wait); - struct kvm_vcpu *v; - int all_idle = 1; prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE); vc->vcore_state = VCORE_SLEEPING; spin_unlock(&vc->lock); - list_for_each_entry(v, &vc->runnable_threads, arch.run_list) { - if (!v->arch.ceded || v->arch.pending_exceptions) { - all_idle = 0; - break; - } - } - if (all_idle) - schedule(); + schedule(); finish_wait(&vc->wq, &wait); spin_lock(&vc->lock); vc->vcore_state = VCORE_INACTIVE; @@ -1219,7 +1210,8 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) vc->runner = vcpu; n_ceded = 0; list_for_each_entry(v, &vc->runnable_threads, arch.run_list) - n_ceded += v->arch.ceded; + if (!v->arch.pending_exceptions) + n_ceded += v->arch.ceded; if (n_ceded == vc->n_runnable) kvmppc_vcore_blocked(vc); else @@ -1240,8 +1232,9 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) } if (signal_pending(current)) { - if (vc->vcore_state == VCORE_RUNNING || - vc->vcore_state == VCORE_EXITING) { + while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE && + (vc->vcore_state == VCORE_RUNNING || + vc->vcore_state == VCORE_EXITING)) { spin_unlock(&vc->lock); kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE); spin_lock(&vc->lock); @@ -1261,6 +1254,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) { int r; + int srcu_idx; if (!vcpu->arch.sane) { run->exit_reason = KVM_EXIT_INTERNAL_ERROR; @@ -1299,6 +1293,11 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) !(vcpu->arch.shregs.msr & MSR_PR)) { r = kvmppc_pseries_do_hcall(vcpu); kvmppc_core_prepare_to_enter(vcpu); + } else if (r == RESUME_PAGE_FAULT) { + srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + r = kvmppc_book3s_hv_page_fault(run, vcpu, + vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); + srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); } } while (r == RESUME_GUEST); From 2f12f03436847e063cda8cc4c339ad84961cbf39 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 15 Oct 2012 01:17:17 +0000 Subject: [PATCH 08/13] KVM: PPC: Book3S HV: Fixes for late-joining threads If a thread in a virtual core becomes runnable while other threads in the same virtual core are already running in the guest, it is possible for the latecomer to join the others on the core without first pulling them all out of the guest. Currently this only happens rarely, when a vcpu is first started. This fixes some bugs and omissions in the code in this case. First, we need to check for VPA updates for the latecomer and make a DTL entry for it. Secondly, if it comes along while the master vcpu is doing a VPA update, we don't need to do anything since the master will pick it up in kvmppc_run_core. To handle this correctly we introduce a new vcore state, VCORE_STARTING. Thirdly, there is a race because we currently clear the hardware thread's hwthread_req before waiting to see it get to nap. A latecomer thread could have its hwthread_req cleared before it gets to test it, and therefore never increment the nap_count, leading to messages about wait_for_nap timeouts. 
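A short sketch of the latecomer path after this change (mirroring the hunks below): kvmppc_run_core() now publishes VCORE_STARTING first and flips to VCORE_RUNNING only once the VPA updates and DTL entries are done, so a vcpu that observes VCORE_RUNNING can join directly:

	/* kvmppc_run_vcpu(): joining a vcore that is already in the guest */
	if (vc->vcore_state == VCORE_RUNNING && VCORE_EXIT_COUNT(vc) == 0) {
		vcpu->arch.ptid = vc->n_runnable - 1;
		kvmppc_create_dtl_entry(vcpu, vc);	/* new: account the latecomer */
		kvmppc_start_thread(vcpu);
	}

The hwthread_req race is closed by releasing the hardware threads only after the nap count has been observed to catch up with n_woken.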
Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/kvm_host.h | 7 ++++--- arch/powerpc/kvm/book3s_hv.c | 14 +++++++++++--- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 68f5a308737a..218534d46ae9 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -289,9 +289,10 @@ struct kvmppc_vcore { /* Values for vcore_state */ #define VCORE_INACTIVE 0 -#define VCORE_RUNNING 1 -#define VCORE_EXITING 2 -#define VCORE_SLEEPING 3 +#define VCORE_SLEEPING 1 +#define VCORE_STARTING 2 +#define VCORE_RUNNING 3 +#define VCORE_EXITING 4 /* * Struct used to manage memory for a virtual processor area diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 3a737a4bb8bf..89995fa6e945 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -336,6 +336,11 @@ static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap) static void kvmppc_update_vpas(struct kvm_vcpu *vcpu) { + if (!(vcpu->arch.vpa.update_pending || + vcpu->arch.slb_shadow.update_pending || + vcpu->arch.dtl.update_pending)) + return; + spin_lock(&vcpu->arch.vpa_update_lock); if (vcpu->arch.vpa.update_pending) { kvmppc_update_vpa(vcpu, &vcpu->arch.vpa); @@ -1009,7 +1014,7 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc) vc->n_woken = 0; vc->nap_count = 0; vc->entry_exit_count = 0; - vc->vcore_state = VCORE_RUNNING; + vc->vcore_state = VCORE_STARTING; vc->in_guest = 0; vc->napping_threads = 0; @@ -1062,6 +1067,7 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc) kvmppc_create_dtl_entry(vcpu, vc); } + vc->vcore_state = VCORE_RUNNING; preempt_disable(); spin_unlock(&vc->lock); @@ -1070,8 +1076,6 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc) srcu_idx = srcu_read_lock(&vcpu0->kvm->srcu); __kvmppc_vcore_entry(NULL, vcpu0); - for (i = 0; i < threads_per_core; ++i) - kvmppc_release_hwthread(vc->pcpu + i); spin_lock(&vc->lock); /* disable sending of IPIs on virtual external irqs */ @@ -1080,6 +1084,8 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc) /* wait for secondary threads to finish writing their state to memory */ if (vc->nap_count < vc->n_woken) kvmppc_wait_for_nap(vc); + for (i = 0; i < threads_per_core; ++i) + kvmppc_release_hwthread(vc->pcpu + i); /* prevent other vcpu threads from doing kvmppc_start_thread() now */ vc->vcore_state = VCORE_EXITING; spin_unlock(&vc->lock); @@ -1170,6 +1176,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) kvm_run->exit_reason = 0; vcpu->arch.ret = RESUME_GUEST; vcpu->arch.trap = 0; + kvmppc_update_vpas(vcpu); /* * Synchronize with other threads in this virtual core @@ -1193,6 +1200,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) if (vc->vcore_state == VCORE_RUNNING && VCORE_EXIT_COUNT(vc) == 0) { vcpu->arch.ptid = vc->n_runnable - 1; + kvmppc_create_dtl_entry(vcpu, vc); kvmppc_start_thread(vcpu); } From 8455d79e2163997e479931b8d5b7e60a92cd2b86 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 15 Oct 2012 01:17:42 +0000 Subject: [PATCH 09/13] KVM: PPC: Book3S HV: Run virtual core whenever any vcpus in it can run Currently the Book3S HV code implements a policy on multi-threaded processors (i.e. POWER7) that requires all of the active vcpus in a virtual core to be ready to run before we run the virtual core. 
However, that causes problems on reset, because reset stops all vcpus except vcpu 0, and can also reduce throughput since all four threads in a virtual core have to wait whenever any one of them hits a hypervisor page fault. This relaxes the policy, allowing the virtual core to run as soon as any vcpu in it is runnable. With this, the KVMPPC_VCPU_STOPPED state and the KVMPPC_VCPU_BUSY_IN_HOST state have been combined into a single KVMPPC_VCPU_NOTREADY state, since we no longer need to distinguish between them. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/kvm_host.h | 5 +- arch/powerpc/kvm/book3s_hv.c | 74 +++++++++++++++-------------- 2 files changed, 40 insertions(+), 39 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 218534d46ae9..1e8cbd1299fc 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -563,9 +563,8 @@ struct kvm_vcpu_arch { }; /* Values for vcpu->arch.state */ -#define KVMPPC_VCPU_STOPPED 0 -#define KVMPPC_VCPU_BUSY_IN_HOST 1 -#define KVMPPC_VCPU_RUNNABLE 2 +#define KVMPPC_VCPU_NOTREADY 0 +#define KVMPPC_VCPU_RUNNABLE 1 /* Values for vcpu->arch.io_gpr */ #define KVM_MMIO_REG_MASK 0x001f diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 89995fa6e945..61d293465e81 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -776,10 +776,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) kvmppc_mmu_book3s_hv_init(vcpu); - /* - * We consider the vcpu stopped until we see the first run ioctl for it. - */ - vcpu->arch.state = KVMPPC_VCPU_STOPPED; + vcpu->arch.state = KVMPPC_VCPU_NOTREADY; init_waitqueue_head(&vcpu->arch.cpu_run); @@ -866,9 +863,8 @@ static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, { if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE) return; - vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; + vcpu->arch.state = KVMPPC_VCPU_NOTREADY; --vc->n_runnable; - ++vc->n_busy; list_del(&vcpu->arch.run_list); } @@ -1169,7 +1165,6 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { int n_ceded; - int prev_state; struct kvmppc_vcore *vc; struct kvm_vcpu *v, *vn; @@ -1186,7 +1181,6 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) vcpu->arch.ceded = 0; vcpu->arch.run_task = current; vcpu->arch.kvm_run = kvm_run; - prev_state = vcpu->arch.state; vcpu->arch.state = KVMPPC_VCPU_RUNNABLE; list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads); ++vc->n_runnable; @@ -1196,35 +1190,26 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) * If the vcore is already running, we may be able to start * this thread straight away and have it join in. 
*/ - if (prev_state == KVMPPC_VCPU_STOPPED) { + if (!signal_pending(current)) { if (vc->vcore_state == VCORE_RUNNING && VCORE_EXIT_COUNT(vc) == 0) { vcpu->arch.ptid = vc->n_runnable - 1; kvmppc_create_dtl_entry(vcpu, vc); kvmppc_start_thread(vcpu); + } else if (vc->vcore_state == VCORE_SLEEPING) { + wake_up(&vc->wq); } - } else if (prev_state == KVMPPC_VCPU_BUSY_IN_HOST) - --vc->n_busy; + } while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE && !signal_pending(current)) { - if (vc->n_busy || vc->vcore_state != VCORE_INACTIVE) { + if (vc->vcore_state != VCORE_INACTIVE) { spin_unlock(&vc->lock); kvmppc_wait_for_exec(vcpu, TASK_INTERRUPTIBLE); spin_lock(&vc->lock); continue; } - vc->runner = vcpu; - n_ceded = 0; - list_for_each_entry(v, &vc->runnable_threads, arch.run_list) - if (!v->arch.pending_exceptions) - n_ceded += v->arch.ceded; - if (n_ceded == vc->n_runnable) - kvmppc_vcore_blocked(vc); - else - kvmppc_run_core(vc); - list_for_each_entry_safe(v, vn, &vc->runnable_threads, arch.run_list) { kvmppc_core_prepare_to_enter(v); @@ -1236,23 +1221,40 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) wake_up(&v->arch.cpu_run); } } + if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE) + break; + vc->runner = vcpu; + n_ceded = 0; + list_for_each_entry(v, &vc->runnable_threads, arch.run_list) + if (!v->arch.pending_exceptions) + n_ceded += v->arch.ceded; + if (n_ceded == vc->n_runnable) + kvmppc_vcore_blocked(vc); + else + kvmppc_run_core(vc); vc->runner = NULL; } - if (signal_pending(current)) { - while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE && - (vc->vcore_state == VCORE_RUNNING || - vc->vcore_state == VCORE_EXITING)) { - spin_unlock(&vc->lock); - kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE); - spin_lock(&vc->lock); - } - if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) { - kvmppc_remove_runnable(vc, vcpu); - vcpu->stat.signal_exits++; - kvm_run->exit_reason = KVM_EXIT_INTR; - vcpu->arch.ret = -EINTR; - } + while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE && + (vc->vcore_state == VCORE_RUNNING || + vc->vcore_state == VCORE_EXITING)) { + spin_unlock(&vc->lock); + kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE); + spin_lock(&vc->lock); + } + + if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) { + kvmppc_remove_runnable(vc, vcpu); + vcpu->stat.signal_exits++; + kvm_run->exit_reason = KVM_EXIT_INTR; + vcpu->arch.ret = -EINTR; + } + + if (vc->n_runnable && vc->vcore_state == VCORE_INACTIVE) { + /* Wake up some vcpu to run the core */ + v = list_first_entry(&vc->runnable_threads, + struct kvm_vcpu, arch.run_list); + wake_up(&v->arch.cpu_run); } spin_unlock(&vc->lock); From c7b676709c163e12ec161c0593c2c76809c25ff4 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 15 Oct 2012 01:18:07 +0000 Subject: [PATCH 10/13] KVM: PPC: Book3S HV: Fix accounting of stolen time Currently the code that accounts stolen time tends to overestimate the stolen time, and will sometimes report more stolen time in a DTL (dispatch trace log) entry than has elapsed since the last DTL entry. This can cause guests to underflow the user or system time measured for some tasks, leading to ridiculous CPU percentages and total runtimes being reported by top and other utilities. In addition, the current code was designed for the previous policy where a vcore would only run when all the vcpus in it were runnable, and so only counted stolen time on a per-vcore basis. Now that a vcore can run while some of the vcpus in it are doing other things in the kernel (e.g. 
handling a page fault), we need to count the time when a vcpu task is preempted while it is not running as part of a vcore as stolen also. To do this, we bring back the BUSY_IN_HOST vcpu state and extend the vcpu_load/put functions to count preemption time while the vcpu is in that state. Handling the transitions between the RUNNING and BUSY_IN_HOST states requires checking and updating two variables (accumulated time stolen and time last preempted), so we add a new spinlock, vcpu->arch.tbacct_lock. This protects both the per-vcpu stolen/preempt-time variables, and the per-vcore variables while this vcpu is running the vcore. Finally, we now don't count time spent in userspace as stolen time. The task could be executing in userspace on behalf of the vcpu, or it could be preempted, or the vcpu could be genuinely stopped. Since we have no way of dividing up the time between these cases, we don't count any of it as stolen. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/kvm_host.h | 5 ++ arch/powerpc/kvm/book3s_hv.c | 127 ++++++++++++++++++++++++---- 2 files changed, 117 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 1e8cbd1299fc..3093896015f0 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -559,12 +559,17 @@ struct kvm_vcpu_arch { unsigned long dtl_index; u64 stolen_logged; struct kvmppc_vpa slb_shadow; + + spinlock_t tbacct_lock; + u64 busy_stolen; + u64 busy_preempt; #endif }; /* Values for vcpu->arch.state */ #define KVMPPC_VCPU_NOTREADY 0 #define KVMPPC_VCPU_RUNNABLE 1 +#define KVMPPC_VCPU_BUSY_IN_HOST 2 /* Values for vcpu->arch.io_gpr */ #define KVM_MMIO_REG_MASK 0x001f diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 61d293465e81..8b3c470e6cb9 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -60,23 +60,74 @@ /* Used to indicate that a guest page fault needs to be handled */ #define RESUME_PAGE_FAULT (RESUME_GUEST | RESUME_FLAG_ARCH1) +/* Used as a "null" value for timebase values */ +#define TB_NIL (~(u64)0) + static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); +/* + * We use the vcpu_load/put functions to measure stolen time. + * Stolen time is counted as time when either the vcpu is able to + * run as part of a virtual core, but the task running the vcore + * is preempted or sleeping, or when the vcpu needs something done + * in the kernel by the task running the vcpu, but that task is + * preempted or sleeping. Those two things have to be counted + * separately, since one of the vcpu tasks will take on the job + * of running the core, and the other vcpu tasks in the vcore will + * sleep waiting for it to do that, but that sleep shouldn't count + * as stolen time. + * + * Hence we accumulate stolen time when the vcpu can run as part of + * a vcore using vc->stolen_tb, and the stolen time when the vcpu + * needs its task to do other things in the kernel (for example, + * service a page fault) in busy_stolen. We don't accumulate + * stolen time for a vcore when it is inactive, or for a vcpu + * when it is in state RUNNING or NOTREADY. NOTREADY is a bit of + * a misnomer; it means that the vcpu task is not executing in + * the KVM_VCPU_RUN ioctl, i.e. it is in userspace or elsewhere in + * the kernel. 
We don't have any way of dividing up that time + * between time that the vcpu is genuinely stopped, time that + * the task is actively working on behalf of the vcpu, and time + * that the task is preempted, so we don't count any of it as + * stolen. + * + * Updates to busy_stolen are protected by arch.tbacct_lock; + * updates to vc->stolen_tb are protected by the arch.tbacct_lock + * of the vcpu that has taken responsibility for running the vcore + * (i.e. vc->runner). The stolen times are measured in units of + * timebase ticks. (Note that the != TB_NIL checks below are + * purely defensive; they should never fail.) + */ + void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct kvmppc_vcore *vc = vcpu->arch.vcore; - if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE) + spin_lock(&vcpu->arch.tbacct_lock); + if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE && + vc->preempt_tb != TB_NIL) { vc->stolen_tb += mftb() - vc->preempt_tb; + vc->preempt_tb = TB_NIL; + } + if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST && + vcpu->arch.busy_preempt != TB_NIL) { + vcpu->arch.busy_stolen += mftb() - vcpu->arch.busy_preempt; + vcpu->arch.busy_preempt = TB_NIL; + } + spin_unlock(&vcpu->arch.tbacct_lock); } void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) { struct kvmppc_vcore *vc = vcpu->arch.vcore; + spin_lock(&vcpu->arch.tbacct_lock); if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE) vc->preempt_tb = mftb(); + if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST) + vcpu->arch.busy_preempt = mftb(); + spin_unlock(&vcpu->arch.tbacct_lock); } void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) @@ -357,24 +408,61 @@ static void kvmppc_update_vpas(struct kvm_vcpu *vcpu) spin_unlock(&vcpu->arch.vpa_update_lock); } +/* + * Return the accumulated stolen time for the vcore up until `now'. + * The caller should hold the vcore lock. + */ +static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 now) +{ + u64 p; + + /* + * If we are the task running the vcore, then since we hold + * the vcore lock, we can't be preempted, so stolen_tb/preempt_tb + * can't be updated, so we don't need the tbacct_lock. + * If the vcore is inactive, it can't become active (since we + * hold the vcore lock), so the vcpu load/put functions won't + * update stolen_tb/preempt_tb, and we don't need tbacct_lock. 
+ */ + if (vc->vcore_state != VCORE_INACTIVE && + vc->runner->arch.run_task != current) { + spin_lock(&vc->runner->arch.tbacct_lock); + p = vc->stolen_tb; + if (vc->preempt_tb != TB_NIL) + p += now - vc->preempt_tb; + spin_unlock(&vc->runner->arch.tbacct_lock); + } else { + p = vc->stolen_tb; + } + return p; +} + static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc) { struct dtl_entry *dt; struct lppaca *vpa; - unsigned long old_stolen; + unsigned long stolen; + unsigned long core_stolen; + u64 now; dt = vcpu->arch.dtl_ptr; vpa = vcpu->arch.vpa.pinned_addr; - old_stolen = vcpu->arch.stolen_logged; - vcpu->arch.stolen_logged = vc->stolen_tb; + now = mftb(); + core_stolen = vcore_stolen_time(vc, now); + stolen = core_stolen - vcpu->arch.stolen_logged; + vcpu->arch.stolen_logged = core_stolen; + spin_lock(&vcpu->arch.tbacct_lock); + stolen += vcpu->arch.busy_stolen; + vcpu->arch.busy_stolen = 0; + spin_unlock(&vcpu->arch.tbacct_lock); if (!dt || !vpa) return; memset(dt, 0, sizeof(struct dtl_entry)); dt->dispatch_reason = 7; dt->processor_id = vc->pcpu + vcpu->arch.ptid; - dt->timebase = mftb(); - dt->enqueue_to_dispatch_time = vc->stolen_tb - old_stolen; + dt->timebase = now; + dt->enqueue_to_dispatch_time = stolen; dt->srr0 = kvmppc_get_pc(vcpu); dt->srr1 = vcpu->arch.shregs.msr; ++dt; @@ -773,6 +861,8 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) vcpu->arch.pvr = mfspr(SPRN_PVR); kvmppc_set_pvr(vcpu, vcpu->arch.pvr); spin_lock_init(&vcpu->arch.vpa_update_lock); + spin_lock_init(&vcpu->arch.tbacct_lock); + vcpu->arch.busy_preempt = TB_NIL; kvmppc_mmu_book3s_hv_init(vcpu); @@ -788,7 +878,7 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) INIT_LIST_HEAD(&vcore->runnable_threads); spin_lock_init(&vcore->lock); init_waitqueue_head(&vcore->wq); - vcore->preempt_tb = mftb(); + vcore->preempt_tb = TB_NIL; } kvm->arch.vcores[core] = vcore; } @@ -801,7 +891,6 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) ++vcore->num_threads; spin_unlock(&vcore->lock); vcpu->arch.vcore = vcore; - vcpu->arch.stolen_logged = vcore->stolen_tb; vcpu->arch.cpu_type = KVM_CPU_3S_64; kvmppc_sanity_check(vcpu); @@ -861,9 +950,17 @@ extern void xics_wake_cpu(int cpu); static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, struct kvm_vcpu *vcpu) { + u64 now; + if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE) return; - vcpu->arch.state = KVMPPC_VCPU_NOTREADY; + spin_lock(&vcpu->arch.tbacct_lock); + now = mftb(); + vcpu->arch.busy_stolen += vcore_stolen_time(vc, now) - + vcpu->arch.stolen_logged; + vcpu->arch.busy_preempt = now; + vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; + spin_unlock(&vcpu->arch.tbacct_lock); --vc->n_runnable; list_del(&vcpu->arch.run_list); } @@ -1038,10 +1135,8 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc) vcpu->arch.ptid = ptid++; } } - if (!vcpu0) { - vc->vcore_state = VCORE_INACTIVE; - return; /* nothing to run; should never happen */ - } + if (!vcpu0) + goto out; /* nothing to run; should never happen */ list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) if (vcpu->arch.ceded) vcpu->arch.ptid = ptid++; @@ -1056,7 +1151,6 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc) goto out; } - vc->stolen_tb += mftb() - vc->preempt_tb; vc->pcpu = smp_processor_id(); list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { kvmppc_start_thread(vcpu); @@ -1121,7 +1215,6 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc) out: vc->vcore_state = 
VCORE_INACTIVE; - vc->preempt_tb = mftb(); list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, arch.run_list) { if (vcpu->arch.ret != RESUME_GUEST) { @@ -1181,7 +1274,9 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) vcpu->arch.ceded = 0; vcpu->arch.run_task = current; vcpu->arch.kvm_run = kvm_run; + vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb()); vcpu->arch.state = KVMPPC_VCPU_RUNNABLE; + vcpu->arch.busy_preempt = TB_NIL; list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads); ++vc->n_runnable; @@ -1295,6 +1390,7 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) flush_vsx_to_thread(current); vcpu->arch.wqp = &vcpu->arch.vcore->wq; vcpu->arch.pgdir = current->mm->pgd; + vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; do { r = kvmppc_run_vcpu(run, vcpu); @@ -1312,6 +1408,7 @@ int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) } while (r == RESUME_GUEST); out: + vcpu->arch.state = KVMPPC_VCPU_NOTREADY; atomic_dec(&vcpu->kvm->arch.vcpus_running); return r; } From 9f8c8c7812976fbfaf4bb30aaf8f6b001864f20a Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 15 Oct 2012 01:18:37 +0000 Subject: [PATCH 11/13] KVM: PPC: Book3S HV: Allow DTL to be set to address 0, length 0 Commit 55b665b026 ("KVM: PPC: Book3S HV: Provide a way for userspace to get/set per-vCPU areas") includes a check on the length of the dispatch trace log (DTL) to make sure the buffer is at least one entry long. This is appropriate when registering a buffer, but the interface also allows for any existing buffer to be unregistered by specifying a zero address. In this case the length check is not appropriate. This makes the check conditional on the address being non-zero. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/kvm/book3s_hv.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 8b3c470e6cb9..812764c96229 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -811,9 +811,8 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) addr = val->vpaval.addr; len = val->vpaval.length; r = -EINVAL; - if (len < sizeof(struct dtl_entry)) - break; - if (addr && !vcpu->arch.vpa.next_gpa) + if (addr && (len < sizeof(struct dtl_entry) || + !vcpu->arch.vpa.next_gpa)) break; len -= len % sizeof(struct dtl_entry); r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len); From 8b5869ad85f703ffeb25e656eab826f6b85b984c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 15 Oct 2012 01:20:50 +0000 Subject: [PATCH 12/13] KVM: PPC: Book3S HV: Fix thinko in try_lock_hpte() This fixes an error in the inline asm in try_lock_hpte() where we were erroneously using a register number as an immediate operand. The bug only affects an error path, and in fact the code will still work as long as the compiler chooses some register other than r0 for the "bits" variable. Nevertheless it should still be fixed. Signed-off-by: Paul Mackerras Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/kvm_book3s_64.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 0dd1d86d3e31..1472a5b4e4e3 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -60,7 +60,7 @@ static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits) " ori %0,%0,%4\n" " stdcx. 
%0,0,%2\n"
 " beq+ 2f\n"
- " li %1,%3\n"
+ " mr %1,%3\n"
 "2: isync"
 : "=&r" (tmp), "=&r" (old)
 : "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK)

From 63a1909190a3baa0abb9463ef28c8d8c969be951 Mon Sep 17 00:00:00 2001
From: Alexander Graf
Date: Wed, 31 Oct 2012 13:37:59 +0100
Subject: [PATCH 13/13] PPC: ePAPR: Convert hcall header to uapi (round 2)

The new uapi framework splits the kernel-internal and userspace-exported
bits of header files more cleanly. Adjust the ePAPR header accordingly.

Signed-off-by: Alexander Graf
---
 arch/powerpc/include/asm/epapr_hcalls.h      | 458 ++++++++++++++++++
 arch/powerpc/include/uapi/asm/epapr_hcalls.h | 464 +++----------------
 2 files changed, 510 insertions(+), 412 deletions(-)
 create mode 100644 arch/powerpc/include/asm/epapr_hcalls.h

diff --git a/arch/powerpc/include/asm/epapr_hcalls.h b/arch/powerpc/include/asm/epapr_hcalls.h
new file mode 100644
index 000000000000..d3d634274d2c
--- /dev/null
+++ b/arch/powerpc/include/asm/epapr_hcalls.h
@@ -0,0 +1,458 @@
+/*
+ * ePAPR hcall interface
+ *
+ * Copyright 2008-2011 Freescale Semiconductor, Inc.
+ *
+ * Author: Timur Tabi
+ *
+ * This file is provided under a dual BSD/GPL license. When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Freescale Semiconductor nor the
+ * names of its contributors may be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ *
+ * ALTERNATIVELY, this software may be distributed under the terms of the
+ * GNU General Public License ("GPL") as published by the Free Software
+ * Foundation, either version 2 of that License or (at your option) any
+ * later version.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* A "hypercall" is an "sc 1" instruction. This header file provides C
+ * wrapper functions for the ePAPR hypervisor interface. It is intended
+ * for use by Linux device drivers and other operating systems.
+ *
+ * The hypercalls are implemented as inline assembly, rather than assembly
+ * language functions in a .S file, for optimization. This allows
+ * the caller to issue the hypercall instruction directly, improving both
+ * performance and memory footprint.
+ */
+
+#ifndef _EPAPR_HCALLS_H
+#define _EPAPR_HCALLS_H
+
+#include <uapi/asm/epapr_hcalls.h>
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <asm/byteorder.h>
+
+/*
+ * Hypercall register clobber list
+ *
+ * These macros are used to define the list of clobbered registers during a
+ * hypercall. Technically, registers r0 and r3-r12 are always clobbered,
+ * but the gcc inline assembly syntax does not allow us to specify registers
+ * on the clobber list that are also on the input/output list. Therefore,
+ * the lists of clobbered registers depend on the number of register
+ * parameters ("+r" and "=r") passed to the hypercall.
+ *
+ * Each assembly block should use one of the EV_HCALL_CLOBBERSx macros. As a
+ * general rule, 'x' is the number of parameters passed to the assembly
+ * block *except* for r11.
+ *
+ * If you're not sure, just use the smallest value of 'x' that does not
+ * generate a compilation error. Because these are static inline functions,
+ * the compiler will only check the clobber list for a function if you
+ * compile code that calls that function.
+ *
+ * r3 and r11 are not included in any clobbers list because they are always
+ * listed as output registers.
+ *
+ * XER, CTR, and LR are currently listed as clobbers because it's uncertain
+ * whether they will be clobbered.
+ *
+ * Note that r11 can be used as an output parameter.
+ *
+ * The "memory" clobber is only necessary for hcalls where the Hypervisor
+ * will read or write guest memory. However, we add it to all hcalls because
+ * the impact is minimal, and we want to ensure that it's present for the
+ * hcalls that need it.
+*/
+
+/* List of common clobbered registers. Do not use this macro. */
+#define EV_HCALL_CLOBBERS "r0", "r12", "xer", "ctr", "lr", "cc", "memory"
+
+#define EV_HCALL_CLOBBERS8 EV_HCALL_CLOBBERS
+#define EV_HCALL_CLOBBERS7 EV_HCALL_CLOBBERS8, "r10"
+#define EV_HCALL_CLOBBERS6 EV_HCALL_CLOBBERS7, "r9"
+#define EV_HCALL_CLOBBERS5 EV_HCALL_CLOBBERS6, "r8"
+#define EV_HCALL_CLOBBERS4 EV_HCALL_CLOBBERS5, "r7"
+#define EV_HCALL_CLOBBERS3 EV_HCALL_CLOBBERS4, "r6"
+#define EV_HCALL_CLOBBERS2 EV_HCALL_CLOBBERS3, "r5"
+#define EV_HCALL_CLOBBERS1 EV_HCALL_CLOBBERS2, "r4"
+
+extern bool epapr_paravirt_enabled;
+extern u32 epapr_hypercall_start[];
+
+/*
+ * We use "uintptr_t" to define a register because it's guaranteed to be a
+ * 32-bit integer on a 32-bit platform, and a 64-bit integer on a 64-bit
+ * platform.
+ *
+ * All registers are either input/output or output only. Registers that are
+ * initialized before making the hypercall are input/output. All
+ * input/output registers are represented with "+r". Output-only registers
+ * are represented with "=r". Do not specify any unused registers. The
+ * clobber list will tell the compiler that the hypercall modifies those
+ * registers, which is good enough.
+ */
+
+/**
+ * ev_int_set_config - configure the specified interrupt
+ * @interrupt: the interrupt number
+ * @config: configuration for this interrupt
+ * @priority: interrupt priority
+ * @destination: destination CPU number
+ *
+ * Returns 0 for success, or an error code.
+ */
+static inline unsigned int ev_int_set_config(unsigned int interrupt,
+ uint32_t config, unsigned int priority, uint32_t destination)
+{
+ register uintptr_t r11 __asm__("r11");
+ register uintptr_t r3 __asm__("r3");
+ register uintptr_t r4 __asm__("r4");
+ register uintptr_t r5 __asm__("r5");
+ register uintptr_t r6 __asm__("r6");
+
+ r11 = EV_HCALL_TOKEN(EV_INT_SET_CONFIG);
+ r3 = interrupt;
+ r4 = config;
+ r5 = priority;
+ r6 = destination;
+
+ asm volatile("bl epapr_hypercall_start"
+ : "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6)
+ : : EV_HCALL_CLOBBERS4
+ );
+
+ return r3;
+}
+
+/**
+ * ev_int_get_config - return the config of the specified interrupt
+ * @interrupt: the interrupt number
+ * @config: returned configuration for this interrupt
+ * @priority: returned interrupt priority
+ * @destination: returned destination CPU number
+ *
+ * Returns 0 for success, or an error code.
+ */
+static inline unsigned int ev_int_get_config(unsigned int interrupt,
+ uint32_t *config, unsigned int *priority, uint32_t *destination)
+{
+ register uintptr_t r11 __asm__("r11");
+ register uintptr_t r3 __asm__("r3");
+ register uintptr_t r4 __asm__("r4");
+ register uintptr_t r5 __asm__("r5");
+ register uintptr_t r6 __asm__("r6");
+
+ r11 = EV_HCALL_TOKEN(EV_INT_GET_CONFIG);
+ r3 = interrupt;
+
+ asm volatile("bl epapr_hypercall_start"
+ : "+r" (r11), "+r" (r3), "=r" (r4), "=r" (r5), "=r" (r6)
+ : : EV_HCALL_CLOBBERS4
+ );
+
+ *config = r4;
+ *priority = r5;
+ *destination = r6;
+
+ return r3;
+}
+
+/**
+ * ev_int_set_mask - sets the mask for the specified interrupt source
+ * @interrupt: the interrupt number
+ * @mask: 0=enable interrupts, 1=disable interrupts
+ *
+ * Returns 0 for success, or an error code.
+ */
+static inline unsigned int ev_int_set_mask(unsigned int interrupt,
+ unsigned int mask)
+{
+ register uintptr_t r11 __asm__("r11");
+ register uintptr_t r3 __asm__("r3");
+ register uintptr_t r4 __asm__("r4");
+
+ r11 = EV_HCALL_TOKEN(EV_INT_SET_MASK);
+ r3 = interrupt;
+ r4 = mask;
+
+ asm volatile("bl epapr_hypercall_start"
+ : "+r" (r11), "+r" (r3), "+r" (r4)
+ : : EV_HCALL_CLOBBERS2
+ );
+
+ return r3;
+}
+
+/**
+ * ev_int_get_mask - returns the mask for the specified interrupt source
+ * @interrupt: the interrupt number
+ * @mask: returned mask for this interrupt (0=enabled, 1=disabled)
+ *
+ * Returns 0 for success, or an error code.
+ */
+static inline unsigned int ev_int_get_mask(unsigned int interrupt,
+ unsigned int *mask)
+{
+ register uintptr_t r11 __asm__("r11");
+ register uintptr_t r3 __asm__("r3");
+ register uintptr_t r4 __asm__("r4");
+
+ r11 = EV_HCALL_TOKEN(EV_INT_GET_MASK);
+ r3 = interrupt;
+
+ asm volatile("bl epapr_hypercall_start"
+ : "+r" (r11), "+r" (r3), "=r" (r4)
+ : : EV_HCALL_CLOBBERS2
+ );
+
+ *mask = r4;
+
+ return r3;
+}
+
+/**
+ * ev_int_eoi - signal the end of interrupt processing
+ * @interrupt: the interrupt number
+ *
+ * This function signals the end of processing for the specified
+ * interrupt, which must be the interrupt currently in service. By
+ * definition, this is also the highest-priority interrupt.
+ *
+ * Returns 0 for success, or an error code.
+ */ +static inline unsigned int ev_int_eoi(unsigned int interrupt) +{ + register uintptr_t r11 __asm__("r11"); + register uintptr_t r3 __asm__("r3"); + + r11 = EV_HCALL_TOKEN(EV_INT_EOI); + r3 = interrupt; + + asm volatile("bl epapr_hypercall_start" + : "+r" (r11), "+r" (r3) + : : EV_HCALL_CLOBBERS1 + ); + + return r3; +} + +/** + * ev_byte_channel_send - send characters to a byte stream + * @handle: byte stream handle + * @count: (input) num of chars to send, (output) num chars sent + * @buffer: pointer to a 16-byte buffer + * + * @buffer must be at least 16 bytes long, because all 16 bytes will be + * read from memory into registers, even if count < 16. + * + * Returns 0 for success, or an error code. + */ +static inline unsigned int ev_byte_channel_send(unsigned int handle, + unsigned int *count, const char buffer[EV_BYTE_CHANNEL_MAX_BYTES]) +{ + register uintptr_t r11 __asm__("r11"); + register uintptr_t r3 __asm__("r3"); + register uintptr_t r4 __asm__("r4"); + register uintptr_t r5 __asm__("r5"); + register uintptr_t r6 __asm__("r6"); + register uintptr_t r7 __asm__("r7"); + register uintptr_t r8 __asm__("r8"); + const uint32_t *p = (const uint32_t *) buffer; + + r11 = EV_HCALL_TOKEN(EV_BYTE_CHANNEL_SEND); + r3 = handle; + r4 = *count; + r5 = be32_to_cpu(p[0]); + r6 = be32_to_cpu(p[1]); + r7 = be32_to_cpu(p[2]); + r8 = be32_to_cpu(p[3]); + + asm volatile("bl epapr_hypercall_start" + : "+r" (r11), "+r" (r3), + "+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7), "+r" (r8) + : : EV_HCALL_CLOBBERS6 + ); + + *count = r4; + + return r3; +} + +/** + * ev_byte_channel_receive - fetch characters from a byte channel + * @handle: byte channel handle + * @count: (input) max num of chars to receive, (output) num chars received + * @buffer: pointer to a 16-byte buffer + * + * The size of @buffer must be at least 16 bytes, even if you request fewer + * than 16 characters, because we always write 16 bytes to @buffer. This is + * for performance reasons. + * + * Returns 0 for success, or an error code. + */ +static inline unsigned int ev_byte_channel_receive(unsigned int handle, + unsigned int *count, char buffer[EV_BYTE_CHANNEL_MAX_BYTES]) +{ + register uintptr_t r11 __asm__("r11"); + register uintptr_t r3 __asm__("r3"); + register uintptr_t r4 __asm__("r4"); + register uintptr_t r5 __asm__("r5"); + register uintptr_t r6 __asm__("r6"); + register uintptr_t r7 __asm__("r7"); + register uintptr_t r8 __asm__("r8"); + uint32_t *p = (uint32_t *) buffer; + + r11 = EV_HCALL_TOKEN(EV_BYTE_CHANNEL_RECEIVE); + r3 = handle; + r4 = *count; + + asm volatile("bl epapr_hypercall_start" + : "+r" (r11), "+r" (r3), "+r" (r4), + "=r" (r5), "=r" (r6), "=r" (r7), "=r" (r8) + : : EV_HCALL_CLOBBERS6 + ); + + *count = r4; + p[0] = cpu_to_be32(r5); + p[1] = cpu_to_be32(r6); + p[2] = cpu_to_be32(r7); + p[3] = cpu_to_be32(r8); + + return r3; +} + +/** + * ev_byte_channel_poll - returns the status of the byte channel buffers + * @handle: byte channel handle + * @rx_count: returned count of bytes in receive queue + * @tx_count: returned count of free space in transmit queue + * + * This function reports the amount of data in the receive queue (i.e. the + * number of bytes you can read), and the amount of free space in the transmit + * queue (i.e. the number of bytes you can write). + * + * Returns 0 for success, or an error code. 
+ */ +static inline unsigned int ev_byte_channel_poll(unsigned int handle, + unsigned int *rx_count, unsigned int *tx_count) +{ + register uintptr_t r11 __asm__("r11"); + register uintptr_t r3 __asm__("r3"); + register uintptr_t r4 __asm__("r4"); + register uintptr_t r5 __asm__("r5"); + + r11 = EV_HCALL_TOKEN(EV_BYTE_CHANNEL_POLL); + r3 = handle; + + asm volatile("bl epapr_hypercall_start" + : "+r" (r11), "+r" (r3), "=r" (r4), "=r" (r5) + : : EV_HCALL_CLOBBERS3 + ); + + *rx_count = r4; + *tx_count = r5; + + return r3; +} + +/** + * ev_int_iack - acknowledge an interrupt + * @handle: handle to the target interrupt controller + * @vector: returned interrupt vector + * + * If handle is zero, the function returns the next interrupt source + * number to be handled irrespective of the hierarchy or cascading + * of interrupt controllers. If non-zero, specifies a handle to the + * interrupt controller that is the target of the acknowledge. + * + * Returns 0 for success, or an error code. + */ +static inline unsigned int ev_int_iack(unsigned int handle, + unsigned int *vector) +{ + register uintptr_t r11 __asm__("r11"); + register uintptr_t r3 __asm__("r3"); + register uintptr_t r4 __asm__("r4"); + + r11 = EV_HCALL_TOKEN(EV_INT_IACK); + r3 = handle; + + asm volatile("bl epapr_hypercall_start" + : "+r" (r11), "+r" (r3), "=r" (r4) + : : EV_HCALL_CLOBBERS2 + ); + + *vector = r4; + + return r3; +} + +/** + * ev_doorbell_send - send a doorbell to another partition + * @handle: doorbell send handle + * + * Returns 0 for success, or an error code. + */ +static inline unsigned int ev_doorbell_send(unsigned int handle) +{ + register uintptr_t r11 __asm__("r11"); + register uintptr_t r3 __asm__("r3"); + + r11 = EV_HCALL_TOKEN(EV_DOORBELL_SEND); + r3 = handle; + + asm volatile("bl epapr_hypercall_start" + : "+r" (r11), "+r" (r3) + : : EV_HCALL_CLOBBERS1 + ); + + return r3; +} + +/** + * ev_idle -- wait for next interrupt on this core + * + * Returns 0 for success, or an error code. + */ +static inline unsigned int ev_idle(void) +{ + register uintptr_t r11 __asm__("r11"); + register uintptr_t r3 __asm__("r3"); + + r11 = EV_HCALL_TOKEN(EV_IDLE); + + asm volatile("bl epapr_hypercall_start" + : "+r" (r11), "=r" (r3) + : : EV_HCALL_CLOBBERS1 + ); + + return r3; +} +#endif /* !__ASSEMBLY__ */ +#endif /* _EPAPR_HCALLS_H */ diff --git a/arch/powerpc/include/uapi/asm/epapr_hcalls.h b/arch/powerpc/include/uapi/asm/epapr_hcalls.h index 58997afcd085..7f9c74b46704 100644 --- a/arch/powerpc/include/uapi/asm/epapr_hcalls.h +++ b/arch/powerpc/include/uapi/asm/epapr_hcalls.h @@ -37,422 +37,62 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -/* A "hypercall" is an "sc 1" instruction. This header file file provides C - * wrapper functions for the ePAPR hypervisor interface. It is inteded - * for use by Linux device drivers and other operating systems. - * - * The hypercalls are implemented as inline assembly, rather than assembly - * language functions in a .S file, for optimization. It allows - * the caller to issue the hypercall instruction directly, improving both - * performance and memory footprint. 
- */ +#ifndef _UAPI_ASM_POWERPC_EPAPR_HCALLS_H +#define _UAPI_ASM_POWERPC_EPAPR_HCALLS_H -#ifndef _EPAPR_HCALLS_H -#define _EPAPR_HCALLS_H +#define EV_BYTE_CHANNEL_SEND 1 +#define EV_BYTE_CHANNEL_RECEIVE 2 +#define EV_BYTE_CHANNEL_POLL 3 +#define EV_INT_SET_CONFIG 4 +#define EV_INT_GET_CONFIG 5 +#define EV_INT_SET_MASK 6 +#define EV_INT_GET_MASK 7 +#define EV_INT_IACK 9 +#define EV_INT_EOI 10 +#define EV_INT_SEND_IPI 11 +#define EV_INT_SET_TASK_PRIORITY 12 +#define EV_INT_GET_TASK_PRIORITY 13 +#define EV_DOORBELL_SEND 14 +#define EV_MSGSND 15 +#define EV_IDLE 16 -#include +/* vendor ID: epapr */ +#define EV_LOCAL_VENDOR_ID 0 /* for private use */ +#define EV_EPAPR_VENDOR_ID 1 +#define EV_FSL_VENDOR_ID 2 /* Freescale Semiconductor */ +#define EV_IBM_VENDOR_ID 3 /* IBM */ +#define EV_GHS_VENDOR_ID 4 /* Green Hills Software */ +#define EV_ENEA_VENDOR_ID 5 /* Enea */ +#define EV_WR_VENDOR_ID 6 /* Wind River Systems */ +#define EV_AMCC_VENDOR_ID 7 /* Applied Micro Circuits */ +#define EV_KVM_VENDOR_ID 42 /* KVM */ -#ifndef __ASSEMBLY__ -#include -#include -#include +/* The max number of bytes that a byte channel can send or receive per call */ +#define EV_BYTE_CHANNEL_MAX_BYTES 16 -/* - * Hypercall register clobber list - * - * These macros are used to define the list of clobbered registers during a - * hypercall. Technically, registers r0 and r3-r12 are always clobbered, - * but the gcc inline assembly syntax does not allow us to specify registers - * on the clobber list that are also on the input/output list. Therefore, - * the lists of clobbered registers depends on the number of register - * parmeters ("+r" and "=r") passed to the hypercall. - * - * Each assembly block should use one of the HCALL_CLOBBERSx macros. As a - * general rule, 'x' is the number of parameters passed to the assembly - * block *except* for r11. - * - * If you're not sure, just use the smallest value of 'x' that does not - * generate a compilation error. Because these are static inline functions, - * the compiler will only check the clobber list for a function if you - * compile code that calls that function. - * - * r3 and r11 are not included in any clobbers list because they are always - * listed as output registers. - * - * XER, CTR, and LR are currently listed as clobbers because it's uncertain - * whether they will be clobbered. - * - * Note that r11 can be used as an output parameter. - * - * The "memory" clobber is only necessary for hcalls where the Hypervisor - * will read or write guest memory. However, we add it to all hcalls because - * the impact is minimal, and we want to ensure that it's present for the - * hcalls that need it. -*/ -/* List of common clobbered registers. Do not use this macro. 
 */
-#define EV_HCALL_CLOBBERS "r0", "r12", "xer", "ctr", "lr", "cc", "memory"
+#define _EV_HCALL_TOKEN(id, num) (((id) << 16) | (num))
+#define EV_HCALL_TOKEN(hcall_num) _EV_HCALL_TOKEN(EV_EPAPR_VENDOR_ID, hcall_num)
-#define EV_HCALL_CLOBBERS8 EV_HCALL_CLOBBERS
-#define EV_HCALL_CLOBBERS7 EV_HCALL_CLOBBERS8, "r10"
-#define EV_HCALL_CLOBBERS6 EV_HCALL_CLOBBERS7, "r9"
-#define EV_HCALL_CLOBBERS5 EV_HCALL_CLOBBERS6, "r8"
-#define EV_HCALL_CLOBBERS4 EV_HCALL_CLOBBERS5, "r7"
-#define EV_HCALL_CLOBBERS3 EV_HCALL_CLOBBERS4, "r6"
-#define EV_HCALL_CLOBBERS2 EV_HCALL_CLOBBERS3, "r5"
-#define EV_HCALL_CLOBBERS1 EV_HCALL_CLOBBERS2, "r4"
+/* epapr return codes */
+#define EV_SUCCESS 0
+#define EV_EPERM 1 /* Operation not permitted */
+#define EV_ENOENT 2 /* Entry Not Found */
+#define EV_EIO 3 /* I/O error occurred */
+#define EV_EAGAIN 4 /* The operation had insufficient
+ * resources to complete and should be
+ * retried
+ */
+#define EV_ENOMEM 5 /* There was insufficient memory to
+ * complete the operation */
+#define EV_EFAULT 6 /* Bad guest address */
+#define EV_ENODEV 7 /* No such device */
+#define EV_EINVAL 8 /* An argument supplied to the hcall
+ was out of range or invalid */
+#define EV_INTERNAL 9 /* An internal error occurred */
+#define EV_CONFIG 10 /* A configuration error was detected */
+#define EV_INVALID_STATE 11 /* The object is in an invalid state */
+#define EV_UNIMPLEMENTED 12 /* Unimplemented hypercall */
+#define EV_BUFFER_OVERFLOW 13 /* Caller-supplied buffer too small */
-extern bool epapr_paravirt_enabled;
-extern u32 epapr_hypercall_start[];
-
-/*
- * We use "uintptr_t" to define a register because it's guaranteed to be a
- * 32-bit integer on a 32-bit platform, and a 64-bit integer on a 64-bit
- * platform.
- *
- * All registers are either input/output or output only. Registers that are
- * initialized before making the hypercall are input/output. All
- * input/output registers are represented with "+r". Output-only registers
- * are represented with "=r". Do not specify any unused registers. The
- * clobber list will tell the compiler that the hypercall modifies those
- * registers, which is good enough.
- */
-
-/**
- * ev_int_set_config - configure the specified interrupt
- * @interrupt: the interrupt number
- * @config: configuration for this interrupt
- * @priority: interrupt priority
- * @destination: destination CPU number
- *
- * Returns 0 for success, or an error code.
- */
-static inline unsigned int ev_int_set_config(unsigned int interrupt,
- uint32_t config, unsigned int priority, uint32_t destination)
-{
- register uintptr_t r11 __asm__("r11");
- register uintptr_t r3 __asm__("r3");
- register uintptr_t r4 __asm__("r4");
- register uintptr_t r5 __asm__("r5");
- register uintptr_t r6 __asm__("r6");
-
- r11 = EV_HCALL_TOKEN(EV_INT_SET_CONFIG);
- r3 = interrupt;
- r4 = config;
- r5 = priority;
- r6 = destination;
-
- asm volatile("bl epapr_hypercall_start"
- : "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6)
- : : EV_HCALL_CLOBBERS4
- );
-
- return r3;
-}
-
-/**
- * ev_int_get_config - return the config of the specified interrupt
- * @interrupt: the interrupt number
- * @config: returned configuration for this interrupt
- * @priority: returned interrupt priority
- * @destination: returned destination CPU number
- *
- * Returns 0 for success, or an error code.
- */ -static inline unsigned int ev_int_get_config(unsigned int interrupt, - uint32_t *config, unsigned int *priority, uint32_t *destination) -{ - register uintptr_t r11 __asm__("r11"); - register uintptr_t r3 __asm__("r3"); - register uintptr_t r4 __asm__("r4"); - register uintptr_t r5 __asm__("r5"); - register uintptr_t r6 __asm__("r6"); - - r11 = EV_HCALL_TOKEN(EV_INT_GET_CONFIG); - r3 = interrupt; - - asm volatile("bl epapr_hypercall_start" - : "+r" (r11), "+r" (r3), "=r" (r4), "=r" (r5), "=r" (r6) - : : EV_HCALL_CLOBBERS4 - ); - - *config = r4; - *priority = r5; - *destination = r6; - - return r3; -} - -/** - * ev_int_set_mask - sets the mask for the specified interrupt source - * @interrupt: the interrupt number - * @mask: 0=enable interrupts, 1=disable interrupts - * - * Returns 0 for success, or an error code. - */ -static inline unsigned int ev_int_set_mask(unsigned int interrupt, - unsigned int mask) -{ - register uintptr_t r11 __asm__("r11"); - register uintptr_t r3 __asm__("r3"); - register uintptr_t r4 __asm__("r4"); - - r11 = EV_HCALL_TOKEN(EV_INT_SET_MASK); - r3 = interrupt; - r4 = mask; - - asm volatile("bl epapr_hypercall_start" - : "+r" (r11), "+r" (r3), "+r" (r4) - : : EV_HCALL_CLOBBERS2 - ); - - return r3; -} - -/** - * ev_int_get_mask - returns the mask for the specified interrupt source - * @interrupt: the interrupt number - * @mask: returned mask for this interrupt (0=enabled, 1=disabled) - * - * Returns 0 for success, or an error code. - */ -static inline unsigned int ev_int_get_mask(unsigned int interrupt, - unsigned int *mask) -{ - register uintptr_t r11 __asm__("r11"); - register uintptr_t r3 __asm__("r3"); - register uintptr_t r4 __asm__("r4"); - - r11 = EV_HCALL_TOKEN(EV_INT_GET_MASK); - r3 = interrupt; - - asm volatile("bl epapr_hypercall_start" - : "+r" (r11), "+r" (r3), "=r" (r4) - : : EV_HCALL_CLOBBERS2 - ); - - *mask = r4; - - return r3; -} - -/** - * ev_int_eoi - signal the end of interrupt processing - * @interrupt: the interrupt number - * - * This function signals the end of processing for the the specified - * interrupt, which must be the interrupt currently in service. By - * definition, this is also the highest-priority interrupt. - * - * Returns 0 for success, or an error code. - */ -static inline unsigned int ev_int_eoi(unsigned int interrupt) -{ - register uintptr_t r11 __asm__("r11"); - register uintptr_t r3 __asm__("r3"); - - r11 = EV_HCALL_TOKEN(EV_INT_EOI); - r3 = interrupt; - - asm volatile("bl epapr_hypercall_start" - : "+r" (r11), "+r" (r3) - : : EV_HCALL_CLOBBERS1 - ); - - return r3; -} - -/** - * ev_byte_channel_send - send characters to a byte stream - * @handle: byte stream handle - * @count: (input) num of chars to send, (output) num chars sent - * @buffer: pointer to a 16-byte buffer - * - * @buffer must be at least 16 bytes long, because all 16 bytes will be - * read from memory into registers, even if count < 16. - * - * Returns 0 for success, or an error code. 
- */ -static inline unsigned int ev_byte_channel_send(unsigned int handle, - unsigned int *count, const char buffer[EV_BYTE_CHANNEL_MAX_BYTES]) -{ - register uintptr_t r11 __asm__("r11"); - register uintptr_t r3 __asm__("r3"); - register uintptr_t r4 __asm__("r4"); - register uintptr_t r5 __asm__("r5"); - register uintptr_t r6 __asm__("r6"); - register uintptr_t r7 __asm__("r7"); - register uintptr_t r8 __asm__("r8"); - const uint32_t *p = (const uint32_t *) buffer; - - r11 = EV_HCALL_TOKEN(EV_BYTE_CHANNEL_SEND); - r3 = handle; - r4 = *count; - r5 = be32_to_cpu(p[0]); - r6 = be32_to_cpu(p[1]); - r7 = be32_to_cpu(p[2]); - r8 = be32_to_cpu(p[3]); - - asm volatile("bl epapr_hypercall_start" - : "+r" (r11), "+r" (r3), - "+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7), "+r" (r8) - : : EV_HCALL_CLOBBERS6 - ); - - *count = r4; - - return r3; -} - -/** - * ev_byte_channel_receive - fetch characters from a byte channel - * @handle: byte channel handle - * @count: (input) max num of chars to receive, (output) num chars received - * @buffer: pointer to a 16-byte buffer - * - * The size of @buffer must be at least 16 bytes, even if you request fewer - * than 16 characters, because we always write 16 bytes to @buffer. This is - * for performance reasons. - * - * Returns 0 for success, or an error code. - */ -static inline unsigned int ev_byte_channel_receive(unsigned int handle, - unsigned int *count, char buffer[EV_BYTE_CHANNEL_MAX_BYTES]) -{ - register uintptr_t r11 __asm__("r11"); - register uintptr_t r3 __asm__("r3"); - register uintptr_t r4 __asm__("r4"); - register uintptr_t r5 __asm__("r5"); - register uintptr_t r6 __asm__("r6"); - register uintptr_t r7 __asm__("r7"); - register uintptr_t r8 __asm__("r8"); - uint32_t *p = (uint32_t *) buffer; - - r11 = EV_HCALL_TOKEN(EV_BYTE_CHANNEL_RECEIVE); - r3 = handle; - r4 = *count; - - asm volatile("bl epapr_hypercall_start" - : "+r" (r11), "+r" (r3), "+r" (r4), - "=r" (r5), "=r" (r6), "=r" (r7), "=r" (r8) - : : EV_HCALL_CLOBBERS6 - ); - - *count = r4; - p[0] = cpu_to_be32(r5); - p[1] = cpu_to_be32(r6); - p[2] = cpu_to_be32(r7); - p[3] = cpu_to_be32(r8); - - return r3; -} - -/** - * ev_byte_channel_poll - returns the status of the byte channel buffers - * @handle: byte channel handle - * @rx_count: returned count of bytes in receive queue - * @tx_count: returned count of free space in transmit queue - * - * This function reports the amount of data in the receive queue (i.e. the - * number of bytes you can read), and the amount of free space in the transmit - * queue (i.e. the number of bytes you can write). - * - * Returns 0 for success, or an error code. - */ -static inline unsigned int ev_byte_channel_poll(unsigned int handle, - unsigned int *rx_count, unsigned int *tx_count) -{ - register uintptr_t r11 __asm__("r11"); - register uintptr_t r3 __asm__("r3"); - register uintptr_t r4 __asm__("r4"); - register uintptr_t r5 __asm__("r5"); - - r11 = EV_HCALL_TOKEN(EV_BYTE_CHANNEL_POLL); - r3 = handle; - - asm volatile("bl epapr_hypercall_start" - : "+r" (r11), "+r" (r3), "=r" (r4), "=r" (r5) - : : EV_HCALL_CLOBBERS3 - ); - - *rx_count = r4; - *tx_count = r5; - - return r3; -} - -/** - * ev_int_iack - acknowledge an interrupt - * @handle: handle to the target interrupt controller - * @vector: returned interrupt vector - * - * If handle is zero, the function returns the next interrupt source - * number to be handled irrespective of the hierarchy or cascading - * of interrupt controllers. 
If non-zero, specifies a handle to the - * interrupt controller that is the target of the acknowledge. - * - * Returns 0 for success, or an error code. - */ -static inline unsigned int ev_int_iack(unsigned int handle, - unsigned int *vector) -{ - register uintptr_t r11 __asm__("r11"); - register uintptr_t r3 __asm__("r3"); - register uintptr_t r4 __asm__("r4"); - - r11 = EV_HCALL_TOKEN(EV_INT_IACK); - r3 = handle; - - asm volatile("bl epapr_hypercall_start" - : "+r" (r11), "+r" (r3), "=r" (r4) - : : EV_HCALL_CLOBBERS2 - ); - - *vector = r4; - - return r3; -} - -/** - * ev_doorbell_send - send a doorbell to another partition - * @handle: doorbell send handle - * - * Returns 0 for success, or an error code. - */ -static inline unsigned int ev_doorbell_send(unsigned int handle) -{ - register uintptr_t r11 __asm__("r11"); - register uintptr_t r3 __asm__("r3"); - - r11 = EV_HCALL_TOKEN(EV_DOORBELL_SEND); - r3 = handle; - - asm volatile("bl epapr_hypercall_start" - : "+r" (r11), "+r" (r3) - : : EV_HCALL_CLOBBERS1 - ); - - return r3; -} - -/** - * ev_idle -- wait for next interrupt on this core - * - * Returns 0 for success, or an error code. - */ -static inline unsigned int ev_idle(void) -{ - register uintptr_t r11 __asm__("r11"); - register uintptr_t r3 __asm__("r3"); - - r11 = EV_HCALL_TOKEN(EV_IDLE); - - asm volatile("bl epapr_hypercall_start" - : "+r" (r11), "=r" (r3) - : : EV_HCALL_CLOBBERS1 - ); - - return r3; -} -#endif /* !__ASSEMBLY__ */ -#endif +#endif /* _UAPI_ASM_POWERPC_EPAPR_HCALLS_H */
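
---

The wrappers in the header above all follow one pattern: GCC explicit register variables pin the hcall token to r11 and the arguments and return values to r3-r10, so the "bl epapr_hypercall_start" sequence can be emitted inline instead of calling through a .S stub. A minimal sketch of that pattern follows; EV_EXAMPLE is a hypothetical hcall number, everything else matches the conventions of the header itself:

	static inline unsigned int ev_example(unsigned int arg, unsigned int *result)
	{
		register uintptr_t r11 __asm__("r11");
		register uintptr_t r3 __asm__("r3");
		register uintptr_t r4 __asm__("r4");

		r11 = EV_HCALL_TOKEN(EV_EXAMPLE);	/* hypothetical hcall number */
		r3 = arg;				/* initialized before the call, so "+r" below */

		asm volatile("bl epapr_hypercall_start"
			: "+r" (r11), "+r" (r3), "=r" (r4)
			: : EV_HCALL_CLOBBERS2
		);

		*result = r4;	/* output-only register, hence "=r" above */
		return r3;	/* EV_SUCCESS or one of the EV_* error codes */
	}

EV_HCALL_CLOBBERS2 is the right choice here: two register parameters besides r11 (r3 and r4), which leaves r5-r10 plus the common clobbers on the clobber list, exactly as the comment in the header prescribes.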
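The token macros are plain bit packing: _EV_HCALL_TOKEN() puts the vendor ID in the upper 16 bits and the hcall number in the lower 16, which keeps each vendor's hcall space disjoint. A worked expansion using the defines above (the second line only illustrates how a non-epapr vendor ID would be packed; hcall number 1 there is made up):

	/* EV_EPAPR_VENDOR_ID = 1, EV_IDLE = 16 */
	EV_HCALL_TOKEN(EV_IDLE)			/* (1 << 16) | 16 == 0x00010010 */

	/* EV_FSL_VENDOR_ID = 2, hypothetical hcall number 1 */
	_EV_HCALL_TOKEN(EV_FSL_VENDOR_ID, 1)	/* (2 << 16) | 1 == 0x00020001 */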