From 2786b014ec893c301ea52ef9962e7cc60f89f9b3 Mon Sep 17 00:00:00 2001
From: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Date: Mon, 22 Sep 2008 16:08:06 +0200
Subject: [PATCH 001/140] KVM: x86 emulator: consolidate push reg

This patch consolidate the emulation of push reg instruction.

Signed-off-by: Guillaume Thouvenin <guillaume.thouvenin@bull.net>
Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86_emulate.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index ea051173b0da..a391e213fe61 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -1415,13 +1415,7 @@ special_insn:
 		emulate_1op("dec", c->dst, ctxt->eflags);
 		break;
 	case 0x50 ... 0x57:  /* push reg */
-		c->dst.type  = OP_MEM;
-		c->dst.bytes = c->op_bytes;
-		c->dst.val = c->src.val;
-		register_address_increment(c, &c->regs[VCPU_REGS_RSP],
-					   -c->op_bytes);
-		c->dst.ptr = (void *) register_address(
-			c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]);
+		emulate_push(ctxt);
 		break;
 	case 0x58 ... 0x5f: /* pop reg */
 	pop_instruction:

From a26bf12afb608eb5a96192eaee35fc08ffbf85aa Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Fri, 26 Sep 2008 09:30:45 +0200
Subject: [PATCH 002/140] KVM: VMX: include all IRQ window exits in statistics

irq_window_exits only tracks IRQ window exits due to user space
requests, nmi_window_exits include all exits. The latter makes more
sense, so let's adjust irq_window_exits accounting.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a4018b01e1f9..ac3453799c17 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2767,6 +2767,7 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
 
 	KVMTRACE_0D(PEND_INTR, vcpu, handler);
+	++vcpu->stat.irq_window_exits;
 
 	/*
 	 * If the user space waits to inject interrupts, exit as soon as
@@ -2775,7 +2776,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
 	if (kvm_run->request_interrupt_window &&
 	    !vcpu->arch.irq_summary) {
 		kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
-		++vcpu->stat.irq_window_exits;
 		return 0;
 	}
 	return 1;

From e4a41889ece6c95f390a7fa3a94255ab62470968 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Fri, 26 Sep 2008 09:30:46 +0200
Subject: [PATCH 003/140] KVM: VMX: Use INTR_TYPE_NMI_INTR instead of magic
 value

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ac3453799c17..81cf12b8d12a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2492,7 +2492,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
 	}
 
-	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
+	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
 		return 1;  /* already handled by vmx_vcpu_run() */
 
 	if (is_no_device(intr_info)) {
@@ -3337,7 +3337,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 
 	/* We need to handle NMIs before interrupts are enabled */
-	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200 &&
+	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
 	    (intr_info & INTR_INFO_VALID_MASK)) {
 		KVMTRACE_0D(NMI, vcpu, handler);
 		asm("int $2");

From 60637aacfd95c368e1fbc2157275d1b621b5dcdd Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Fri, 26 Sep 2008 09:30:47 +0200
Subject: [PATCH 004/140] KVM: VMX: Support for NMI task gates

Properly set GUEST_INTR_STATE_NMI and reset nmi_injected when a
task-switch vmexit happened due to a task gate being used for handling
NMIs. Also avoid the false warning about valid vectoring info in
kvm_handle_exit.

Based on original patch by Gleb Natapov.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 81cf12b8d12a..8d0fc68fd4ec 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2832,6 +2832,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	unsigned long exit_qualification;
 	u16 tss_selector;
 	int reason;
@@ -2839,6 +2840,15 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 
 	reason = (u32)exit_qualification >> 30;
+	if (reason == TASK_SWITCH_GATE && vmx->vcpu.arch.nmi_injected &&
+	    (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
+	    (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK)
+	    == INTR_TYPE_NMI_INTR) {
+		vcpu->arch.nmi_injected = false;
+		if (cpu_has_virtual_nmis())
+			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+				      GUEST_INTR_STATE_NMI);
+	}
 	tss_selector = exit_qualification;
 
 	return kvm_task_switch(vcpu, tss_selector, reason);
@@ -3012,9 +3022,11 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
 	if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
 			(exit_reason != EXIT_REASON_EXCEPTION_NMI &&
-			exit_reason != EXIT_REASON_EPT_VIOLATION))
-		printk(KERN_WARNING "%s: unexpected, valid vectoring info and "
-		       "exit reason is 0x%x\n", __func__, exit_reason);
+			exit_reason != EXIT_REASON_EPT_VIOLATION &&
+			exit_reason != EXIT_REASON_TASK_SWITCH))
+		printk(KERN_WARNING "%s: unexpected, valid vectoring info "
+		       "(0x%x) and exit reason is 0x%x\n",
+		       __func__, vectoring_info, exit_reason);
 	if (exit_reason < kvm_vmx_max_exit_handlers
 	    && kvm_vmx_exit_handlers[exit_reason])
 		return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);

From 448fa4a9c5dbc6941dd19ed09692c588d815bb06 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Fri, 26 Sep 2008 09:30:48 +0200
Subject: [PATCH 005/140] KVM: x86: Reset pending/inject NMI state on CPU reset

CPU reset invalidates pending or already injected NMIs, therefore reset
the related state variables.

Based on original patch by Gleb Natapov.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f1f8ff2f1fa2..1a71f6735593 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3925,6 +3925,9 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 
 int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
 {
+	vcpu->arch.nmi_pending = false;
+	vcpu->arch.nmi_injected = false;
+
 	return kvm_x86_ops->vcpu_reset(vcpu);
 }
 

From 33f089ca5a61f7aead26e8e1866dfc961dd88a9e Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Fri, 26 Sep 2008 09:30:49 +0200
Subject: [PATCH 006/140] KVM: VMX: refactor/fix IRQ and NMI injectability
 determination

There are currently two ways in VMX to check if an IRQ or NMI can be
injected:
 - vmx_{nmi|irq}_enabled and
 - vcpu.arch.{nmi|interrupt}_window_open.
Even worse, one test (at the end of vmx_vcpu_run) uses an inconsistent,
likely incorrect logic.

This patch consolidates and unifies the tests over
{nmi|interrupt}_window_open as cache + vmx_update_window_states
for updating the cache content.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/vmx.c              | 46 +++++++++++++++------------------
 2 files changed, 22 insertions(+), 25 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8346be87cfa1..bfbbdea869bf 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -327,6 +327,7 @@ struct kvm_vcpu_arch {
 
 	bool nmi_pending;
 	bool nmi_injected;
+	bool nmi_window_open;
 
 	u64 mtrr[0x100];
 };
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8d0fc68fd4ec..f0866e1d20ee 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2362,6 +2362,21 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
 }
 
+static void vmx_update_window_states(struct kvm_vcpu *vcpu)
+{
+	u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+
+	vcpu->arch.nmi_window_open =
+		!(guest_intr & (GUEST_INTR_STATE_STI |
+				GUEST_INTR_STATE_MOV_SS |
+				GUEST_INTR_STATE_NMI));
+
+	vcpu->arch.interrupt_window_open =
+		((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+		 !(guest_intr & (GUEST_INTR_STATE_STI |
+				 GUEST_INTR_STATE_MOV_SS)));
+}
+
 static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
 {
 	int word_index = __ffs(vcpu->arch.irq_summary);
@@ -2374,15 +2389,12 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
 	kvm_queue_interrupt(vcpu, irq);
 }
 
-
 static void do_interrupt_requests(struct kvm_vcpu *vcpu,
 				       struct kvm_run *kvm_run)
 {
 	u32 cpu_based_vm_exec_control;
 
-	vcpu->arch.interrupt_window_open =
-		((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
-		 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
+	vmx_update_window_states(vcpu);
 
 	if (vcpu->arch.interrupt_window_open &&
 	    vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
@@ -3075,22 +3087,6 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
 }
 
-static int vmx_nmi_enabled(struct kvm_vcpu *vcpu)
-{
-	u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
-	return !(guest_intr & (GUEST_INTR_STATE_NMI |
-			       GUEST_INTR_STATE_MOV_SS |
-			       GUEST_INTR_STATE_STI));
-}
-
-static int vmx_irq_enabled(struct kvm_vcpu *vcpu)
-{
-	u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
-	return (!(guest_intr & (GUEST_INTR_STATE_MOV_SS |
-			       GUEST_INTR_STATE_STI)) &&
-		(vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF));
-}
-
 static void enable_intr_window(struct kvm_vcpu *vcpu)
 {
 	if (vcpu->arch.nmi_pending)
@@ -3159,11 +3155,13 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 {
 	update_tpr_threshold(vcpu);
 
+	vmx_update_window_states(vcpu);
+
 	if (cpu_has_virtual_nmis()) {
 		if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
 			if (vcpu->arch.interrupt.pending) {
 				enable_nmi_window(vcpu);
-			} else if (vmx_nmi_enabled(vcpu)) {
+			} else if (vcpu->arch.nmi_window_open) {
 				vcpu->arch.nmi_pending = false;
 				vcpu->arch.nmi_injected = true;
 			} else {
@@ -3178,7 +3176,7 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 		}
 	}
 	if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) {
-		if (vmx_irq_enabled(vcpu))
+		if (vcpu->arch.interrupt_window_open)
 			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
 		else
 			enable_irq_window(vcpu);
@@ -3339,9 +3337,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if (vmx->rmode.irq.pending)
 		fixup_rmode_irq(vmx);
 
-	vcpu->arch.interrupt_window_open =
-		(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
-		 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)) == 0;
+	vmx_update_window_states(vcpu);
 
 	asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
 	vmx->launched = 1;

From f460ee43e250b675376246b1c4c9fe9b9af4ab16 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Fri, 26 Sep 2008 09:30:50 +0200
Subject: [PATCH 007/140] KVM: VMX: refactor IRQ and NMI window enabling

do_interrupt_requests and vmx_intr_assist go different way for
achieving the same: enabling the nmi/irq window start notification.
Unify their code over enable_{irq|nmi}_window, get rid of a redundant
call to enable_intr_window instead of direct enable_nmi_window
invocation and unroll enable_intr_window for both in-kernel and user
space irq injection accordingly.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 78 +++++++++++++++++++---------------------------
 1 file changed, 32 insertions(+), 46 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f0866e1d20ee..440f56cd4bdd 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2389,30 +2389,42 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
 	kvm_queue_interrupt(vcpu, irq);
 }
 
-static void do_interrupt_requests(struct kvm_vcpu *vcpu,
-				       struct kvm_run *kvm_run)
+static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
 	u32 cpu_based_vm_exec_control;
 
-	vmx_update_window_states(vcpu);
+	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+}
 
-	if (vcpu->arch.interrupt_window_open &&
-	    vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
-		kvm_do_inject_irq(vcpu);
+static void enable_nmi_window(struct kvm_vcpu *vcpu)
+{
+	u32 cpu_based_vm_exec_control;
 
-	if (vcpu->arch.interrupt_window_open && vcpu->arch.interrupt.pending)
-		vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
+	if (!cpu_has_virtual_nmis())
+		return;
 
 	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+}
+
+static void do_interrupt_requests(struct kvm_vcpu *vcpu,
+				       struct kvm_run *kvm_run)
+{
+	vmx_update_window_states(vcpu);
+
+	if (vcpu->arch.interrupt_window_open) {
+		if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
+			kvm_do_inject_irq(vcpu);
+
+		if (vcpu->arch.interrupt.pending)
+			vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
+	}
 	if (!vcpu->arch.interrupt_window_open &&
 	    (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
-		/*
-		 * Interrupts blocked.  Wait for unblock.
-		 */
-		cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
-	else
-		cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
-	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+		enable_irq_window(vcpu);
 }
 
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -3066,35 +3078,6 @@ static void update_tpr_threshold(struct kvm_vcpu *vcpu)
 	vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4);
 }
 
-static void enable_irq_window(struct kvm_vcpu *vcpu)
-{
-	u32 cpu_based_vm_exec_control;
-
-	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
-	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-}
-
-static void enable_nmi_window(struct kvm_vcpu *vcpu)
-{
-	u32 cpu_based_vm_exec_control;
-
-	if (!cpu_has_virtual_nmis())
-		return;
-
-	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
-	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-}
-
-static void enable_intr_window(struct kvm_vcpu *vcpu)
-{
-	if (vcpu->arch.nmi_pending)
-		enable_nmi_window(vcpu);
-	else if (kvm_cpu_has_interrupt(vcpu))
-		enable_irq_window(vcpu);
-}
-
 static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 {
 	u32 exit_intr_info;
@@ -3165,13 +3148,16 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 				vcpu->arch.nmi_pending = false;
 				vcpu->arch.nmi_injected = true;
 			} else {
-				enable_intr_window(vcpu);
+				enable_nmi_window(vcpu);
 				return;
 			}
 		}
 		if (vcpu->arch.nmi_injected) {
 			vmx_inject_nmi(vcpu);
-			enable_intr_window(vcpu);
+			if (vcpu->arch.nmi_pending)
+				enable_nmi_window(vcpu);
+			else if (kvm_cpu_has_interrupt(vcpu))
+				enable_irq_window(vcpu);
 			return;
 		}
 	}

From 66a5a347c2690db4c0756524a8eb5a05e0437aa8 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Fri, 26 Sep 2008 09:30:51 +0200
Subject: [PATCH 008/140] KVM: VMX: fix real-mode NMI support

Fix NMI injection in real-mode with the same pattern we perform IRQ
injection.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 440f56cd4bdd..38d138566617 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2358,6 +2358,19 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
 
 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 {
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (vcpu->arch.rmode.active) {
+		vmx->rmode.irq.pending = true;
+		vmx->rmode.irq.vector = NMI_VECTOR;
+		vmx->rmode.irq.rip = kvm_rip_read(vcpu);
+		vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
+			     NMI_VECTOR | INTR_TYPE_SOFT_INTR |
+			     INTR_INFO_VALID_MASK);
+		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
+		kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
+		return;
+	}
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
 			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
 }

From 23930f9521c9c4d4aa96cdb9d1e1703f3782bb94 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Fri, 26 Sep 2008 09:30:52 +0200
Subject: [PATCH 009/140] KVM: x86: Enable NMI Watchdog via in-kernel PIT
 source

LINT0 of the LAPIC can be used to route PIT events as NMI watchdog ticks
into the guest. This patch aligns the in-kernel irqchip emulation with
the user space irqchip with already supports this feature. The trick is
to route PIT interrupts to all LAPIC's LVT0 lines.

Rebased and slightly polished patch originally posted by Sheng Yang.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Sheng Yang <sheng.yang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/i8254.c | 15 +++++++++++++++
 arch/x86/kvm/irq.h   |  1 +
 arch/x86/kvm/lapic.c | 34 +++++++++++++++++++++++++++++-----
 3 files changed, 45 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 59ebd37ad79e..580cc1d01c7d 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -603,10 +603,25 @@ void kvm_free_pit(struct kvm *kvm)
 
 static void __inject_pit_timer_intr(struct kvm *kvm)
 {
+	struct kvm_vcpu *vcpu;
+	int i;
+
 	mutex_lock(&kvm->lock);
 	kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
 	kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
 	mutex_unlock(&kvm->lock);
+
+	/*
+	 * Provides NMI watchdog support in IOAPIC mode.
+	 * The route is: PIT -> PIC -> LVT0 in NMI mode,
+	 * timer IRQs will continue to flow through the IOAPIC.
+	 */
+	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+		vcpu = kvm->vcpus[i];
+		if (!vcpu)
+			continue;
+		kvm_apic_local_deliver(vcpu, APIC_LVT0);
+	}
 }
 
 void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index f17c8f5bbf31..71e37a530cf7 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -87,6 +87,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s);
 void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
 void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
+int kvm_apic_local_deliver(struct kvm_vcpu *vcpu, int lvt_type);
 void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
 void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu);
 void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 0fc3cab48943..206cc11a1c97 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -380,6 +380,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 		}
 		break;
 
+	case APIC_DM_EXTINT:
+		/*
+		 * Should only be called by kvm_apic_local_deliver() with LVT0,
+		 * before NMI watchdog was enabled. Already handled by
+		 * kvm_apic_accept_pic_intr().
+		 */
+		break;
+
 	default:
 		printk(KERN_ERR "TODO: unsupported delivery mode %x\n",
 		       delivery_mode);
@@ -743,10 +751,13 @@ static void apic_mmio_write(struct kvm_io_device *this,
 		apic_set_reg(apic, APIC_ICR2, val & 0xff000000);
 		break;
 
+	case APIC_LVT0:
+		if (val == APIC_DM_NMI)
+			apic_debug("Receive NMI setting on APIC_LVT0 "
+				"for cpu %d\n", apic->vcpu->vcpu_id);
 	case APIC_LVTT:
 	case APIC_LVTTHMR:
 	case APIC_LVTPC:
-	case APIC_LVT0:
 	case APIC_LVT1:
 	case APIC_LVTERR:
 		/* TODO: Check vector */
@@ -961,12 +972,25 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
-static int __inject_apic_timer_irq(struct kvm_lapic *apic)
+int kvm_apic_local_deliver(struct kvm_vcpu *vcpu, int lvt_type)
 {
-	int vector;
+	struct kvm_lapic *apic = vcpu->arch.apic;
+	int vector, mode, trig_mode;
+	u32 reg;
 
-	vector = apic_lvt_vector(apic, APIC_LVTT);
-	return __apic_accept_irq(apic, APIC_DM_FIXED, vector, 1, 0);
+	if (apic && apic_enabled(apic)) {
+		reg = apic_get_reg(apic, lvt_type);
+		vector = reg & APIC_VECTOR_MASK;
+		mode = reg & APIC_MODE_MASK;
+		trig_mode = reg & APIC_LVT_LEVEL_TRIGGER;
+		return __apic_accept_irq(apic, mode, vector, 1, trig_mode);
+	}
+	return 0;
+}
+
+static inline int __inject_apic_timer_irq(struct kvm_lapic *apic)
+{
+	return kvm_apic_local_deliver(apic->vcpu, APIC_LVTT);
 }
 
 static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)

From 0496fbb973ccc9477082e859ed0faab5acb805ba Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Fri, 26 Sep 2008 09:30:53 +0200
Subject: [PATCH 010/140] KVM: x86: VCPU with pending NMI is runnabled

Ensure that a VCPU with pending NMIs is considered runnable.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1a71f6735593..1fa9a6db633d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4130,7 +4130,8 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
 	return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
-	       || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED;
+	       || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
+	       || vcpu->arch.nmi_pending;
 }
 
 static void vcpu_kick_intr(void *info)

From 26df99c6c5807115f06d4e1abae397b7f5f3e00c Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Fri, 26 Sep 2008 09:30:54 +0200
Subject: [PATCH 011/140] KVM: Kick NMI receiving VCPU

Kick the NMI receiving VCPU in case the triggering caller runs in a
different context.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/lapic.c | 1 +
 virt/kvm/ioapic.c    | 1 +
 2 files changed, 2 insertions(+)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 206cc11a1c97..304f9ddbdd51 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -354,6 +354,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 
 	case APIC_DM_NMI:
 		kvm_inject_nmi(vcpu);
+		kvm_vcpu_kick(vcpu);
 		break;
 
 	case APIC_DM_INIT:
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 53772bb46320..c8f939c55075 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -150,6 +150,7 @@ static int ioapic_inj_irq(struct kvm_ioapic *ioapic,
 static void ioapic_inj_nmi(struct kvm_vcpu *vcpu)
 {
 	kvm_inject_nmi(vcpu);
+	kvm_vcpu_kick(vcpu);
 }
 
 static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,

From c4abb7c9cde24b7351a47328ef866e6a2bbb1ad0 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Fri, 26 Sep 2008 09:30:55 +0200
Subject: [PATCH 012/140] KVM: x86: Support for user space injected NMIs

Introduces the KVM_NMI IOCTL to the generic x86 part of KVM for
injecting NMIs from user space and also extends the statistic report
accordingly.

Based on the original patch by Sheng Yang.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Sheng Yang <sheng.yang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/x86.c              | 46 +++++++++++++++++++++++++++++++--
 include/linux/kvm.h             | 11 ++++++--
 3 files changed, 55 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index bfbbdea869bf..a40fa8478920 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -398,6 +398,7 @@ struct kvm_vcpu_stat {
 	u32 halt_exits;
 	u32 halt_wakeup;
 	u32 request_irq_exits;
+	u32 request_nmi_exits;
 	u32 irq_exits;
 	u32 host_state_reload;
 	u32 efer_reload;
@@ -406,6 +407,7 @@ struct kvm_vcpu_stat {
 	u32 insn_emulation_fail;
 	u32 hypercalls;
 	u32 irq_injections;
+	u32 nmi_injections;
 };
 
 struct descriptor_table {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1fa9a6db633d..07971451b947 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -86,6 +86,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
 	{ "hypercalls", VCPU_STAT(hypercalls) },
 	{ "request_irq", VCPU_STAT(request_irq_exits) },
+	{ "request_nmi", VCPU_STAT(request_nmi_exits) },
 	{ "irq_exits", VCPU_STAT(irq_exits) },
 	{ "host_state_reload", VCPU_STAT(host_state_reload) },
 	{ "efer_reload", VCPU_STAT(efer_reload) },
@@ -93,6 +94,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "insn_emulation", VCPU_STAT(insn_emulation) },
 	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
 	{ "irq_injections", VCPU_STAT(irq_injections) },
+	{ "nmi_injections", VCPU_STAT(nmi_injections) },
 	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
 	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
 	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -1318,6 +1320,15 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
+{
+	vcpu_load(vcpu);
+	kvm_inject_nmi(vcpu);
+	vcpu_put(vcpu);
+
+	return 0;
+}
+
 static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
 					   struct kvm_tpr_access_ctl *tac)
 {
@@ -1377,6 +1388,13 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = 0;
 		break;
 	}
+	case KVM_NMI: {
+		r = kvm_vcpu_ioctl_nmi(vcpu);
+		if (r)
+			goto out;
+		r = 0;
+		break;
+	}
 	case KVM_SET_CPUID: {
 		struct kvm_cpuid __user *cpuid_arg = argp;
 		struct kvm_cpuid cpuid;
@@ -2812,18 +2830,37 @@ static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
 		(kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
 }
 
+/*
+ * Check if userspace requested a NMI window, and that the NMI window
+ * is open.
+ *
+ * No need to exit to userspace if we already have a NMI queued.
+ */
+static int dm_request_for_nmi_injection(struct kvm_vcpu *vcpu,
+					struct kvm_run *kvm_run)
+{
+	return (!vcpu->arch.nmi_pending &&
+		kvm_run->request_nmi_window &&
+		vcpu->arch.nmi_window_open);
+}
+
 static void post_kvm_run_save(struct kvm_vcpu *vcpu,
 			      struct kvm_run *kvm_run)
 {
 	kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
 	kvm_run->cr8 = kvm_get_cr8(vcpu);
 	kvm_run->apic_base = kvm_get_apic_base(vcpu);
-	if (irqchip_in_kernel(vcpu->kvm))
+	if (irqchip_in_kernel(vcpu->kvm)) {
 		kvm_run->ready_for_interrupt_injection = 1;
-	else
+		kvm_run->ready_for_nmi_injection = 1;
+	} else {
 		kvm_run->ready_for_interrupt_injection =
 					(vcpu->arch.interrupt_window_open &&
 					 vcpu->arch.irq_summary == 0);
+		kvm_run->ready_for_nmi_injection =
+					(vcpu->arch.nmi_window_open &&
+					 vcpu->arch.nmi_pending == 0);
+	}
 }
 
 static void vapic_enter(struct kvm_vcpu *vcpu)
@@ -2999,6 +3036,11 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		}
 
 		if (r > 0) {
+			if (dm_request_for_nmi_injection(vcpu, kvm_run)) {
+				r = -EINTR;
+				kvm_run->exit_reason = KVM_EXIT_NMI;
+				++vcpu->stat.request_nmi_exits;
+			}
 			if (dm_request_for_irq_injection(vcpu, kvm_run)) {
 				r = -EINTR;
 				kvm_run->exit_reason = KVM_EXIT_INTR;
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index f18b86fa8655..44fd7fa0af2b 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -83,18 +83,22 @@ struct kvm_irqchip {
 #define KVM_EXIT_S390_SIEIC       13
 #define KVM_EXIT_S390_RESET       14
 #define KVM_EXIT_DCR              15
+#define KVM_EXIT_NMI              16
+#define KVM_EXIT_NMI_WINDOW_OPEN  17
 
 /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
 struct kvm_run {
 	/* in */
 	__u8 request_interrupt_window;
-	__u8 padding1[7];
+	__u8 request_nmi_window;
+	__u8 padding1[6];
 
 	/* out */
 	__u32 exit_reason;
 	__u8 ready_for_interrupt_injection;
 	__u8 if_flag;
-	__u8 padding2[2];
+	__u8 ready_for_nmi_injection;
+	__u8 padding2;
 
 	/* in (pre_kvm_run), out (post_kvm_run) */
 	__u64 cr8;
@@ -387,6 +391,7 @@ struct kvm_trace_rec {
 #define KVM_CAP_DEVICE_ASSIGNMENT 17
 #endif
 #define KVM_CAP_IOMMU 18
+#define KVM_CAP_NMI 19
 
 /*
  * ioctls for VM fds
@@ -458,6 +463,8 @@ struct kvm_trace_rec {
 #define KVM_S390_INITIAL_RESET    _IO(KVMIO,  0x97)
 #define KVM_GET_MP_STATE          _IOR(KVMIO,  0x98, struct kvm_mp_state)
 #define KVM_SET_MP_STATE          _IOW(KVMIO,  0x99, struct kvm_mp_state)
+/* Available with KVM_CAP_NMI */
+#define KVM_NMI                   _IO(KVMIO,  0x9a)
 
 #define KVM_TRC_INJ_VIRQ         (KVM_TRC_HANDLER + 0x02)
 #define KVM_TRC_REDELIVER_EVT    (KVM_TRC_HANDLER + 0x03)

From 487b391d6ea9b1d0e2e0440466fb3130e78c98d9 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Fri, 26 Sep 2008 09:30:56 +0200
Subject: [PATCH 013/140] KVM: VMX: Provide support for user space injected
 NMIs

This patch adds the required bits to the VMX side for user space
injected NMIs. As with the preexisting in-kernel irqchip support, the
CPU must provide the "virtual NMI" feature for proper tracking of the
NMI blocking state.

Based on the original patch by Sheng Yang.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Sheng Yang <sheng.yang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 38d138566617..f16a62c79267 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2360,6 +2360,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+	++vcpu->stat.nmi_injections;
 	if (vcpu->arch.rmode.active) {
 		vmx->rmode.irq.pending = true;
 		vmx->rmode.irq.vector = NMI_VECTOR;
@@ -2428,6 +2429,30 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
 {
 	vmx_update_window_states(vcpu);
 
+	if (cpu_has_virtual_nmis()) {
+		if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
+			if (vcpu->arch.nmi_window_open) {
+				vcpu->arch.nmi_pending = false;
+				vcpu->arch.nmi_injected = true;
+			} else {
+				enable_nmi_window(vcpu);
+				return;
+			}
+		}
+		if (vcpu->arch.nmi_injected) {
+			vmx_inject_nmi(vcpu);
+			if (vcpu->arch.nmi_pending
+			    || kvm_run->request_nmi_window)
+				enable_nmi_window(vcpu);
+			else if (vcpu->arch.irq_summary
+				 || kvm_run->request_interrupt_window)
+				enable_irq_window(vcpu);
+			return;
+		}
+		if (!vcpu->arch.nmi_window_open || kvm_run->request_nmi_window)
+			enable_nmi_window(vcpu);
+	}
+
 	if (vcpu->arch.interrupt_window_open) {
 		if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
 			kvm_do_inject_irq(vcpu);
@@ -2959,6 +2984,14 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
 	++vcpu->stat.nmi_window_exits;
 
+	/*
+	 * If the user space waits to inject a NMI, exit as soon as possible
+	 */
+	if (kvm_run->request_nmi_window && !vcpu->arch.nmi_pending) {
+		kvm_run->exit_reason = KVM_EXIT_NMI_WINDOW_OPEN;
+		return 0;
+	}
+
 	return 1;
 }
 

From 3b86cd9967242f3f3d775ee015fb814a349ed5e6 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Fri, 26 Sep 2008 09:30:57 +0200
Subject: [PATCH 014/140] KVM: VMX: work around lacking VNMI support

Older VMX supporting CPUs do not provide the "Virtual NMI" feature for
tracking the NMI-blocked state after injecting such events. For now
KVM is unable to inject NMIs on those CPUs.

Derived from Sheng Yang's suggestion to use the IRQ window notification
for detecting the end of NMI handlers, this patch implements virtual
NMI support without impact on the host's ability to receive real NMIs.
The downside is that the given approach requires some heuristics that
can cause NMI nesting in vary rare corner cases.

The approach works as follows:
 - inject NMI and set a software-based NMI-blocked flag
 - arm the IRQ window start notification whenever an NMI window is
   requested
 - if the guest exits due to an opening IRQ window, clear the emulated
   NMI-blocked flag
 - if the guest net execution time with NMI-blocked but without an IRQ
   window exceeds 1 second, force NMI-blocked reset and inject anyway

This approach covers most practical scenarios:
 - succeeding NMIs are seperated by at least one open IRQ window
 - the guest may spin with IRQs disabled (e.g. due to a bug), but
   leaving the NMI handler takes much less time than one second
 - the guest does not rely on strict ordering or timing of NMIs
   (would be problematic in virtualized environments anyway)

Successfully tested with the 'nmi n' monitor command, the kgdbts
testsuite on smp guests (additional patches required to add debug
register support to kvm) + the kernel's nmi_watchdog=1, and a Siemens-
specific board emulation (+ guest) that comes with its own NMI
watchdog mechanism.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 176 +++++++++++++++++++++++++++++----------------
 1 file changed, 116 insertions(+), 60 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f16a62c79267..2180109d794c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -90,6 +90,11 @@ struct vcpu_vmx {
 	} rmode;
 	int vpid;
 	bool emulation_required;
+
+	/* Support for vnmi-less CPUs */
+	int soft_vnmi_blocked;
+	ktime_t entry_time;
+	s64 vnmi_blocked_time;
 };
 
 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -2230,6 +2235,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 
 	vmx->vcpu.arch.rmode.active = 0;
 
+	vmx->soft_vnmi_blocked = 0;
+
 	vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
 	kvm_set_cr8(&vmx->vcpu, 0);
 	msr = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
@@ -2335,6 +2342,29 @@ out:
 	return ret;
 }
 
+static void enable_irq_window(struct kvm_vcpu *vcpu)
+{
+	u32 cpu_based_vm_exec_control;
+
+	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+}
+
+static void enable_nmi_window(struct kvm_vcpu *vcpu)
+{
+	u32 cpu_based_vm_exec_control;
+
+	if (!cpu_has_virtual_nmis()) {
+		enable_irq_window(vcpu);
+		return;
+	}
+
+	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
+	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
+	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
+}
+
 static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2360,6 +2390,19 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
+	if (!cpu_has_virtual_nmis()) {
+		/*
+		 * Tracking the NMI-blocked state in software is built upon
+		 * finding the next open IRQ window. This, in turn, depends on
+		 * well-behaving guests: They have to keep IRQs disabled at
+		 * least as long as the NMI handler runs. Otherwise we may
+		 * cause NMI nesting, maybe breaking the guest. But as this is
+		 * highly unlikely, we can live with the residual risk.
+		 */
+		vmx->soft_vnmi_blocked = 1;
+		vmx->vnmi_blocked_time = 0;
+	}
+
 	++vcpu->stat.nmi_injections;
 	if (vcpu->arch.rmode.active) {
 		vmx->rmode.irq.pending = true;
@@ -2384,6 +2427,8 @@ static void vmx_update_window_states(struct kvm_vcpu *vcpu)
 		!(guest_intr & (GUEST_INTR_STATE_STI |
 				GUEST_INTR_STATE_MOV_SS |
 				GUEST_INTR_STATE_NMI));
+	if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
+		vcpu->arch.nmi_window_open = 0;
 
 	vcpu->arch.interrupt_window_open =
 		((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
@@ -2403,55 +2448,31 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
 	kvm_queue_interrupt(vcpu, irq);
 }
 
-static void enable_irq_window(struct kvm_vcpu *vcpu)
-{
-	u32 cpu_based_vm_exec_control;
-
-	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_INTR_PENDING;
-	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-}
-
-static void enable_nmi_window(struct kvm_vcpu *vcpu)
-{
-	u32 cpu_based_vm_exec_control;
-
-	if (!cpu_has_virtual_nmis())
-		return;
-
-	cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
-	cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
-	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
-}
-
 static void do_interrupt_requests(struct kvm_vcpu *vcpu,
 				       struct kvm_run *kvm_run)
 {
 	vmx_update_window_states(vcpu);
 
-	if (cpu_has_virtual_nmis()) {
-		if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
-			if (vcpu->arch.nmi_window_open) {
-				vcpu->arch.nmi_pending = false;
-				vcpu->arch.nmi_injected = true;
-			} else {
-				enable_nmi_window(vcpu);
-				return;
-			}
-		}
-		if (vcpu->arch.nmi_injected) {
-			vmx_inject_nmi(vcpu);
-			if (vcpu->arch.nmi_pending
-			    || kvm_run->request_nmi_window)
-				enable_nmi_window(vcpu);
-			else if (vcpu->arch.irq_summary
-				 || kvm_run->request_interrupt_window)
-				enable_irq_window(vcpu);
+	if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
+		if (vcpu->arch.nmi_window_open) {
+			vcpu->arch.nmi_pending = false;
+			vcpu->arch.nmi_injected = true;
+		} else {
+			enable_nmi_window(vcpu);
 			return;
 		}
-		if (!vcpu->arch.nmi_window_open || kvm_run->request_nmi_window)
-			enable_nmi_window(vcpu);
 	}
+	if (vcpu->arch.nmi_injected) {
+		vmx_inject_nmi(vcpu);
+		if (vcpu->arch.nmi_pending || kvm_run->request_nmi_window)
+			enable_nmi_window(vcpu);
+		else if (vcpu->arch.irq_summary
+			 || kvm_run->request_interrupt_window)
+			enable_irq_window(vcpu);
+		return;
+	}
+	if (!vcpu->arch.nmi_window_open || kvm_run->request_nmi_window)
+		enable_nmi_window(vcpu);
 
 	if (vcpu->arch.interrupt_window_open) {
 		if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
@@ -3097,6 +3118,37 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 		printk(KERN_WARNING "%s: unexpected, valid vectoring info "
 		       "(0x%x) and exit reason is 0x%x\n",
 		       __func__, vectoring_info, exit_reason);
+
+	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
+		if (vcpu->arch.interrupt_window_open) {
+			vmx->soft_vnmi_blocked = 0;
+			vcpu->arch.nmi_window_open = 1;
+		} else if (vmx->vnmi_blocked_time > 1000000000LL &&
+		    (kvm_run->request_nmi_window || vcpu->arch.nmi_pending)) {
+			/*
+			 * This CPU don't support us in finding the end of an
+			 * NMI-blocked window if the guest runs with IRQs
+			 * disabled. So we pull the trigger after 1 s of
+			 * futile waiting, but inform the user about this.
+			 */
+			printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
+			       "state on VCPU %d after 1 s timeout\n",
+			       __func__, vcpu->vcpu_id);
+			vmx->soft_vnmi_blocked = 0;
+			vmx->vcpu.arch.nmi_window_open = 1;
+		}
+
+		/*
+		 * If the user space waits to inject an NNI, exit ASAP
+		 */
+		if (vcpu->arch.nmi_window_open && kvm_run->request_nmi_window
+		    && !vcpu->arch.nmi_pending) {
+			kvm_run->exit_reason = KVM_EXIT_NMI_WINDOW_OPEN;
+			++vcpu->stat.nmi_window_exits;
+			return 0;
+		}
+	}
+
 	if (exit_reason < kvm_vmx_max_exit_handlers
 	    && kvm_vmx_exit_handlers[exit_reason])
 		return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run);
@@ -3146,7 +3198,9 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 		if (unblock_nmi && vector != DF_VECTOR)
 			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
 				      GUEST_INTR_STATE_NMI);
-	}
+	} else if (unlikely(vmx->soft_vnmi_blocked))
+		vmx->vnmi_blocked_time +=
+			ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
 
 	idt_vectoring_info = vmx->idt_vectoring_info;
 	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
@@ -3186,27 +3240,25 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 
 	vmx_update_window_states(vcpu);
 
-	if (cpu_has_virtual_nmis()) {
-		if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
-			if (vcpu->arch.interrupt.pending) {
-				enable_nmi_window(vcpu);
-			} else if (vcpu->arch.nmi_window_open) {
-				vcpu->arch.nmi_pending = false;
-				vcpu->arch.nmi_injected = true;
-			} else {
-				enable_nmi_window(vcpu);
-				return;
-			}
-		}
-		if (vcpu->arch.nmi_injected) {
-			vmx_inject_nmi(vcpu);
-			if (vcpu->arch.nmi_pending)
-				enable_nmi_window(vcpu);
-			else if (kvm_cpu_has_interrupt(vcpu))
-				enable_irq_window(vcpu);
+	if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
+		if (vcpu->arch.interrupt.pending) {
+			enable_nmi_window(vcpu);
+		} else if (vcpu->arch.nmi_window_open) {
+			vcpu->arch.nmi_pending = false;
+			vcpu->arch.nmi_injected = true;
+		} else {
+			enable_nmi_window(vcpu);
 			return;
 		}
 	}
+	if (vcpu->arch.nmi_injected) {
+		vmx_inject_nmi(vcpu);
+		if (vcpu->arch.nmi_pending)
+			enable_nmi_window(vcpu);
+		else if (kvm_cpu_has_interrupt(vcpu))
+			enable_irq_window(vcpu);
+		return;
+	}
 	if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) {
 		if (vcpu->arch.interrupt_window_open)
 			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
@@ -3255,6 +3307,10 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u32 intr_info;
 
+	/* Record the guest's net vcpu time for enforced NMI injections. */
+	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
+		vmx->entry_time = ktime_get();
+
 	/* Handle invalid guest state instead of entering VMX */
 	if (vmx->emulation_required && emulate_invalid_guest_state) {
 		handle_invalid_guest_state(vcpu, kvm_run);

From 5f179287fa02723215eecf681d812b303c243973 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 7 Oct 2008 15:42:33 +0200
Subject: [PATCH 015/140] KVM: call kvm_arch_vcpu_reset() instead of the
 kvm_x86_ops callback

Call kvm_arch_vcpu_reset() instead of directly using arch callback.
The function does additional things.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 07971451b947..a2c4b5594555 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3010,7 +3010,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		pr_debug("vcpu %d received sipi with vector # %x\n",
 			 vcpu->vcpu_id, vcpu->arch.sipi_vector);
 		kvm_lapic_reset(vcpu);
-		r = kvm_x86_ops->vcpu_reset(vcpu);
+		r = kvm_arch_vcpu_reset(vcpu);
 		if (r)
 			return r;
 		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;

From b558bc0a25c82ef2a9d2683b0beb3e4b87cea20b Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Thu, 9 Oct 2008 16:01:52 +0800
Subject: [PATCH 016/140] x86: Rename mtrr_state struct and macro names

Prepare for exporting them.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kernel/cpu/mtrr/generic.c | 8 ++++----
 arch/x86/kernel/cpu/mtrr/main.c    | 4 ++--
 arch/x86/kernel/cpu/mtrr/mtrr.h    | 7 ++++---
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 4e8d77f01eeb..90db91e15931 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -14,9 +14,9 @@
 #include <asm/pat.h>
 #include "mtrr.h"
 
-struct mtrr_state {
-	struct mtrr_var_range var_ranges[MAX_VAR_RANGES];
-	mtrr_type fixed_ranges[NUM_FIXED_RANGES];
+struct mtrr_state_type {
+	struct mtrr_var_range var_ranges[MTRR_MAX_VAR_RANGES];
+	mtrr_type fixed_ranges[MTRR_NUM_FIXED_RANGES];
 	unsigned char enabled;
 	unsigned char have_fixed;
 	mtrr_type def_type;
@@ -35,7 +35,7 @@ static struct fixed_range_block fixed_range_blocks[] = {
 };
 
 static unsigned long smp_changes_mask;
-static struct mtrr_state mtrr_state = {};
+static struct mtrr_state_type mtrr_state = {};
 static int mtrr_state_set;
 u64 mtrr_tom2;
 
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 1159e269e596..d6ec7ec30274 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -49,7 +49,7 @@
 
 u32 num_var_ranges = 0;
 
-unsigned int mtrr_usage_table[MAX_VAR_RANGES];
+unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
 static DEFINE_MUTEX(mtrr_mutex);
 
 u64 size_or_mask, size_and_mask;
@@ -574,7 +574,7 @@ struct mtrr_value {
 	unsigned long	lsize;
 };
 
-static struct mtrr_value mtrr_state[MAX_VAR_RANGES];
+static struct mtrr_value mtrr_state[MTRR_MAX_VAR_RANGES];
 
 static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
 {
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 2dc4ec656b23..988538202ddc 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -11,8 +11,9 @@
 #define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg))
 #define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
 
-#define NUM_FIXED_RANGES 88
-#define MAX_VAR_RANGES 256
+#define MTRR_NUM_FIXED_RANGES 88
+#define MTRR_MAX_VAR_RANGES 256
+
 #define MTRRfix64K_00000_MSR 0x250
 #define MTRRfix16K_80000_MSR 0x258
 #define MTRRfix16K_A0000_MSR 0x259
@@ -33,7 +34,7 @@
    an 8 bit field: */
 typedef u8 mtrr_type;
 
-extern unsigned int mtrr_usage_table[MAX_VAR_RANGES];
+extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
 
 struct mtrr_ops {
 	u32	vendor;

From 932d27a7913fc6b3c64c6e6082628b0a1561dec9 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Thu, 9 Oct 2008 16:01:53 +0800
Subject: [PATCH 017/140] x86: Export some definition of MTRR

For KVM can reuse the type define, and need them to support shadow MTRR.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/mtrr.h        | 25 +++++++++++++++++++++++++
 arch/x86/kernel/cpu/mtrr/generic.c | 12 +++---------
 arch/x86/kernel/cpu/mtrr/mtrr.h    | 17 -----------------
 3 files changed, 28 insertions(+), 26 deletions(-)

diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h
index 7c1e4258b31e..cb988aab716d 100644
--- a/arch/x86/include/asm/mtrr.h
+++ b/arch/x86/include/asm/mtrr.h
@@ -57,6 +57,31 @@ struct mtrr_gentry {
 };
 #endif /* !__i386__ */
 
+struct mtrr_var_range {
+	u32 base_lo;
+	u32 base_hi;
+	u32 mask_lo;
+	u32 mask_hi;
+};
+
+/* In the Intel processor's MTRR interface, the MTRR type is always held in
+   an 8 bit field: */
+typedef u8 mtrr_type;
+
+#define MTRR_NUM_FIXED_RANGES 88
+#define MTRR_MAX_VAR_RANGES 256
+
+struct mtrr_state_type {
+	struct mtrr_var_range var_ranges[MTRR_MAX_VAR_RANGES];
+	mtrr_type fixed_ranges[MTRR_NUM_FIXED_RANGES];
+	unsigned char enabled;
+	unsigned char have_fixed;
+	mtrr_type def_type;
+};
+
+#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg))
+#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
+
 /*  These are the various ioctls  */
 #define MTRRIOC_ADD_ENTRY        _IOW(MTRR_IOCTL_BASE,  0, struct mtrr_sentry)
 #define MTRRIOC_SET_ENTRY        _IOW(MTRR_IOCTL_BASE,  1, struct mtrr_sentry)
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 90db91e15931..b59ddcc88cd8 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -14,14 +14,6 @@
 #include <asm/pat.h>
 #include "mtrr.h"
 
-struct mtrr_state_type {
-	struct mtrr_var_range var_ranges[MTRR_MAX_VAR_RANGES];
-	mtrr_type fixed_ranges[MTRR_NUM_FIXED_RANGES];
-	unsigned char enabled;
-	unsigned char have_fixed;
-	mtrr_type def_type;
-};
-
 struct fixed_range_block {
 	int base_msr; /* start address of an MTRR block */
 	int ranges;   /* number of MTRRs in this block  */
@@ -35,10 +27,12 @@ static struct fixed_range_block fixed_range_blocks[] = {
 };
 
 static unsigned long smp_changes_mask;
-static struct mtrr_state_type mtrr_state = {};
 static int mtrr_state_set;
 u64 mtrr_tom2;
 
+struct mtrr_state_type mtrr_state = {};
+EXPORT_SYMBOL_GPL(mtrr_state);
+
 #undef MODULE_PARAM_PREFIX
 #define MODULE_PARAM_PREFIX "mtrr."
 
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 988538202ddc..ffd60409cc6d 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -8,12 +8,6 @@
 #define MTRRcap_MSR     0x0fe
 #define MTRRdefType_MSR 0x2ff
 
-#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg))
-#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
-
-#define MTRR_NUM_FIXED_RANGES 88
-#define MTRR_MAX_VAR_RANGES 256
-
 #define MTRRfix64K_00000_MSR 0x250
 #define MTRRfix16K_80000_MSR 0x258
 #define MTRRfix16K_A0000_MSR 0x259
@@ -30,10 +24,6 @@
 #define MTRR_CHANGE_MASK_VARIABLE  0x02
 #define MTRR_CHANGE_MASK_DEFTYPE   0x04
 
-/* In the Intel processor's MTRR interface, the MTRR type is always held in
-   an 8 bit field: */
-typedef u8 mtrr_type;
-
 extern unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES];
 
 struct mtrr_ops {
@@ -71,13 +61,6 @@ struct set_mtrr_context {
 	u32 ccr3;
 };
 
-struct mtrr_var_range {
-	u32 base_lo;
-	u32 base_hi;
-	u32 mask_lo;
-	u32 mask_hi;
-};
-
 void set_mtrr_done(struct set_mtrr_context *ctxt);
 void set_mtrr_cache_disable(struct set_mtrr_context *ctxt);
 void set_mtrr_prepare_save(struct set_mtrr_context *ctxt);

From 0bed3b568b68e5835ef5da888a372b9beabf7544 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Thu, 9 Oct 2008 16:01:54 +0800
Subject: [PATCH 018/140] KVM: Improve MTRR structure

As well as reset mmu context when set MTRR.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  5 ++-
 arch/x86/kvm/x86.c              | 61 +++++++++++++++++++++++++++++++--
 2 files changed, 63 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a40fa8478920..8082e87f628d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -21,6 +21,7 @@
 
 #include <asm/pvclock-abi.h>
 #include <asm/desc.h>
+#include <asm/mtrr.h>
 
 #define KVM_MAX_VCPUS 16
 #define KVM_MEMORY_SLOTS 32
@@ -86,6 +87,7 @@
 #define KVM_MIN_FREE_MMU_PAGES 5
 #define KVM_REFILL_PAGES 25
 #define KVM_MAX_CPUID_ENTRIES 40
+#define KVM_NR_FIXED_MTRR_REGION 88
 #define KVM_NR_VAR_MTRR 8
 
 extern spinlock_t kvm_lock;
@@ -329,7 +331,8 @@ struct kvm_vcpu_arch {
 	bool nmi_injected;
 	bool nmi_window_open;
 
-	u64 mtrr[0x100];
+	struct mtrr_state_type mtrr_state;
+	u32 pat;
 };
 
 struct kvm_mem_alias {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a2c4b5594555..f5b2334c6bda 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -39,6 +39,7 @@
 #include <asm/uaccess.h>
 #include <asm/msr.h>
 #include <asm/desc.h>
+#include <asm/mtrr.h>
 
 #define MAX_IO_MSRS 256
 #define CR0_RESERVED_BITS						\
@@ -650,10 +651,38 @@ static bool msr_mtrr_valid(unsigned msr)
 
 static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 {
+	u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
+
 	if (!msr_mtrr_valid(msr))
 		return 1;
 
-	vcpu->arch.mtrr[msr - 0x200] = data;
+	if (msr == MSR_MTRRdefType) {
+		vcpu->arch.mtrr_state.def_type = data;
+		vcpu->arch.mtrr_state.enabled = (data & 0xc00) >> 10;
+	} else if (msr == MSR_MTRRfix64K_00000)
+		p[0] = data;
+	else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
+		p[1 + msr - MSR_MTRRfix16K_80000] = data;
+	else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
+		p[3 + msr - MSR_MTRRfix4K_C0000] = data;
+	else if (msr == MSR_IA32_CR_PAT)
+		vcpu->arch.pat = data;
+	else {	/* Variable MTRRs */
+		int idx, is_mtrr_mask;
+		u64 *pt;
+
+		idx = (msr - 0x200) / 2;
+		is_mtrr_mask = msr - 0x200 - 2 * idx;
+		if (!is_mtrr_mask)
+			pt =
+			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
+		else
+			pt =
+			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
+		*pt = data;
+	}
+
+	kvm_mmu_reset_context(vcpu);
 	return 0;
 }
 
@@ -749,10 +778,37 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 
 static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 {
+	u64 *p = (u64 *)&vcpu->arch.mtrr_state.fixed_ranges;
+
 	if (!msr_mtrr_valid(msr))
 		return 1;
 
-	*pdata = vcpu->arch.mtrr[msr - 0x200];
+	if (msr == MSR_MTRRdefType)
+		*pdata = vcpu->arch.mtrr_state.def_type +
+			 (vcpu->arch.mtrr_state.enabled << 10);
+	else if (msr == MSR_MTRRfix64K_00000)
+		*pdata = p[0];
+	else if (msr == MSR_MTRRfix16K_80000 || msr == MSR_MTRRfix16K_A0000)
+		*pdata = p[1 + msr - MSR_MTRRfix16K_80000];
+	else if (msr >= MSR_MTRRfix4K_C0000 && msr <= MSR_MTRRfix4K_F8000)
+		*pdata = p[3 + msr - MSR_MTRRfix4K_C0000];
+	else if (msr == MSR_IA32_CR_PAT)
+		*pdata = vcpu->arch.pat;
+	else {	/* Variable MTRRs */
+		int idx, is_mtrr_mask;
+		u64 *pt;
+
+		idx = (msr - 0x200) / 2;
+		is_mtrr_mask = msr - 0x200 - 2 * idx;
+		if (!is_mtrr_mask)
+			pt =
+			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].base_lo;
+		else
+			pt =
+			  (u64 *)&vcpu->arch.mtrr_state.var_ranges[idx].mask_lo;
+		*pdata = *pt;
+	}
+
 	return 0;
 }
 
@@ -3942,6 +3998,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	/* We do fxsave: this must be aligned. */
 	BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
 
+	vcpu->arch.mtrr_state.have_fixed = 1;
 	vcpu_load(vcpu);
 	r = kvm_arch_vcpu_reset(vcpu);
 	if (r == 0)

From 468d472f3f65100d5fb88c8d45043c85b874c294 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Thu, 9 Oct 2008 16:01:55 +0800
Subject: [PATCH 019/140] KVM: VMX: Add PAT support for EPT

GUEST_PAT support is a new feature introduced by Intel Core i7 architecture.
With this, cpu would save/load guest and host PAT automatically, for EPT memory
type in guest depends on MSR_IA32_CR_PAT.

Also add save/restore for MSR_IA32_CR_PAT.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 29 ++++++++++++++++++++++++++---
 arch/x86/kvm/vmx.h |  7 +++++++
 arch/x86/kvm/x86.c |  2 +-
 3 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2180109d794c..b4c95a501cca 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -962,6 +962,13 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
 		pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data);
 
 		break;
+	case MSR_IA32_CR_PAT:
+		if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+			vmcs_write64(GUEST_IA32_PAT, data);
+			vcpu->arch.pat = data;
+			break;
+		}
+		/* Otherwise falls through to kvm_set_msr_common */
 	default:
 		vmx_load_host_state(vmx);
 		msr = find_msr_entry(vmx, msr_index);
@@ -1181,12 +1188,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 #ifdef CONFIG_X86_64
 	min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
 #endif
-	opt = 0;
+	opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT;
 	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
 				&_vmexit_control) < 0)
 		return -EIO;
 
-	min = opt = 0;
+	min = 0;
+	opt = VM_ENTRY_LOAD_IA32_PAT;
 	if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
 				&_vmentry_control) < 0)
 		return -EIO;
@@ -2092,8 +2100,9 @@ static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr)
  */
 static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 {
-	u32 host_sysenter_cs;
+	u32 host_sysenter_cs, msr_low, msr_high;
 	u32 junk;
+	u64 host_pat;
 	unsigned long a;
 	struct descriptor_table dt;
 	int i;
@@ -2181,6 +2190,20 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	rdmsrl(MSR_IA32_SYSENTER_EIP, a);
 	vmcs_writel(HOST_IA32_SYSENTER_EIP, a);   /* 22.2.3 */
 
+	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
+		rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
+		host_pat = msr_low | ((u64) msr_high << 32);
+		vmcs_write64(HOST_IA32_PAT, host_pat);
+	}
+	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
+		rdmsr(MSR_IA32_CR_PAT, msr_low, msr_high);
+		host_pat = msr_low | ((u64) msr_high << 32);
+		/* Write the default value follow host pat */
+		vmcs_write64(GUEST_IA32_PAT, host_pat);
+		/* Keep arch.pat sync with GUEST_IA32_PAT */
+		vmx->vcpu.arch.pat = host_pat;
+	}
+
 	for (i = 0; i < NR_VMX_MSR; ++i) {
 		u32 index = vmx_msr_index[i];
 		u32 data_low, data_high;
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
index ec5edc339da6..18598afe52eb 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -63,10 +63,13 @@
 
 #define VM_EXIT_HOST_ADDR_SPACE_SIZE            0x00000200
 #define VM_EXIT_ACK_INTR_ON_EXIT                0x00008000
+#define VM_EXIT_SAVE_IA32_PAT			0x00040000
+#define VM_EXIT_LOAD_IA32_PAT			0x00080000
 
 #define VM_ENTRY_IA32E_MODE                     0x00000200
 #define VM_ENTRY_SMM                            0x00000400
 #define VM_ENTRY_DEACT_DUAL_MONITOR             0x00000800
+#define VM_ENTRY_LOAD_IA32_PAT			0x00004000
 
 /* VMCS Encodings */
 enum vmcs_field {
@@ -112,6 +115,8 @@ enum vmcs_field {
 	VMCS_LINK_POINTER_HIGH          = 0x00002801,
 	GUEST_IA32_DEBUGCTL             = 0x00002802,
 	GUEST_IA32_DEBUGCTL_HIGH        = 0x00002803,
+	GUEST_IA32_PAT			= 0x00002804,
+	GUEST_IA32_PAT_HIGH		= 0x00002805,
 	GUEST_PDPTR0                    = 0x0000280a,
 	GUEST_PDPTR0_HIGH               = 0x0000280b,
 	GUEST_PDPTR1                    = 0x0000280c,
@@ -120,6 +125,8 @@ enum vmcs_field {
 	GUEST_PDPTR2_HIGH               = 0x0000280f,
 	GUEST_PDPTR3                    = 0x00002810,
 	GUEST_PDPTR3_HIGH               = 0x00002811,
+	HOST_IA32_PAT			= 0x00002c00,
+	HOST_IA32_PAT_HIGH		= 0x00002c01,
 	PIN_BASED_VM_EXEC_CONTROL       = 0x00004000,
 	CPU_BASED_VM_EXEC_CONTROL       = 0x00004002,
 	EXCEPTION_BITMAP                = 0x00004004,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f5b2334c6bda..0edf75339f3a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -452,7 +452,7 @@ static u32 msrs_to_save[] = {
 	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
 #endif
 	MSR_IA32_TIME_STAMP_COUNTER, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
-	MSR_IA32_PERF_STATUS,
+	MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT
 };
 
 static unsigned num_msrs_to_save;

From 74be52e3e6285fc6e872a2a7baea544106f399ea Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Thu, 9 Oct 2008 16:01:56 +0800
Subject: [PATCH 020/140] KVM: Add local get_mtrr_type() to support MTRR

For EPT memory type support.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 104 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 104 insertions(+)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 410ddbc1aa2e..ac2304fd173e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1393,6 +1393,110 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
 	return page;
 }
 
+/*
+ * The function is based on mtrr_type_lookup() in
+ * arch/x86/kernel/cpu/mtrr/generic.c
+ */
+static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
+			 u64 start, u64 end)
+{
+	int i;
+	u64 base, mask;
+	u8 prev_match, curr_match;
+	int num_var_ranges = KVM_NR_VAR_MTRR;
+
+	if (!mtrr_state->enabled)
+		return 0xFF;
+
+	/* Make end inclusive end, instead of exclusive */
+	end--;
+
+	/* Look in fixed ranges. Just return the type as per start */
+	if (mtrr_state->have_fixed && (start < 0x100000)) {
+		int idx;
+
+		if (start < 0x80000) {
+			idx = 0;
+			idx += (start >> 16);
+			return mtrr_state->fixed_ranges[idx];
+		} else if (start < 0xC0000) {
+			idx = 1 * 8;
+			idx += ((start - 0x80000) >> 14);
+			return mtrr_state->fixed_ranges[idx];
+		} else if (start < 0x1000000) {
+			idx = 3 * 8;
+			idx += ((start - 0xC0000) >> 12);
+			return mtrr_state->fixed_ranges[idx];
+		}
+	}
+
+	/*
+	 * Look in variable ranges
+	 * Look of multiple ranges matching this address and pick type
+	 * as per MTRR precedence
+	 */
+	if (!(mtrr_state->enabled & 2))
+		return mtrr_state->def_type;
+
+	prev_match = 0xFF;
+	for (i = 0; i < num_var_ranges; ++i) {
+		unsigned short start_state, end_state;
+
+		if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11)))
+			continue;
+
+		base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) +
+		       (mtrr_state->var_ranges[i].base_lo & PAGE_MASK);
+		mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) +
+		       (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK);
+
+		start_state = ((start & mask) == (base & mask));
+		end_state = ((end & mask) == (base & mask));
+		if (start_state != end_state)
+			return 0xFE;
+
+		if ((start & mask) != (base & mask))
+			continue;
+
+		curr_match = mtrr_state->var_ranges[i].base_lo & 0xff;
+		if (prev_match == 0xFF) {
+			prev_match = curr_match;
+			continue;
+		}
+
+		if (prev_match == MTRR_TYPE_UNCACHABLE ||
+		    curr_match == MTRR_TYPE_UNCACHABLE)
+			return MTRR_TYPE_UNCACHABLE;
+
+		if ((prev_match == MTRR_TYPE_WRBACK &&
+		     curr_match == MTRR_TYPE_WRTHROUGH) ||
+		    (prev_match == MTRR_TYPE_WRTHROUGH &&
+		     curr_match == MTRR_TYPE_WRBACK)) {
+			prev_match = MTRR_TYPE_WRTHROUGH;
+			curr_match = MTRR_TYPE_WRTHROUGH;
+		}
+
+		if (prev_match != curr_match)
+			return MTRR_TYPE_UNCACHABLE;
+	}
+
+	if (prev_match != 0xFF)
+		return prev_match;
+
+	return mtrr_state->def_type;
+}
+
+static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+	u8 mtrr;
+
+	mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT,
+			     (gfn << PAGE_SHIFT) + PAGE_SIZE);
+	if (mtrr == 0xfe || mtrr == 0xff)
+		mtrr = MTRR_TYPE_WRBACK;
+	return mtrr;
+}
+
 static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
 	unsigned index;

From 64d4d521757117aa5c1cfe79d3baa6cf57703f81 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Thu, 9 Oct 2008 16:01:57 +0800
Subject: [PATCH 021/140] KVM: Enable MTRR for EPT

The effective memory type of EPT is the mixture of MSR_IA32_CR_PAT and memory
type field of EPT entry.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  3 ++-
 arch/x86/kvm/mmu.c              | 11 ++++++++++-
 arch/x86/kvm/svm.c              |  6 ++++++
 arch/x86/kvm/vmx.c              | 10 ++++++++--
 arch/x86/kvm/x86.c              |  2 +-
 5 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8082e87f628d..93040b5eed96 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -483,6 +483,7 @@ struct kvm_x86_ops {
 
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 	int (*get_tdp_level)(void);
+	int (*get_mt_mask_shift)(void);
 };
 
 extern struct kvm_x86_ops *kvm_x86_ops;
@@ -496,7 +497,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu);
 void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
 void kvm_mmu_set_base_ptes(u64 base_pte);
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-		u64 dirty_mask, u64 nx_mask, u64 x_mask);
+		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask);
 
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ac2304fd173e..09d05f57bf66 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -168,6 +168,7 @@ static u64 __read_mostly shadow_x_mask;	/* mutual exclusive with nx_mask */
 static u64 __read_mostly shadow_user_mask;
 static u64 __read_mostly shadow_accessed_mask;
 static u64 __read_mostly shadow_dirty_mask;
+static u64 __read_mostly shadow_mt_mask;
 
 void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
 {
@@ -183,13 +184,14 @@ void kvm_mmu_set_base_ptes(u64 base_pte)
 EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
 
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-		u64 dirty_mask, u64 nx_mask, u64 x_mask)
+		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask)
 {
 	shadow_user_mask = user_mask;
 	shadow_accessed_mask = accessed_mask;
 	shadow_dirty_mask = dirty_mask;
 	shadow_nx_mask = nx_mask;
 	shadow_x_mask = x_mask;
+	shadow_mt_mask = mt_mask;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
@@ -1546,6 +1548,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 {
 	u64 spte;
 	int ret = 0;
+	u64 mt_mask = shadow_mt_mask;
+
 	/*
 	 * We don't set the accessed bit, since we sometimes want to see
 	 * whether the guest actually used the pte (in order to detect
@@ -1564,6 +1568,11 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 		spte |= shadow_user_mask;
 	if (largepage)
 		spte |= PT_PAGE_SIZE_MASK;
+	if (mt_mask) {
+		mt_mask = get_memory_type(vcpu, gfn) <<
+			  kvm_x86_ops->get_mt_mask_shift();
+		spte |= mt_mask;
+	}
 
 	spte |= (u64)pfn << PAGE_SHIFT;
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 9c4ce657d963..05efc4ef75a6 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1912,6 +1912,11 @@ static int get_npt_level(void)
 #endif
 }
 
+static int svm_get_mt_mask_shift(void)
+{
+	return 0;
+}
+
 static struct kvm_x86_ops svm_x86_ops = {
 	.cpu_has_kvm_support = has_svm,
 	.disabled_by_bios = is_disabled,
@@ -1967,6 +1972,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 
 	.set_tss_addr = svm_set_tss_addr,
 	.get_tdp_level = get_npt_level,
+	.get_mt_mask_shift = svm_get_mt_mask_shift,
 };
 
 static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index b4c95a501cca..dae134fa09e7 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3574,6 +3574,11 @@ static int get_ept_level(void)
 	return VMX_EPT_DEFAULT_GAW + 1;
 }
 
+static int vmx_get_mt_mask_shift(void)
+{
+	return VMX_EPT_MT_EPTE_SHIFT;
+}
+
 static struct kvm_x86_ops vmx_x86_ops = {
 	.cpu_has_kvm_support = cpu_has_kvm_support,
 	.disabled_by_bios = vmx_disabled_by_bios,
@@ -3629,6 +3634,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 
 	.set_tss_addr = vmx_set_tss_addr,
 	.get_tdp_level = get_ept_level,
+	.get_mt_mask_shift = vmx_get_mt_mask_shift,
 };
 
 static int __init vmx_init(void)
@@ -3685,10 +3691,10 @@ static int __init vmx_init(void)
 		bypass_guest_pf = 0;
 		kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
 			VMX_EPT_WRITABLE_MASK |
-			VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT |
 			VMX_EPT_IGMT_BIT);
 		kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
-				VMX_EPT_EXECUTABLE_MASK);
+				VMX_EPT_EXECUTABLE_MASK,
+				VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
 		kvm_enable_tdp();
 	} else
 		kvm_disable_tdp();
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0edf75339f3a..f175b796c2a6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2615,7 +2615,7 @@ int kvm_arch_init(void *opaque)
 	kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
 	kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
 	kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
-			PT_DIRTY_MASK, PT64_NX_MASK, 0);
+			PT_DIRTY_MASK, PT64_NX_MASK, 0, 0);
 	return 0;
 
 out:

From d73fa29a9b75b2af7f69dae276d2c602a23b329b Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Tue, 14 Oct 2008 15:59:10 +0800
Subject: [PATCH 022/140] KVM: Clean up kvm_x86_emulate.h

Remove one left improper comment of removed CR2.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_x86_emulate.h | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_x86_emulate.h
index 25179a29f208..16a002655f31 100644
--- a/arch/x86/include/asm/kvm_x86_emulate.h
+++ b/arch/x86/include/asm/kvm_x86_emulate.h
@@ -146,22 +146,18 @@ struct x86_emulate_ctxt {
 	/* Register state before/after emulation. */
 	struct kvm_vcpu *vcpu;
 
-	/* Linear faulting address (if emulating a page-faulting instruction) */
 	unsigned long eflags;
-
 	/* Emulated execution mode, represented by an X86EMUL_MODE value. */
 	int mode;
-
 	u32 cs_base;
 
 	/* decode cache */
-
 	struct decode_cache decode;
 };
 
 /* Repeat String Operation Prefix */
-#define REPE_PREFIX  1
-#define REPNE_PREFIX    2
+#define REPE_PREFIX	1
+#define REPNE_PREFIX	2
 
 /* Execution mode, passed to the emulator. */
 #define X86EMUL_MODE_REAL     0	/* Real mode.             */
@@ -170,7 +166,7 @@ struct x86_emulate_ctxt {
 #define X86EMUL_MODE_PROT64   8	/* 64-bit (long) mode.    */
 
 /* Host execution mode. */
-#if defined(__i386__)
+#if defined(CONFIG_X86_32)
 #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32
 #elif defined(CONFIG_X86_64)
 #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64

From 291f26bc0f89518ad7ee3207c09eb8a743ac8fcc Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Thu, 16 Oct 2008 17:30:57 +0800
Subject: [PATCH 023/140] KVM: MMU: Extend kvm_mmu_page->slot_bitmap size

Otherwise set_bit() for private memory slot(above KVM_MEMORY_SLOTS) would
corrupted memory in 32bit host.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 8 +++++---
 arch/x86/kvm/mmu.c              | 6 +++---
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 93040b5eed96..59c3ae10de6c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -192,9 +192,11 @@ struct kvm_mmu_page {
 	u64 *spt;
 	/* hold the gfn of each spte inside spt */
 	gfn_t *gfns;
-	unsigned long slot_bitmap; /* One bit set per slot which has memory
-				    * in this shadow page.
-				    */
+	/*
+	 * One bit set per slot which has memory
+	 * in this shadow page.
+	 */
+	DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
 	int multimapped;         /* More than one parent_pte? */
 	int root_count;          /* Currently serving as active root */
 	bool unsync;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 09d05f57bf66..8687758b5295 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -789,7 +789,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
 	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
 	ASSERT(is_empty_shadow_page(sp->spt));
-	sp->slot_bitmap = 0;
+	bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
 	sp->multimapped = 0;
 	sp->parent_pte = parent_pte;
 	--vcpu->kvm->arch.n_free_mmu_pages;
@@ -1364,7 +1364,7 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
 	int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn));
 	struct kvm_mmu_page *sp = page_header(__pa(pte));
 
-	__set_bit(slot, &sp->slot_bitmap);
+	__set_bit(slot, sp->slot_bitmap);
 }
 
 static void mmu_convert_notrap(struct kvm_mmu_page *sp)
@@ -2564,7 +2564,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 		int i;
 		u64 *pt;
 
-		if (!test_bit(slot, &sp->slot_bitmap))
+		if (!test_bit(slot, sp->slot_bitmap))
 			continue;
 
 		pt = sp->spt;

From 6fe639792c7b8e462baeaac39ecc33541fd5da6e Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Thu, 16 Oct 2008 17:30:58 +0800
Subject: [PATCH 024/140] KVM: VMX: Move private memory slot position

PCI device assignment would map guest MMIO spaces as separate slot, so it is
possible that the device has more than 2 MMIO spaces and overwrite current
private memslot.

The patch move private memory slot to the top of userspace visible memory slots.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 2 +-
 arch/x86/kvm/vmx.h | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index dae134fa09e7..7623eb7b68d5 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2513,7 +2513,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
 {
 	int ret;
 	struct kvm_userspace_memory_region tss_mem = {
-		.slot = 8,
+		.slot = TSS_PRIVATE_MEMSLOT,
 		.guest_phys_addr = addr,
 		.memory_size = PAGE_SIZE * 3,
 		.flags = 0,
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
index 18598afe52eb..3db236c26fa4 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -338,8 +338,9 @@ enum vmcs_field {
 
 #define AR_RESERVD_MASK 0xfffe0f00
 
-#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT	9
-#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT	10
+#define TSS_PRIVATE_MEMSLOT			(KVM_MEMORY_SLOTS + 0)
+#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT	(KVM_MEMORY_SLOTS + 1)
+#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT	(KVM_MEMORY_SLOTS + 2)
 
 #define VMX_NR_VPIDS				(1 << 16)
 #define VMX_VPID_EXTENT_SINGLE_CONTEXT		1

From 291fd39bfc2089c2dae79cf2d7cfca81b14ca769 Mon Sep 17 00:00:00 2001
From: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Date: Mon, 20 Oct 2008 13:11:58 +0200
Subject: [PATCH 025/140] KVM: x86 emulator: Add decode entries for 0x04 and
 0x05 opcodes (add acc, imm)

Add decode entries for 0x04 and 0x05 (ADD) opcodes, execution is already
implemented.

Signed-off-by: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86_emulate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index a391e213fe61..57d7cc45be44 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -80,7 +80,7 @@ static u16 opcode_table[256] = {
 	/* 0x00 - 0x07 */
 	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
 	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	0, 0, 0, 0,
+	ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0,
 	/* 0x08 - 0x0F */
 	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
 	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,

From 8fdb2351d51b040146f10a624387bbd102d851c0 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Mon, 20 Oct 2008 10:20:02 +0200
Subject: [PATCH 026/140] KVM: x86: Fix and refactor NMI watchdog emulation

This patch refactors the NMI watchdog delivery patch, consolidating
tests and providing a proper API for delivering watchdog events.

An included micro-optimization is to check only for apic_hw_enabled in
kvm_apic_local_deliver (the test for LVT mask is covering the
soft-disabled case already).

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Acked-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/i8254.c | 15 +++++++++------
 arch/x86/kvm/irq.h   |  2 +-
 arch/x86/kvm/lapic.c | 20 ++++++++++----------
 3 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 580cc1d01c7d..b6fcf5a9e502 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -612,15 +612,18 @@ static void __inject_pit_timer_intr(struct kvm *kvm)
 	mutex_unlock(&kvm->lock);
 
 	/*
-	 * Provides NMI watchdog support in IOAPIC mode.
-	 * The route is: PIT -> PIC -> LVT0 in NMI mode,
-	 * timer IRQs will continue to flow through the IOAPIC.
+	 * Provides NMI watchdog support via Virtual Wire mode.
+	 * The route is: PIT -> PIC -> LVT0 in NMI mode.
+	 *
+	 * Note: Our Virtual Wire implementation is simplified, only
+	 * propagating PIT interrupts to all VCPUs when they have set
+	 * LVT0 to NMI delivery. Other PIC interrupts are just sent to
+	 * VCPU0, and only if its LVT0 is in EXTINT mode.
 	 */
 	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
 		vcpu = kvm->vcpus[i];
-		if (!vcpu)
-			continue;
-		kvm_apic_local_deliver(vcpu, APIC_LVT0);
+		if (vcpu)
+			kvm_apic_nmi_wd_deliver(vcpu);
 	}
 }
 
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 71e37a530cf7..b9e9051650ea 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -87,7 +87,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s);
 void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu);
 void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu);
-int kvm_apic_local_deliver(struct kvm_vcpu *vcpu, int lvt_type);
+void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu);
 void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
 void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu);
 void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 304f9ddbdd51..0b0d413f0af5 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -973,14 +973,12 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
-int kvm_apic_local_deliver(struct kvm_vcpu *vcpu, int lvt_type)
+static int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type)
 {
-	struct kvm_lapic *apic = vcpu->arch.apic;
+	u32 reg = apic_get_reg(apic, lvt_type);
 	int vector, mode, trig_mode;
-	u32 reg;
 
-	if (apic && apic_enabled(apic)) {
-		reg = apic_get_reg(apic, lvt_type);
+	if (apic_hw_enabled(apic) && !(reg & APIC_LVT_MASKED)) {
 		vector = reg & APIC_VECTOR_MASK;
 		mode = reg & APIC_MODE_MASK;
 		trig_mode = reg & APIC_LVT_LEVEL_TRIGGER;
@@ -989,9 +987,12 @@ int kvm_apic_local_deliver(struct kvm_vcpu *vcpu, int lvt_type)
 	return 0;
 }
 
-static inline int __inject_apic_timer_irq(struct kvm_lapic *apic)
+void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
 {
-	return kvm_apic_local_deliver(apic->vcpu, APIC_LVTT);
+	struct kvm_lapic *apic = vcpu->arch.apic;
+
+	if (apic)
+		kvm_apic_local_deliver(apic, APIC_LVT0);
 }
 
 static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
@@ -1086,9 +1087,8 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
 
-	if (apic && apic_lvt_enabled(apic, APIC_LVTT) &&
-		atomic_read(&apic->timer.pending) > 0) {
-		if (__inject_apic_timer_irq(apic))
+	if (apic && atomic_read(&apic->timer.pending) > 0) {
+		if (kvm_apic_local_deliver(apic, APIC_LVTT))
 			atomic_dec(&apic->timer.pending);
 	}
 }

From cc6e462cd54e64858ea25816df87d033229efe56 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Mon, 20 Oct 2008 10:20:03 +0200
Subject: [PATCH 027/140] KVM: x86: Optimize NMI watchdog delivery

As suggested by Avi, this patch introduces a counter of VCPUs that have
LVT0 set to NMI mode. Only if the counter > 0, we push the PIT ticks via
all LAPIC LVT0 lines to enable NMI watchdog support.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Acked-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/i8254.c            | 11 ++++++-----
 arch/x86/kvm/lapic.c            | 23 ++++++++++++++++++++---
 3 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 59c3ae10de6c..09e6c56572cb 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -361,6 +361,7 @@ struct kvm_arch{
 	struct kvm_ioapic *vioapic;
 	struct kvm_pit *vpit;
 	struct hlist_head irq_ack_notifier_list;
+	int vapics_in_nmi_mode;
 
 	int round_robin_prev_vcpu;
 	unsigned int tss_addr;
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index b6fcf5a9e502..e665d1c623ca 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -620,11 +620,12 @@ static void __inject_pit_timer_intr(struct kvm *kvm)
 	 * LVT0 to NMI delivery. Other PIC interrupts are just sent to
 	 * VCPU0, and only if its LVT0 is in EXTINT mode.
 	 */
-	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-		vcpu = kvm->vcpus[i];
-		if (vcpu)
-			kvm_apic_nmi_wd_deliver(vcpu);
-	}
+	if (kvm->arch.vapics_in_nmi_mode > 0)
+		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+			vcpu = kvm->vcpus[i];
+			if (vcpu)
+				kvm_apic_nmi_wd_deliver(vcpu);
+		}
 }
 
 void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 0b0d413f0af5..afac68c0815c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -130,6 +130,11 @@ static inline int apic_lvtt_period(struct kvm_lapic *apic)
 	return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC;
 }
 
+static inline int apic_lvt_nmi_mode(u32 lvt_val)
+{
+	return (lvt_val & (APIC_MODE_MASK | APIC_LVT_MASKED)) == APIC_DM_NMI;
+}
+
 static unsigned int apic_lvt_mask[APIC_LVT_NUM] = {
 	LVT_MASK | APIC_LVT_TIMER_PERIODIC,	/* LVTT */
 	LVT_MASK | APIC_MODE_MASK,	/* LVTTHMR */
@@ -672,6 +677,20 @@ static void start_apic_timer(struct kvm_lapic *apic)
 					apic->timer.period)));
 }
 
+static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
+{
+	int nmi_wd_enabled = apic_lvt_nmi_mode(apic_get_reg(apic, APIC_LVT0));
+
+	if (apic_lvt_nmi_mode(lvt0_val)) {
+		if (!nmi_wd_enabled) {
+			apic_debug("Receive NMI setting on APIC_LVT0 "
+				   "for cpu %d\n", apic->vcpu->vcpu_id);
+			apic->vcpu->kvm->arch.vapics_in_nmi_mode++;
+		}
+	} else if (nmi_wd_enabled)
+		apic->vcpu->kvm->arch.vapics_in_nmi_mode--;
+}
+
 static void apic_mmio_write(struct kvm_io_device *this,
 			    gpa_t address, int len, const void *data)
 {
@@ -753,9 +772,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
 		break;
 
 	case APIC_LVT0:
-		if (val == APIC_DM_NMI)
-			apic_debug("Receive NMI setting on APIC_LVT0 "
-				"for cpu %d\n", apic->vcpu->vcpu_id);
+		apic_manage_nmi_watchdog(apic, val);
 	case APIC_LVTT:
 	case APIC_LVTTHMR:
 	case APIC_LVTPC:

From e19e30effac03f5a005a8e42ed941a2a5dc62654 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Mon, 20 Oct 2008 16:07:10 +0800
Subject: [PATCH 028/140] KVM: IRQ ACK notifier should be used with in-kernel
 irqchip

Also remove unnecessary parameter of unregister irq ack notifier.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 include/linux/kvm_host.h | 3 +--
 virt/kvm/irq_comm.c      | 8 ++++++--
 virt/kvm/kvm_main.c      | 2 +-
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index bb92be2153bc..3a0fb77d1f6a 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -316,8 +316,7 @@ void kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level);
 void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi);
 void kvm_register_irq_ack_notifier(struct kvm *kvm,
 				   struct kvm_irq_ack_notifier *kian);
-void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
-				     struct kvm_irq_ack_notifier *kian);
+void kvm_unregister_irq_ack_notifier(struct kvm_irq_ack_notifier *kian);
 int kvm_request_irq_source_id(struct kvm *kvm);
 void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
 
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 55ad76ee2d09..9fbbdea3d1d5 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -58,12 +58,16 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi)
 void kvm_register_irq_ack_notifier(struct kvm *kvm,
 				   struct kvm_irq_ack_notifier *kian)
 {
+	/* Must be called with in-kernel IRQ chip, otherwise it's nonsense */
+	ASSERT(irqchip_in_kernel(kvm));
+	ASSERT(kian);
 	hlist_add_head(&kian->link, &kvm->arch.irq_ack_notifier_list);
 }
 
-void kvm_unregister_irq_ack_notifier(struct kvm *kvm,
-				     struct kvm_irq_ack_notifier *kian)
+void kvm_unregister_irq_ack_notifier(struct kvm_irq_ack_notifier *kian)
 {
+	if (!kian)
+		return;
 	hlist_del(&kian->link);
 }
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a87f45edfae8..4f43abe198e4 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -143,7 +143,7 @@ static void kvm_free_assigned_device(struct kvm *kvm,
 	if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested)
 		free_irq(assigned_dev->host_irq, (void *)assigned_dev);
 
-	kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier);
+	kvm_unregister_irq_ack_notifier(&assigned_dev->ack_notifier);
 	kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
 
 	if (cancel_work_sync(&assigned_dev->interrupt_work))

From b8222ad2e52fd2c0c4e5e1c53e65d131f911b767 Mon Sep 17 00:00:00 2001
From: Amit Shah <amit.shah@redhat.com>
Date: Wed, 22 Oct 2008 16:39:47 +0530
Subject: [PATCH 029/140] KVM: x86: Fix typo in function name

get_segment_descritptor_dtable() contains an obvious type.

Signed-off-by: Amit Shah <amit.shah@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f175b796c2a6..ceeac8897143 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3373,9 +3373,9 @@ static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
 	kvm_desct->padding = 0;
 }
 
-static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu,
-					   u16 selector,
-					   struct descriptor_table *dtable)
+static void get_segment_descriptor_dtable(struct kvm_vcpu *vcpu,
+					  u16 selector,
+					  struct descriptor_table *dtable)
 {
 	if (selector & 1 << 2) {
 		struct kvm_segment kvm_seg;
@@ -3400,7 +3400,7 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
 	struct descriptor_table dtable;
 	u16 index = selector >> 3;
 
-	get_segment_descritptor_dtable(vcpu, selector, &dtable);
+	get_segment_descriptor_dtable(vcpu, selector, &dtable);
 
 	if (dtable.limit < index * 8 + 7) {
 		kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
@@ -3419,7 +3419,7 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
 	struct descriptor_table dtable;
 	u16 index = selector >> 3;
 
-	get_segment_descritptor_dtable(vcpu, selector, &dtable);
+	get_segment_descriptor_dtable(vcpu, selector, &dtable);
 
 	if (dtable.limit < index * 8 + 7)
 		return 1;

From 25022acc3dd5f0b54071c7ba7c371860f2971b52 Mon Sep 17 00:00:00 2001
From: Amit Shah <amit.shah@redhat.com>
Date: Mon, 27 Oct 2008 09:04:17 +0000
Subject: [PATCH 030/140] KVM: SVM: Set the 'g' bit of the cs selector for
 cross-vendor migration

The hardware does not set the 'g' bit of the cs selector and this breaks
migration from amd hosts to intel hosts. Set this bit if the segment
limit is beyond 1 MB.

Signed-off-by: Amit Shah <amit.shah@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 05efc4ef75a6..665008d97856 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -772,6 +772,15 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
 	var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1;
 	var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1;
 	var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1;
+
+	/*
+	 * SVM always stores 0 for the 'G' bit in the CS selector in
+	 * the VMCB on a VMEXIT. This hurts cross-vendor migration:
+	 * Intel's VMENTRY has a check on the 'G' bit.
+	 */
+	if (seg == VCPU_SREG_CS)
+		var->g = s->limit > 0xfffff;
+
 	var->unusable = !var->present;
 }
 

From c0d09828c870f90c6bc72070ada281568f89c63b Mon Sep 17 00:00:00 2001
From: Amit Shah <amit.shah@redhat.com>
Date: Mon, 27 Oct 2008 09:04:18 +0000
Subject: [PATCH 031/140] KVM: SVM: Set the 'busy' flag of the TR selector

The busy flag of the TR selector is not set by the hardware. This breaks
migration from amd hosts to intel hosts.

Signed-off-by: Amit Shah <amit.shah@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 665008d97856..743aebd7bfcc 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -781,6 +781,13 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
 	if (seg == VCPU_SREG_CS)
 		var->g = s->limit > 0xfffff;
 
+	/*
+	 * Work around a bug where the busy flag in the tr selector
+	 * isn't exposed
+	 */
+	if (seg == VCPU_SREG_TR)
+		var->type |= 0x2;
+
 	var->unusable = !var->present;
 }
 

From e93f36bcfaa9e899c595e1c446c784a69021854a Mon Sep 17 00:00:00 2001
From: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Date: Tue, 28 Oct 2008 10:51:30 +0100
Subject: [PATCH 032/140] KVM: allow emulator to adjust rip for emulated pio
 instructions

If we call the emulator we shouldn't call skip_emulated_instruction()
in the first place, since the emulator already computes the next rip
for us. Thus we move ->skip_emulated_instruction() out of
kvm_emulate_pio() and into handle_io() (and the svm equivalent). We
also replaced "return 0" by "break" in the "do_io:" case because now
the shadow register state needs to be committed. Otherwise eip will never
be updated.

Signed-off-by: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c         | 1 +
 arch/x86/kvm/vmx.c         | 1 +
 arch/x86/kvm/x86.c         | 2 --
 arch/x86/kvm/x86_emulate.c | 2 +-
 4 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 743aebd7bfcc..f0ad4d4217e4 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1115,6 +1115,7 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	rep = (io_info & SVM_IOIO_REP_MASK) != 0;
 	down = (svm->vmcb->save.rflags & X86_EFLAGS_DF) != 0;
 
+	skip_emulated_instruction(&svm->vcpu);
 	return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
 }
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7623eb7b68d5..816d23185fb8 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2687,6 +2687,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	rep = (exit_qualification & 32) != 0;
 	port = exit_qualification >> 16;
 
+	skip_emulated_instruction(vcpu);
 	return kvm_emulate_pio(vcpu, kvm_run, in, size, port);
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ceeac8897143..38f79b6aaf1e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2478,8 +2478,6 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
 	val = kvm_register_read(vcpu, VCPU_REGS_RAX);
 	memcpy(vcpu->arch.pio_data, &val, 4);
 
-	kvm_x86_ops->skip_emulated_instruction(vcpu);
-
 	pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
 	if (pio_dev) {
 		kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 57d7cc45be44..8f60ace13874 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -1772,7 +1772,7 @@ special_insn:
 			c->eip = saved_eip;
 			goto cannot_emulate;
 		}
-		return 0;
+		break;
 	case 0xf4:              /* hlt */
 		ctxt->vcpu->arch.halt_request = 1;
 		break;

From 1d5a4d9b92028d9fe77da34037bd5a1ebfecc733 Mon Sep 17 00:00:00 2001
From: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Date: Wed, 29 Oct 2008 09:39:42 +0100
Subject: [PATCH 033/140] KVM: VMX: Handle mmio emulation when guest state is
 invalid

If emulate_invalid_guest_state is enabled, the emulator is called
when guest state is invalid.  Until now, we reported an mmio failure
when emulate_instruction() returned EMULATE_DO_MMIO.  This patch adds
the case where emulate_instruction() failed and an MMIO emulation
is needed.

Signed-off-by: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 816d23185fb8..427dbc14fae9 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3052,16 +3052,12 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
 	while (!guest_state_valid(vcpu)) {
 		err = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
 
-		switch (err) {
-			case EMULATE_DONE:
-				break;
-			case EMULATE_DO_MMIO:
-				kvm_report_emulation_failure(vcpu, "mmio");
-				/* TODO: Handle MMIO */
-				return;
-			default:
-				kvm_report_emulation_failure(vcpu, "emulation failure");
-				return;
+		if (err == EMULATE_DO_MMIO)
+			break;
+
+		if (err != EMULATE_DONE) {
+			kvm_report_emulation_failure(vcpu, "emulation failure");
+			return;
 		}
 
 		if (signal_pending(current))
@@ -3073,8 +3069,10 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
 	local_irq_disable();
 	preempt_disable();
 
-	/* Guest state should be valid now, no more emulation should be needed */
-	vmx->emulation_required = 0;
+	/* Guest state should be valid now except if we need to
+	 * emulate an MMIO */
+	if (guest_state_valid(vcpu))
+		vmx->emulation_required = 0;
 }
 
 /*
@@ -3121,6 +3119,11 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
 		    (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit);
 
+	/* If we need to emulate an MMIO from handle_invalid_guest_state
+	 * we just return 0 */
+	if (vmx->emulation_required && emulate_invalid_guest_state)
+		return 0;
+
 	/* Access CR3 don't cause VMExit in paging mode, so we need
 	 * to sync with guest real CR3. */
 	if (vm_need_ept() && is_paging(vcpu)) {

From a917f7af3905953329361d29b6db78eb17b4d44c Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Thu, 23 Oct 2008 14:56:44 +0800
Subject: [PATCH 034/140] KVM: ia64: Re-organize data sturure of guests' data
 area

1. Increase the size of data area to 64M
2. Support more vcpus and memory, 128 vcpus and 256G memory are supported
   for guests.
3. Add the boundary check for memory and vcpu allocation.

With this patch, kvm guest's data area looks as follow:
  *
  *            +----------------------+  ------- KVM_VM_DATA_SIZE
  *            |     vcpu[n]'s data   |   |     ___________________KVM_STK_OFFSET
  *            |                      |   |    /                   |
  *            |        ..........    |   |   /vcpu's struct&stack |
  *            |        ..........    |   |  /---------------------|---- 0
  *            |     vcpu[5]'s data   |   | /       vpd            |
  *            |     vcpu[4]'s data   |   |/-----------------------|
  *            |     vcpu[3]'s data   |   /         vtlb           |
  *            |     vcpu[2]'s data   |  /|------------------------|
  *            |     vcpu[1]'s data   |/  |         vhpt           |
  *            |     vcpu[0]'s data   |____________________________|
  *            +----------------------+   |
  *            |    memory dirty log  |   |
  *            +----------------------+   |
  *            |    vm's data struct  |   |
  *            +----------------------+   |
  *            |                      |   |
  *            |                      |   |
  *            |                      |   |
  *            |                      |   |
  *            |                      |   |
  *            |                      |   |
  *            |                      |   |
  *            |   vm's p2m table  |      |
  *            |                      |   |
  *            |                      |   |
  *            |                      |   |  |
  * vm's data->|                      |   |  |
  *            +----------------------+ ------- 0
  * To support large memory, needs to increase the size of p2m.
  * To support more vcpus, needs to ensure it has enough space to
  * hold vcpus' data.
  */

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/include/asm/kvm_host.h | 172 ++++++++++++++++++++-----------
 arch/ia64/kvm/kvm-ia64.c         |  60 +++++------
 arch/ia64/kvm/kvm_minstate.h     |   4 +-
 arch/ia64/kvm/misc.h             |   3 +-
 arch/ia64/kvm/vcpu.c             |   5 +-
 arch/ia64/kvm/vtlb.c             |   4 +-
 6 files changed, 151 insertions(+), 97 deletions(-)

diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index c60d324da540..678e2646a500 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -23,17 +23,6 @@
 #ifndef __ASM_KVM_HOST_H
 #define __ASM_KVM_HOST_H
 
-
-#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/kvm.h>
-#include <linux/kvm_para.h>
-#include <linux/kvm_types.h>
-
-#include <asm/pal.h>
-#include <asm/sal.h>
-
-#define KVM_MAX_VCPUS 4
 #define KVM_MEMORY_SLOTS 32
 /* memory slots that does not exposed to userspace */
 #define KVM_PRIVATE_MEM_SLOTS 4
@@ -52,68 +41,127 @@
 #define EXIT_REASON_PTC_G		8
 
 /*Define vmm address space and vm data space.*/
-#define KVM_VMM_SIZE (16UL<<20)
+#define KVM_VMM_SIZE (__IA64_UL_CONST(16)<<20)
 #define KVM_VMM_SHIFT 24
-#define KVM_VMM_BASE 0xD000000000000000UL
-#define VMM_SIZE (8UL<<20)
+#define KVM_VMM_BASE 0xD000000000000000
+#define VMM_SIZE (__IA64_UL_CONST(8)<<20)
 
 /*
  * Define vm_buffer, used by PAL Services, base address.
- * Note: vmbuffer is in the VMM-BLOCK, the size must be < 8M
+ * Note: vm_buffer is in the VMM-BLOCK, the size must be < 8M
  */
 #define KVM_VM_BUFFER_BASE (KVM_VMM_BASE + VMM_SIZE)
-#define KVM_VM_BUFFER_SIZE (8UL<<20)
+#define KVM_VM_BUFFER_SIZE (__IA64_UL_CONST(8)<<20)
 
-/*Define Virtual machine data layout.*/
-#define KVM_VM_DATA_SHIFT  24
-#define KVM_VM_DATA_SIZE (1UL << KVM_VM_DATA_SHIFT)
-#define KVM_VM_DATA_BASE (KVM_VMM_BASE + KVM_VMM_SIZE)
+/*
+ * kvm guest's data area looks as follow:
+ *
+ *            +----------------------+	-------	KVM_VM_DATA_SIZE
+ *	      |	    vcpu[n]'s data   |	 |     ___________________KVM_STK_OFFSET
+ *     	      |			     |	 |    /			  |
+ *     	      |	       ..........    |	 |   /vcpu's struct&stack |
+ *     	      |	       ..........    |	 |  /---------------------|---- 0
+ *	      |	    vcpu[5]'s data   |	 | /	   vpd		  |
+ *	      |	    vcpu[4]'s data   |	 |/-----------------------|
+ *	      |	    vcpu[3]'s data   |	 /	   vtlb		  |
+ *	      |	    vcpu[2]'s data   |	/|------------------------|
+ *	      |	    vcpu[1]'s data   |/  |	   vhpt		  |
+ *	      |	    vcpu[0]'s data   |____________________________|
+ *            +----------------------+	 |
+ *	      |	   memory dirty log  |	 |
+ *            +----------------------+	 |
+ *	      |	   vm's data struct  |	 |
+ *            +----------------------+	 |
+ *	      |			     |	 |
+ *	      |			     |	 |
+ *	      |			     |	 |
+ *	      |			     |	 |
+ *	      |			     |	 |
+ *	      |			     |	 |
+ *	      |			     |	 |
+ *	      |	  vm's p2m table  |	 |
+ *	      |			     |	 |
+ *            |			     |	 |
+ *	      |			     |	 |  |
+ * vm's data->|			     |   |  |
+ *	      +----------------------+ ------- 0
+ * To support large memory, needs to increase the size of p2m.
+ * To support more vcpus, needs to ensure it has enough space to
+ * hold vcpus' data.
+ */
 
+#define KVM_VM_DATA_SHIFT	26
+#define KVM_VM_DATA_SIZE	(__IA64_UL_CONST(1) << KVM_VM_DATA_SHIFT)
+#define KVM_VM_DATA_BASE	(KVM_VMM_BASE + KVM_VM_DATA_SIZE)
 
-#define KVM_P2M_BASE    KVM_VM_DATA_BASE
-#define KVM_P2M_OFS     0
-#define KVM_P2M_SIZE    (8UL << 20)
+#define KVM_P2M_BASE		KVM_VM_DATA_BASE
+#define KVM_P2M_SIZE		(__IA64_UL_CONST(24) << 20)
 
-#define KVM_VHPT_BASE   (KVM_P2M_BASE + KVM_P2M_SIZE)
-#define KVM_VHPT_OFS    KVM_P2M_SIZE
-#define KVM_VHPT_BLOCK_SIZE   (2UL << 20)
-#define VHPT_SHIFT      18
-#define VHPT_SIZE       (1UL << VHPT_SHIFT)
-#define VHPT_NUM_ENTRIES (1<<(VHPT_SHIFT-5))
+#define VHPT_SHIFT		16
+#define VHPT_SIZE		(__IA64_UL_CONST(1) << VHPT_SHIFT)
+#define VHPT_NUM_ENTRIES	(__IA64_UL_CONST(1) << (VHPT_SHIFT-5))
 
-#define KVM_VTLB_BASE   (KVM_VHPT_BASE+KVM_VHPT_BLOCK_SIZE)
-#define KVM_VTLB_OFS    (KVM_VHPT_OFS+KVM_VHPT_BLOCK_SIZE)
-#define KVM_VTLB_BLOCK_SIZE   (1UL<<20)
-#define VTLB_SHIFT      17
-#define VTLB_SIZE       (1UL<<VTLB_SHIFT)
-#define VTLB_NUM_ENTRIES (1<<(VTLB_SHIFT-5))
+#define VTLB_SHIFT		16
+#define VTLB_SIZE		(__IA64_UL_CONST(1) << VTLB_SHIFT)
+#define VTLB_NUM_ENTRIES	(1UL << (VHPT_SHIFT-5))
 
-#define KVM_VPD_BASE   (KVM_VTLB_BASE+KVM_VTLB_BLOCK_SIZE)
-#define KVM_VPD_OFS    (KVM_VTLB_OFS+KVM_VTLB_BLOCK_SIZE)
-#define KVM_VPD_BLOCK_SIZE   (2UL<<20)
-#define VPD_SHIFT       16
-#define VPD_SIZE        (1UL<<VPD_SHIFT)
+#define VPD_SHIFT		16
+#define VPD_SIZE		(__IA64_UL_CONST(1) << VPD_SHIFT)
 
-#define KVM_VCPU_BASE   (KVM_VPD_BASE+KVM_VPD_BLOCK_SIZE)
-#define KVM_VCPU_OFS    (KVM_VPD_OFS+KVM_VPD_BLOCK_SIZE)
-#define KVM_VCPU_BLOCK_SIZE   (2UL<<20)
-#define VCPU_SHIFT 18
-#define VCPU_SIZE (1UL<<VCPU_SHIFT)
-#define MAX_VCPU_NUM KVM_VCPU_BLOCK_SIZE/VCPU_SIZE
+#define VCPU_STRUCT_SHIFT	16
+#define VCPU_STRUCT_SIZE	(__IA64_UL_CONST(1) << VCPU_STRUCT_SHIFT)
 
-#define KVM_VM_BASE     (KVM_VCPU_BASE+KVM_VCPU_BLOCK_SIZE)
-#define KVM_VM_OFS      (KVM_VCPU_OFS+KVM_VCPU_BLOCK_SIZE)
-#define KVM_VM_BLOCK_SIZE     (1UL<<19)
+#define KVM_STK_OFFSET		VCPU_STRUCT_SIZE
 
-#define KVM_MEM_DIRTY_LOG_BASE (KVM_VM_BASE+KVM_VM_BLOCK_SIZE)
-#define KVM_MEM_DIRTY_LOG_OFS  (KVM_VM_OFS+KVM_VM_BLOCK_SIZE)
-#define KVM_MEM_DIRTY_LOG_SIZE (1UL<<19)
+#define KVM_VM_STRUCT_SHIFT	19
+#define KVM_VM_STRUCT_SIZE	(__IA64_UL_CONST(1) << KVM_VM_STRUCT_SHIFT)
 
-/* Get vpd, vhpt, tlb, vcpu, base*/
-#define VPD_ADDR(n) (KVM_VPD_BASE+n*VPD_SIZE)
-#define VHPT_ADDR(n) (KVM_VHPT_BASE+n*VHPT_SIZE)
-#define VTLB_ADDR(n) (KVM_VTLB_BASE+n*VTLB_SIZE)
-#define VCPU_ADDR(n) (KVM_VCPU_BASE+n*VCPU_SIZE)
+#define KVM_MEM_DIRY_LOG_SHIFT	19
+#define KVM_MEM_DIRTY_LOG_SIZE (__IA64_UL_CONST(1) << KVM_MEM_DIRY_LOG_SHIFT)
+
+#ifndef __ASSEMBLY__
+
+/*Define the max vcpus and memory for Guests.*/
+#define KVM_MAX_VCPUS	(KVM_VM_DATA_SIZE - KVM_P2M_SIZE - KVM_VM_STRUCT_SIZE -\
+			KVM_MEM_DIRTY_LOG_SIZE) / sizeof(struct kvm_vcpu_data)
+#define KVM_MAX_MEM_SIZE (KVM_P2M_SIZE >> 3 << PAGE_SHIFT)
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/kvm.h>
+#include <linux/kvm_para.h>
+#include <linux/kvm_types.h>
+
+#include <asm/pal.h>
+#include <asm/sal.h>
+#include <asm/page.h>
+
+struct kvm_vcpu_data {
+	char vcpu_vhpt[VHPT_SIZE];
+	char vcpu_vtlb[VTLB_SIZE];
+	char vcpu_vpd[VPD_SIZE];
+	char vcpu_struct[VCPU_STRUCT_SIZE];
+};
+
+struct kvm_vm_data {
+	char kvm_p2m[KVM_P2M_SIZE];
+	char kvm_vm_struct[KVM_VM_STRUCT_SIZE];
+	char kvm_mem_dirty_log[KVM_MEM_DIRTY_LOG_SIZE];
+	struct kvm_vcpu_data vcpu_data[KVM_MAX_VCPUS];
+};
+
+#define VCPU_BASE(n)	KVM_VM_DATA_BASE + \
+				offsetof(struct kvm_vm_data, vcpu_data[n])
+#define VM_BASE		KVM_VM_DATA_BASE + \
+				offsetof(struct kvm_vm_data, kvm_vm_struct)
+#define KVM_MEM_DIRTY_LOG_BASE	KVM_VM_DATA_BASE + \
+				offsetof(struct kvm_vm_data, kvm_mem_dirty_log)
+
+#define VHPT_BASE(n) (VCPU_BASE(n) + offsetof(struct kvm_vcpu_data, vcpu_vhpt))
+#define VTLB_BASE(n) (VCPU_BASE(n) + offsetof(struct kvm_vcpu_data, vcpu_vtlb))
+#define VPD_BASE(n)  (VCPU_BASE(n) + offsetof(struct kvm_vcpu_data, vcpu_vpd))
+#define VCPU_STRUCT_BASE(n)	(VCPU_BASE(n) + \
+				offsetof(struct kvm_vcpu_data, vcpu_struct))
 
 /*IO section definitions*/
 #define IOREQ_READ      1
@@ -403,14 +451,13 @@ struct kvm_sal_data {
 };
 
 struct kvm_arch {
+	spinlock_t dirty_log_lock;
+
 	unsigned long	vm_base;
 	unsigned long	metaphysical_rr0;
 	unsigned long	metaphysical_rr4;
 	unsigned long	vmm_init_rr;
-	unsigned long	vhpt_base;
-	unsigned long	vtlb_base;
-	unsigned long 	vpd_base;
-	spinlock_t dirty_log_lock;
+
 	struct kvm_ioapic *vioapic;
 	struct kvm_vm_stat stat;
 	struct kvm_sal_data rdv_sal_data;
@@ -512,7 +559,7 @@ struct kvm_pt_regs {
 
 static inline struct kvm_pt_regs *vcpu_regs(struct kvm_vcpu *v)
 {
-	return (struct kvm_pt_regs *) ((unsigned long) v + IA64_STK_OFFSET) - 1;
+	return (struct kvm_pt_regs *) ((unsigned long) v + KVM_STK_OFFSET) - 1;
 }
 
 typedef int kvm_vmm_entry(void);
@@ -531,5 +578,6 @@ int kvm_pal_emul(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
 void kvm_sal_emul(struct kvm_vcpu *vcpu);
 
 static inline void kvm_inject_nmi(struct kvm_vcpu *vcpu) {}
+#endif /* __ASSEMBLY__*/
 
 #endif
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index af1464f7a6ad..43e45f6afcda 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -698,27 +698,24 @@ out:
 	return r;
 }
 
-/*
- * Allocate 16M memory for every vm to hold its specific data.
- * Its memory map is defined in kvm_host.h.
- */
 static struct kvm *kvm_alloc_kvm(void)
 {
 
 	struct kvm *kvm;
 	uint64_t  vm_base;
 
+	BUG_ON(sizeof(struct kvm) > KVM_VM_STRUCT_SIZE);
+
 	vm_base = __get_free_pages(GFP_KERNEL, get_order(KVM_VM_DATA_SIZE));
 
 	if (!vm_base)
 		return ERR_PTR(-ENOMEM);
-	printk(KERN_DEBUG"kvm: VM data's base Address:0x%lx\n", vm_base);
 
-	/* Zero all pages before use! */
 	memset((void *)vm_base, 0, KVM_VM_DATA_SIZE);
-
-	kvm = (struct kvm *)(vm_base + KVM_VM_OFS);
+	kvm = (struct kvm *)(vm_base +
+			offsetof(struct kvm_vm_data, kvm_vm_struct));
 	kvm->arch.vm_base = vm_base;
+	printk(KERN_DEBUG"kvm: vm's data area:0x%lx\n", vm_base);
 
 	return kvm;
 }
@@ -760,21 +757,12 @@ static void kvm_build_io_pmt(struct kvm *kvm)
 
 static void kvm_init_vm(struct kvm *kvm)
 {
-	long vm_base;
-
 	BUG_ON(!kvm);
 
 	kvm->arch.metaphysical_rr0 = GUEST_PHYSICAL_RR0;
 	kvm->arch.metaphysical_rr4 = GUEST_PHYSICAL_RR4;
 	kvm->arch.vmm_init_rr = VMM_INIT_RR;
 
-	vm_base = kvm->arch.vm_base;
-	if (vm_base) {
-		kvm->arch.vhpt_base = vm_base + KVM_VHPT_OFS;
-		kvm->arch.vtlb_base = vm_base + KVM_VTLB_OFS;
-		kvm->arch.vpd_base  = vm_base + KVM_VPD_OFS;
-	}
-
 	/*
 	 *Fill P2M entries for MMIO/IO ranges
 	 */
@@ -864,7 +852,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 		goto out;
 	r = copy_from_user(vcpu + 1, regs->saved_stack +
 			sizeof(struct kvm_vcpu),
-			IA64_STK_OFFSET - sizeof(struct kvm_vcpu));
+			KVM_STK_OFFSET - sizeof(struct kvm_vcpu));
 	if (r)
 		goto out;
 	vcpu->arch.exit_data =
@@ -1166,10 +1154,11 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 		/*Set entry address for first run.*/
 		regs->cr_iip = PALE_RESET_ENTRY;
 
-		/*Initilize itc offset for vcpus*/
+		/*Initialize itc offset for vcpus*/
 		itc_offset = 0UL - ia64_getreg(_IA64_REG_AR_ITC);
-		for (i = 0; i < MAX_VCPU_NUM; i++) {
-			v = (struct kvm_vcpu *)((char *)vcpu + VCPU_SIZE * i);
+		for (i = 0; i < KVM_MAX_VCPUS; i++) {
+			v = (struct kvm_vcpu *)((char *)vcpu +
+					sizeof(struct kvm_vcpu_data) * i);
 			v->arch.itc_offset = itc_offset;
 			v->arch.last_itc = 0;
 		}
@@ -1183,7 +1172,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	vcpu->arch.apic->vcpu = vcpu;
 
 	p_ctx->gr[1] = 0;
-	p_ctx->gr[12] = (unsigned long)((char *)vmm_vcpu + IA64_STK_OFFSET);
+	p_ctx->gr[12] = (unsigned long)((char *)vmm_vcpu + KVM_STK_OFFSET);
 	p_ctx->gr[13] = (unsigned long)vmm_vcpu;
 	p_ctx->psr = 0x1008522000UL;
 	p_ctx->ar[40] = FPSR_DEFAULT; /*fpsr*/
@@ -1218,12 +1207,12 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	vcpu->arch.hlt_timer.function = hlt_timer_fn;
 
 	vcpu->arch.last_run_cpu = -1;
-	vcpu->arch.vpd = (struct vpd *)VPD_ADDR(vcpu->vcpu_id);
+	vcpu->arch.vpd = (struct vpd *)VPD_BASE(vcpu->vcpu_id);
 	vcpu->arch.vsa_base = kvm_vsa_base;
 	vcpu->arch.__gp = kvm_vmm_gp;
 	vcpu->arch.dirty_log_lock_pa = __pa(&kvm->arch.dirty_log_lock);
-	vcpu->arch.vhpt.hash = (struct thash_data *)VHPT_ADDR(vcpu->vcpu_id);
-	vcpu->arch.vtlb.hash = (struct thash_data *)VTLB_ADDR(vcpu->vcpu_id);
+	vcpu->arch.vhpt.hash = (struct thash_data *)VHPT_BASE(vcpu->vcpu_id);
+	vcpu->arch.vtlb.hash = (struct thash_data *)VTLB_BASE(vcpu->vcpu_id);
 	init_ptce_info(vcpu);
 
 	r = 0;
@@ -1273,12 +1262,22 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
 	int r;
 	int cpu;
 
+	BUG_ON(sizeof(struct kvm_vcpu) > VCPU_STRUCT_SIZE/2);
+
+	r = -EINVAL;
+	if (id >= KVM_MAX_VCPUS) {
+		printk(KERN_ERR"kvm: Can't configure vcpus > %ld",
+				KVM_MAX_VCPUS);
+		goto fail;
+	}
+
 	r = -ENOMEM;
 	if (!vm_base) {
 		printk(KERN_ERR"kvm: Create vcpu[%d] error!\n", id);
 		goto fail;
 	}
-	vcpu = (struct kvm_vcpu *)(vm_base + KVM_VCPU_OFS + VCPU_SIZE * id);
+	vcpu = (struct kvm_vcpu *)(vm_base + offsetof(struct kvm_vm_data,
+					vcpu_data[id].vcpu_struct));
 	vcpu->kvm = kvm;
 
 	cpu = get_cpu();
@@ -1396,7 +1395,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 					sizeof(union context));
 	if (r)
 		goto out;
-	r = copy_to_user(regs->saved_stack, (void *)vcpu, IA64_STK_OFFSET);
+	r = copy_to_user(regs->saved_stack, (void *)vcpu, KVM_STK_OFFSET);
 	if (r)
 		goto out;
 	SAVE_REGS(mp_state);
@@ -1457,6 +1456,9 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 	struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
 	unsigned long base_gfn = memslot->base_gfn;
 
+	if (base_gfn + npages > (KVM_MAX_MEM_SIZE >> PAGE_SHIFT))
+		return -ENOMEM;
+
 	for (i = 0; i < npages; i++) {
 		pfn = gfn_to_pfn(kvm, base_gfn + i);
 		if (!kvm_is_mmio_pfn(pfn)) {
@@ -1631,8 +1633,8 @@ static int kvm_ia64_sync_dirty_log(struct kvm *kvm,
 	struct kvm_memory_slot *memslot;
 	int r, i;
 	long n, base;
-	unsigned long *dirty_bitmap = (unsigned long *)((void *)kvm - KVM_VM_OFS
-					+ KVM_MEM_DIRTY_LOG_OFS);
+	unsigned long *dirty_bitmap = (unsigned long *)(kvm->arch.vm_base +
+			offsetof(struct kvm_vm_data, kvm_mem_dirty_log));
 
 	r = -EINVAL;
 	if (log->slot >= KVM_MEMORY_SLOTS)
diff --git a/arch/ia64/kvm/kvm_minstate.h b/arch/ia64/kvm/kvm_minstate.h
index 2cc41d17cf99..b2bcaa2787aa 100644
--- a/arch/ia64/kvm/kvm_minstate.h
+++ b/arch/ia64/kvm/kvm_minstate.h
@@ -24,6 +24,8 @@
 #include <asm/asmmacro.h>
 #include <asm/types.h>
 #include <asm/kregs.h>
+#include <asm/kvm_host.h>
+
 #include "asm-offsets.h"
 
 #define KVM_MINSTATE_START_SAVE_MIN	     					\
@@ -33,7 +35,7 @@
 	addl r22 = VMM_RBS_OFFSET,r1;            /* compute base of RBS */	\
 	;;									\
 	lfetch.fault.excl.nt1 [r22];						\
-	addl r1 = IA64_STK_OFFSET-VMM_PT_REGS_SIZE,r1;  /* compute base of memory stack */  \
+	addl r1 = KVM_STK_OFFSET-VMM_PT_REGS_SIZE, r1;  \
 	mov r23 = ar.bspstore;			/* save ar.bspstore */          \
 	;;									\
 	mov ar.bspstore = r22;				/* switch to kernel RBS */\
diff --git a/arch/ia64/kvm/misc.h b/arch/ia64/kvm/misc.h
index e585c4607344..dd979e00b574 100644
--- a/arch/ia64/kvm/misc.h
+++ b/arch/ia64/kvm/misc.h
@@ -27,7 +27,8 @@
  */
 static inline uint64_t *kvm_host_get_pmt(struct kvm *kvm)
 {
-	return (uint64_t *)(kvm->arch.vm_base + KVM_P2M_OFS);
+	return (uint64_t *)(kvm->arch.vm_base +
+				offsetof(struct kvm_vm_data, kvm_p2m));
 }
 
 static inline void kvm_set_pmt_entry(struct kvm *kvm, gfn_t gfn,
diff --git a/arch/ia64/kvm/vcpu.c b/arch/ia64/kvm/vcpu.c
index e44027ce5667..a528d70a820c 100644
--- a/arch/ia64/kvm/vcpu.c
+++ b/arch/ia64/kvm/vcpu.c
@@ -816,8 +816,9 @@ static void vcpu_set_itc(struct kvm_vcpu *vcpu, u64 val)
 	unsigned long vitv = VCPU(vcpu, itv);
 
 	if (vcpu->vcpu_id == 0) {
-		for (i = 0; i < MAX_VCPU_NUM; i++) {
-			v = (struct kvm_vcpu *)((char *)vcpu + VCPU_SIZE * i);
+		for (i = 0; i < KVM_MAX_VCPUS; i++) {
+			v = (struct kvm_vcpu *)((char *)vcpu +
+					sizeof(struct kvm_vcpu_data) * i);
 			VMX(v, itc_offset) = itc_offset;
 			VMX(v, last_itc) = 0;
 		}
diff --git a/arch/ia64/kvm/vtlb.c b/arch/ia64/kvm/vtlb.c
index e22b93361e08..6b6307a3bd55 100644
--- a/arch/ia64/kvm/vtlb.c
+++ b/arch/ia64/kvm/vtlb.c
@@ -183,8 +183,8 @@ void mark_pages_dirty(struct kvm_vcpu *v, u64 pte, u64 ps)
 	u64 i, dirty_pages = 1;
 	u64 base_gfn = (pte&_PAGE_PPN_MASK) >> PAGE_SHIFT;
 	spinlock_t *lock = __kvm_va(v->arch.dirty_log_lock_pa);
-	void *dirty_bitmap = (void *)v - (KVM_VCPU_OFS + v->vcpu_id * VCPU_SIZE)
-						+ KVM_MEM_DIRTY_LOG_OFS;
+	void *dirty_bitmap = (void *)KVM_MEM_DIRTY_LOG_BASE;
+
 	dirty_pages <<= ps <= PAGE_SHIFT ? 0 : ps - PAGE_SHIFT;
 
 	vmm_spin_lock(lock);

From 853dafb62b386a3a75808483a120998e734eb6e1 Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Thu, 23 Oct 2008 15:03:38 +0800
Subject: [PATCH 035/140] KVM: ia64: Remove lock held by halted vcpu

Remove the lock protection for kvm halt logic, otherwise,
once other vcpus want to acquire the lock, and they have to
wait all vcpus are waken up from halt.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/kvm-ia64.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 43e45f6afcda..70eb829767f4 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -439,7 +439,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 		expires = div64_u64(itc_diff, cyc_per_usec);
 		kt = ktime_set(0, 1000 * expires);
 
-		down_read(&vcpu->kvm->slots_lock);
 		vcpu->arch.ht_active = 1;
 		hrtimer_start(p_ht, kt, HRTIMER_MODE_ABS);
 
@@ -452,7 +451,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 			if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
 				vcpu->arch.mp_state =
 					KVM_MP_STATE_RUNNABLE;
-		up_read(&vcpu->kvm->slots_lock);
 
 		if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
 			return -EINTR;

From 6eb55818c043b097c83828da8430fcb9a02fdb89 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Fri, 31 Oct 2008 12:37:41 +0800
Subject: [PATCH 036/140] KVM: Enable Function Level Reset for assigned device

Ideally, every assigned device should in a clear condition before and after
assignment, so that the former state of device won't affect later work.
Some devices provide a mechanism named Function Level Reset, which is
defined in PCI/PCI-e document. We should execute it before and after device
assignment.

(But sadly, the feature is new, and most device on the market now don't
support it. We are considering using D0/D3hot transmit to emulate it later,
but not that elegant and reliable as FLR itself.)

[Update: Reminded by Xiantao, execute FLR after we ensure that the device can
be assigned to the guest.]

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c  | 2 +-
 virt/kvm/kvm_main.c | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 38f79b6aaf1e..9a4a39cfe6ef 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4148,8 +4148,8 @@ static void kvm_free_vcpus(struct kvm *kvm)
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
-	kvm_iommu_unmap_guest(kvm);
 	kvm_free_all_assigned_devices(kvm);
+	kvm_iommu_unmap_guest(kvm);
 	kvm_free_pit(kvm);
 	kfree(kvm->arch.vpic);
 	kfree(kvm->arch.vioapic);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4f43abe198e4..1838052f3c9e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -152,6 +152,8 @@ static void kvm_free_assigned_device(struct kvm *kvm,
 		 */
 		kvm_put_kvm(kvm);
 
+	pci_reset_function(assigned_dev->dev);
+
 	pci_release_regions(assigned_dev->dev);
 	pci_disable_device(assigned_dev->dev);
 	pci_dev_put(assigned_dev->dev);
@@ -283,6 +285,9 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
 		       __func__);
 		goto out_disable;
 	}
+
+	pci_reset_function(dev);
+
 	match->assigned_dev_id = assigned_dev->assigned_dev_id;
 	match->host_busnr = assigned_dev->busnr;
 	match->host_devfn = assigned_dev->devfn;

From 2843099fee32a6020e1caa95c6026f28b5d43bff Mon Sep 17 00:00:00 2001
From: Izik Eidus <ieidus@redhat.com>
Date: Fri, 3 Oct 2008 17:40:32 +0300
Subject: [PATCH 037/140] KVM: MMU: Fix aliased gfns treated as unaliased

Some areas of kvm x86 mmu are using gfn offset inside a slot without
unaliasing the gfn first.  This patch makes sure that the gfn will be
unaliased and add gfn_to_memslot_unaliased() to save the calculating
of the gfn unaliasing in case we have it unaliased already.

Signed-off-by: Izik Eidus <ieidus@redhat.com>
Acked-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/mmu.c              | 14 ++++++++++----
 virt/kvm/kvm_main.c             |  9 +++++----
 3 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 09e6c56572cb..99e3cc149d21 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -617,6 +617,8 @@ void kvm_disable_tdp(void);
 int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
 int complete_pio(struct kvm_vcpu *vcpu);
 
+struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn);
+
 static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
 {
 	struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8687758b5295..8904e8ada978 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -386,7 +386,9 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)
 {
 	int *write_count;
 
-	write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
+	gfn = unalias_gfn(kvm, gfn);
+	write_count = slot_largepage_idx(gfn,
+					 gfn_to_memslot_unaliased(kvm, gfn));
 	*write_count += 1;
 }
 
@@ -394,16 +396,20 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
 {
 	int *write_count;
 
-	write_count = slot_largepage_idx(gfn, gfn_to_memslot(kvm, gfn));
+	gfn = unalias_gfn(kvm, gfn);
+	write_count = slot_largepage_idx(gfn,
+					 gfn_to_memslot_unaliased(kvm, gfn));
 	*write_count -= 1;
 	WARN_ON(*write_count < 0);
 }
 
 static int has_wrprotected_page(struct kvm *kvm, gfn_t gfn)
 {
-	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
+	struct kvm_memory_slot *slot;
 	int *largepage_idx;
 
+	gfn = unalias_gfn(kvm, gfn);
+	slot = gfn_to_memslot_unaliased(kvm, gfn);
 	if (slot) {
 		largepage_idx = slot_largepage_idx(gfn, slot);
 		return *largepage_idx;
@@ -2973,8 +2979,8 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
 		if (sp->role.metaphysical)
 			continue;
 
-		slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
 		gfn = unalias_gfn(vcpu->kvm, sp->gfn);
+		slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn);
 		rmapp = &slot->rmap[gfn - slot->base_gfn];
 		if (*rmapp)
 			printk(KERN_ERR "%s: (%s) shadow page has writable"
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1838052f3c9e..a65baa9039d5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -923,7 +923,7 @@ int kvm_is_error_hva(unsigned long addr)
 }
 EXPORT_SYMBOL_GPL(kvm_is_error_hva);
 
-static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
+struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn)
 {
 	int i;
 
@@ -936,11 +936,12 @@ static struct kvm_memory_slot *__gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
 	}
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(gfn_to_memslot_unaliased);
 
 struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
 {
 	gfn = unalias_gfn(kvm, gfn);
-	return __gfn_to_memslot(kvm, gfn);
+	return gfn_to_memslot_unaliased(kvm, gfn);
 }
 
 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
@@ -964,7 +965,7 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 	struct kvm_memory_slot *slot;
 
 	gfn = unalias_gfn(kvm, gfn);
-	slot = __gfn_to_memslot(kvm, gfn);
+	slot = gfn_to_memslot_unaliased(kvm, gfn);
 	if (!slot)
 		return bad_hva();
 	return (slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE);
@@ -1215,7 +1216,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
 	struct kvm_memory_slot *memslot;
 
 	gfn = unalias_gfn(kvm, gfn);
-	memslot = __gfn_to_memslot(kvm, gfn);
+	memslot = gfn_to_memslot_unaliased(kvm, gfn);
 	if (memslot && memslot->dirty_bitmap) {
 		unsigned long rel_gfn = gfn - memslot->base_gfn;
 

From a0d7b9f246074fab1f42678d203ef4ba281505f2 Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Wed, 5 Nov 2008 09:36:11 -0600
Subject: [PATCH 038/140] KVM: ppc: Move 440-specific TLB code into 44x_tlb.c

This will make it easier to provide implementations for other cores.

Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/powerpc/include/asm/kvm_ppc.h |   4 +-
 arch/powerpc/kvm/44x_tlb.c         | 138 ++++++++++++++++++++++++++++-
 arch/powerpc/kvm/booke_guest.c     |  26 ------
 arch/powerpc/kvm/emulate.c         | 120 ++-----------------------
 4 files changed, 145 insertions(+), 143 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index bb62ad876de3..4adb4a397508 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -58,11 +58,11 @@ extern int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
 extern int kvmppc_emulate_instruction(struct kvm_run *run,
                                       struct kvm_vcpu *vcpu);
 extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu);
+extern int kvmppc_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws);
+extern int kvmppc_emul_tlbsx(struct kvm_vcpu *vcpu, u8 rt, u8 ra, u8 rb, u8 rc);
 
 extern void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn,
                            u64 asid, u32 flags);
-extern void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
-                                  gva_t eend, u32 asid);
 extern void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode);
 extern void kvmppc_mmu_switch_pid(struct kvm_vcpu *vcpu, u32 pid);
 
diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index ad72c6f9811f..dd75ab84e04e 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -32,6 +32,34 @@
 
 static unsigned int kvmppc_tlb_44x_pos;
 
+#ifdef DEBUG
+void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_44x_tlbe *tlbe;
+	int i;
+
+	printk("vcpu %d TLB dump:\n", vcpu->vcpu_id);
+	printk("| %2s | %3s | %8s | %8s | %8s |\n",
+			"nr", "tid", "word0", "word1", "word2");
+
+	for (i = 0; i < PPC44x_TLB_SIZE; i++) {
+		tlbe = &vcpu->arch.guest_tlb[i];
+		if (tlbe->word0 & PPC44x_TLB_VALID)
+			printk(" G%2d |  %02X | %08X | %08X | %08X |\n",
+			       i, tlbe->tid, tlbe->word0, tlbe->word1,
+			       tlbe->word2);
+	}
+
+	for (i = 0; i < PPC44x_TLB_SIZE; i++) {
+		tlbe = &vcpu->arch.shadow_tlb[i];
+		if (tlbe->word0 & PPC44x_TLB_VALID)
+			printk(" S%2d | %02X | %08X | %08X | %08X |\n",
+			       i, tlbe->tid, tlbe->word0, tlbe->word1,
+			       tlbe->word2);
+	}
+}
+#endif
+
 static u32 kvmppc_44x_tlb_shadow_attrib(u32 attrib, int usermode)
 {
 	/* Mask off reserved bits. */
@@ -191,8 +219,8 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
 			handler);
 }
 
-void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
-                           gva_t eend, u32 asid)
+static void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
+                                  gva_t eend, u32 asid)
 {
 	unsigned int pid = !(asid & 0xff);
 	int i;
@@ -249,3 +277,109 @@ void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode)
 
 	vcpu->arch.shadow_pid = !usermode;
 }
+
+static int tlbe_is_host_safe(const struct kvm_vcpu *vcpu,
+                             const struct tlbe *tlbe)
+{
+	gpa_t gpa;
+
+	if (!get_tlb_v(tlbe))
+		return 0;
+
+	/* Does it match current guest AS? */
+	/* XXX what about IS != DS? */
+	if (get_tlb_ts(tlbe) != !!(vcpu->arch.msr & MSR_IS))
+		return 0;
+
+	gpa = get_tlb_raddr(tlbe);
+	if (!gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT))
+		/* Mapping is not for RAM. */
+		return 0;
+
+	return 1;
+}
+
+int kvmppc_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws)
+{
+	u64 eaddr;
+	u64 raddr;
+	u64 asid;
+	u32 flags;
+	struct tlbe *tlbe;
+	unsigned int index;
+
+	index = vcpu->arch.gpr[ra];
+	if (index > PPC44x_TLB_SIZE) {
+		printk("%s: index %d\n", __func__, index);
+		kvmppc_dump_vcpu(vcpu);
+		return EMULATE_FAIL;
+	}
+
+	tlbe = &vcpu->arch.guest_tlb[index];
+
+	/* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */
+	if (tlbe->word0 & PPC44x_TLB_VALID) {
+		eaddr = get_tlb_eaddr(tlbe);
+		asid = (tlbe->word0 & PPC44x_TLB_TS) | tlbe->tid;
+		kvmppc_mmu_invalidate(vcpu, eaddr, get_tlb_end(tlbe), asid);
+	}
+
+	switch (ws) {
+	case PPC44x_TLB_PAGEID:
+		tlbe->tid = vcpu->arch.mmucr & 0xff;
+		tlbe->word0 = vcpu->arch.gpr[rs];
+		break;
+
+	case PPC44x_TLB_XLAT:
+		tlbe->word1 = vcpu->arch.gpr[rs];
+		break;
+
+	case PPC44x_TLB_ATTRIB:
+		tlbe->word2 = vcpu->arch.gpr[rs];
+		break;
+
+	default:
+		return EMULATE_FAIL;
+	}
+
+	if (tlbe_is_host_safe(vcpu, tlbe)) {
+		eaddr = get_tlb_eaddr(tlbe);
+		raddr = get_tlb_raddr(tlbe);
+		asid = (tlbe->word0 & PPC44x_TLB_TS) | tlbe->tid;
+		flags = tlbe->word2 & 0xffff;
+
+		/* Create a 4KB mapping on the host. If the guest wanted a
+		 * large page, only the first 4KB is mapped here and the rest
+		 * are mapped on the fly. */
+		kvmppc_mmu_map(vcpu, eaddr, raddr >> PAGE_SHIFT, asid, flags);
+	}
+
+	KVMTRACE_5D(GTLB_WRITE, vcpu, index,
+	            tlbe->tid, tlbe->word0, tlbe->word1, tlbe->word2,
+	            handler);
+
+	return EMULATE_DONE;
+}
+
+int kvmppc_emul_tlbsx(struct kvm_vcpu *vcpu, u8 rt, u8 ra, u8 rb, u8 rc)
+{
+	u32 ea;
+	int index;
+	unsigned int as = get_mmucr_sts(vcpu);
+	unsigned int pid = get_mmucr_stid(vcpu);
+
+	ea = vcpu->arch.gpr[rb];
+	if (ra)
+		ea += vcpu->arch.gpr[ra];
+
+	index = kvmppc_44x_tlb_index(vcpu, ea, pid, as);
+	if (rc) {
+		if (index < 0)
+			vcpu->arch.cr &= ~0x20000000;
+		else
+			vcpu->arch.cr |= 0x20000000;
+	}
+	vcpu->arch.gpr[rt] = index;
+
+	return EMULATE_DONE;
+}
diff --git a/arch/powerpc/kvm/booke_guest.c b/arch/powerpc/kvm/booke_guest.c
index 7b2591e26bae..c0f8532630ec 100644
--- a/arch/powerpc/kvm/booke_guest.c
+++ b/arch/powerpc/kvm/booke_guest.c
@@ -111,32 +111,6 @@ const unsigned char priority_exception[] = {
 };
 
 
-void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu)
-{
-	struct tlbe *tlbe;
-	int i;
-
-	printk("vcpu %d TLB dump:\n", vcpu->vcpu_id);
-	printk("| %2s | %3s | %8s | %8s | %8s |\n",
-			"nr", "tid", "word0", "word1", "word2");
-
-	for (i = 0; i < PPC44x_TLB_SIZE; i++) {
-		tlbe = &vcpu->arch.guest_tlb[i];
-		if (tlbe->word0 & PPC44x_TLB_VALID)
-			printk(" G%2d |  %02X | %08X | %08X | %08X |\n",
-			       i, tlbe->tid, tlbe->word0, tlbe->word1,
-			       tlbe->word2);
-	}
-
-	for (i = 0; i < PPC44x_TLB_SIZE; i++) {
-		tlbe = &vcpu->arch.shadow_tlb[i];
-		if (tlbe->word0 & PPC44x_TLB_VALID)
-			printk(" S%2d | %02X | %08X | %08X | %08X |\n",
-			       i, tlbe->tid, tlbe->word0, tlbe->word1,
-			       tlbe->word2);
-	}
-}
-
 /* TODO: use vcpu_printf() */
 void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index 0fce4fbdc20d..0ce8ed539bae 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -29,8 +29,6 @@
 #include <asm/byteorder.h>
 #include <asm/kvm_ppc.h>
 
-#include "44x_tlb.h"
-
 /* Instruction decoding */
 static inline unsigned int get_op(u32 inst)
 {
@@ -87,96 +85,6 @@ static inline unsigned int get_d(u32 inst)
 	return inst & 0xffff;
 }
 
-static int tlbe_is_host_safe(const struct kvm_vcpu *vcpu,
-                             const struct tlbe *tlbe)
-{
-	gpa_t gpa;
-
-	if (!get_tlb_v(tlbe))
-		return 0;
-
-	/* Does it match current guest AS? */
-	/* XXX what about IS != DS? */
-	if (get_tlb_ts(tlbe) != !!(vcpu->arch.msr & MSR_IS))
-		return 0;
-
-	gpa = get_tlb_raddr(tlbe);
-	if (!gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT))
-		/* Mapping is not for RAM. */
-		return 0;
-
-	return 1;
-}
-
-static int kvmppc_emul_tlbwe(struct kvm_vcpu *vcpu, u32 inst)
-{
-	u64 eaddr;
-	u64 raddr;
-	u64 asid;
-	u32 flags;
-	struct tlbe *tlbe;
-	unsigned int ra;
-	unsigned int rs;
-	unsigned int ws;
-	unsigned int index;
-
-	ra = get_ra(inst);
-	rs = get_rs(inst);
-	ws = get_ws(inst);
-
-	index = vcpu->arch.gpr[ra];
-	if (index > PPC44x_TLB_SIZE) {
-		printk("%s: index %d\n", __func__, index);
-		kvmppc_dump_vcpu(vcpu);
-		return EMULATE_FAIL;
-	}
-
-	tlbe = &vcpu->arch.guest_tlb[index];
-
-	/* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */
-	if (tlbe->word0 & PPC44x_TLB_VALID) {
-		eaddr = get_tlb_eaddr(tlbe);
-		asid = (tlbe->word0 & PPC44x_TLB_TS) | tlbe->tid;
-		kvmppc_mmu_invalidate(vcpu, eaddr, get_tlb_end(tlbe), asid);
-	}
-
-	switch (ws) {
-	case PPC44x_TLB_PAGEID:
-		tlbe->tid = vcpu->arch.mmucr & 0xff;
-		tlbe->word0 = vcpu->arch.gpr[rs];
-		break;
-
-	case PPC44x_TLB_XLAT:
-		tlbe->word1 = vcpu->arch.gpr[rs];
-		break;
-
-	case PPC44x_TLB_ATTRIB:
-		tlbe->word2 = vcpu->arch.gpr[rs];
-		break;
-
-	default:
-		return EMULATE_FAIL;
-	}
-
-	if (tlbe_is_host_safe(vcpu, tlbe)) {
-		eaddr = get_tlb_eaddr(tlbe);
-		raddr = get_tlb_raddr(tlbe);
-		asid = (tlbe->word0 & PPC44x_TLB_TS) | tlbe->tid;
-		flags = tlbe->word2 & 0xffff;
-
-		/* Create a 4KB mapping on the host. If the guest wanted a
-		 * large page, only the first 4KB is mapped here and the rest
-		 * are mapped on the fly. */
-		kvmppc_mmu_map(vcpu, eaddr, raddr >> PAGE_SHIFT, asid, flags);
-	}
-
-	KVMTRACE_5D(GTLB_WRITE, vcpu, index,
-			tlbe->tid, tlbe->word0, tlbe->word1, tlbe->word2,
-			handler);
-
-	return EMULATE_DONE;
-}
-
 static void kvmppc_emulate_dec(struct kvm_vcpu *vcpu)
 {
 	if (vcpu->arch.tcr & TCR_DIE) {
@@ -222,6 +130,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	int rc;
 	int rs;
 	int rt;
+	int ws;
 	int sprn;
 	int dcrn;
 	enum emulation_result emulated = EMULATE_DONE;
@@ -630,33 +539,18 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			break;
 
 		case 978:                                       /* tlbwe */
-			emulated = kvmppc_emul_tlbwe(vcpu, inst);
+			ra = get_ra(inst);
+			rs = get_rs(inst);
+			ws = get_ws(inst);
+			emulated = kvmppc_emul_tlbwe(vcpu, ra, rs, ws);
 			break;
 
-		case 914:       {                               /* tlbsx */
-			int index;
-			unsigned int as = get_mmucr_sts(vcpu);
-			unsigned int pid = get_mmucr_stid(vcpu);
-
+		case 914:                                       /* tlbsx */
 			rt = get_rt(inst);
 			ra = get_ra(inst);
 			rb = get_rb(inst);
 			rc = get_rc(inst);
-
-			ea = vcpu->arch.gpr[rb];
-			if (ra)
-				ea += vcpu->arch.gpr[ra];
-
-			index = kvmppc_44x_tlb_index(vcpu, ea, pid, as);
-			if (rc) {
-				if (index < 0)
-					vcpu->arch.cr &= ~0x20000000;
-				else
-					vcpu->arch.cr |= 0x20000000;
-			}
-			vcpu->arch.gpr[rt] = index;
-
-			}
+			emulated = kvmppc_emul_tlbsx(vcpu, rt, ra, rb, rc);
 			break;
 
 		case 790:                                       /* lhbrx */

From 0f55dc481ea5c4f87fc0161cb1b8c6e2cafae8fc Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Wed, 5 Nov 2008 09:36:12 -0600
Subject: [PATCH 039/140] KVM: ppc: Rename "struct tlbe" to "struct
 kvmppc_44x_tlbe"

This will ease ports to other cores.

Also remove unused "struct kvm_tlb" while we're at it.

Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/powerpc/include/asm/kvm_host.h |  6 +++---
 arch/powerpc/include/asm/kvm_ppc.h  |  5 -----
 arch/powerpc/kernel/asm-offsets.c   |  2 +-
 arch/powerpc/kvm/44x_tlb.c          | 22 ++++++++++++----------
 arch/powerpc/kvm/44x_tlb.h          | 24 +++++++++++++-----------
 arch/powerpc/kvm/booke_guest.c      |  8 ++++----
 6 files changed, 33 insertions(+), 34 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 34b52b7180cd..df733511d671 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -64,7 +64,7 @@ struct kvm_vcpu_stat {
 	u32 halt_wakeup;
 };
 
-struct tlbe {
+struct kvmppc_44x_tlbe {
 	u32 tid; /* Only the low 8 bits are used. */
 	u32 word0;
 	u32 word1;
@@ -76,9 +76,9 @@ struct kvm_arch {
 
 struct kvm_vcpu_arch {
 	/* Unmodified copy of the guest's TLB. */
-	struct tlbe guest_tlb[PPC44x_TLB_SIZE];
+	struct kvmppc_44x_tlbe guest_tlb[PPC44x_TLB_SIZE];
 	/* TLB that's actually used when the guest is running. */
-	struct tlbe shadow_tlb[PPC44x_TLB_SIZE];
+	struct kvmppc_44x_tlbe shadow_tlb[PPC44x_TLB_SIZE];
 	/* Pages which are referenced in the shadow TLB. */
 	struct page *shadow_pages[PPC44x_TLB_SIZE];
 
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 4adb4a397508..39daeaa82b53 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -29,11 +29,6 @@
 #include <linux/kvm_types.h>
 #include <linux/kvm_host.h>
 
-struct kvm_tlb {
-	struct tlbe guest_tlb[PPC44x_TLB_SIZE];
-	struct tlbe shadow_tlb[PPC44x_TLB_SIZE];
-};
-
 enum emulation_result {
 	EMULATE_DONE,         /* no further processing */
 	EMULATE_DO_MMIO,      /* kvm_run filled with MMIO request */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 661d07d2146b..0264c97e02b5 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -357,7 +357,7 @@ int main(void)
 	DEFINE(PTE_SIZE, sizeof(pte_t));
 
 #ifdef CONFIG_KVM
-	DEFINE(TLBE_BYTES, sizeof(struct tlbe));
+	DEFINE(TLBE_BYTES, sizeof(struct kvmppc_44x_tlbe));
 
 	DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack));
 	DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index dd75ab84e04e..5152fe5b2a9b 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -86,7 +86,7 @@ int kvmppc_44x_tlb_index(struct kvm_vcpu *vcpu, gva_t eaddr, unsigned int pid,
 
 	/* XXX Replace loop with fancy data structures. */
 	for (i = 0; i < PPC44x_TLB_SIZE; i++) {
-		struct tlbe *tlbe = &vcpu->arch.guest_tlb[i];
+		struct kvmppc_44x_tlbe *tlbe = &vcpu->arch.guest_tlb[i];
 		unsigned int tid;
 
 		if (eaddr < get_tlb_eaddr(tlbe))
@@ -111,7 +111,8 @@ int kvmppc_44x_tlb_index(struct kvm_vcpu *vcpu, gva_t eaddr, unsigned int pid,
 	return -1;
 }
 
-struct tlbe *kvmppc_44x_itlb_search(struct kvm_vcpu *vcpu, gva_t eaddr)
+struct kvmppc_44x_tlbe *kvmppc_44x_itlb_search(struct kvm_vcpu *vcpu,
+                                               gva_t eaddr)
 {
 	unsigned int as = !!(vcpu->arch.msr & MSR_IS);
 	unsigned int index;
@@ -122,7 +123,8 @@ struct tlbe *kvmppc_44x_itlb_search(struct kvm_vcpu *vcpu, gva_t eaddr)
 	return &vcpu->arch.guest_tlb[index];
 }
 
-struct tlbe *kvmppc_44x_dtlb_search(struct kvm_vcpu *vcpu, gva_t eaddr)
+struct kvmppc_44x_tlbe *kvmppc_44x_dtlb_search(struct kvm_vcpu *vcpu,
+                                               gva_t eaddr)
 {
 	unsigned int as = !!(vcpu->arch.msr & MSR_DS);
 	unsigned int index;
@@ -133,7 +135,7 @@ struct tlbe *kvmppc_44x_dtlb_search(struct kvm_vcpu *vcpu, gva_t eaddr)
 	return &vcpu->arch.guest_tlb[index];
 }
 
-static int kvmppc_44x_tlbe_is_writable(struct tlbe *tlbe)
+static int kvmppc_44x_tlbe_is_writable(struct kvmppc_44x_tlbe *tlbe)
 {
 	return tlbe->word2 & (PPC44x_TLB_SW|PPC44x_TLB_UW);
 }
@@ -141,7 +143,7 @@ static int kvmppc_44x_tlbe_is_writable(struct tlbe *tlbe)
 static void kvmppc_44x_shadow_release(struct kvm_vcpu *vcpu,
                                       unsigned int index)
 {
-	struct tlbe *stlbe = &vcpu->arch.shadow_tlb[index];
+	struct kvmppc_44x_tlbe *stlbe = &vcpu->arch.shadow_tlb[index];
 	struct page *page = vcpu->arch.shadow_pages[index];
 
 	if (get_tlb_v(stlbe)) {
@@ -171,7 +173,7 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
                     u32 flags)
 {
 	struct page *new_page;
-	struct tlbe *stlbe;
+	struct kvmppc_44x_tlbe *stlbe;
 	hpa_t hpaddr;
 	unsigned int victim;
 
@@ -227,7 +229,7 @@ static void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
 
 	/* XXX Replace loop with fancy data structures. */
 	for (i = 0; i <= tlb_44x_hwater; i++) {
-		struct tlbe *stlbe = &vcpu->arch.shadow_tlb[i];
+		struct kvmppc_44x_tlbe *stlbe = &vcpu->arch.shadow_tlb[i];
 		unsigned int tid;
 
 		if (!get_tlb_v(stlbe))
@@ -262,7 +264,7 @@ void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode)
 	if (vcpu->arch.swap_pid) {
 		/* XXX Replace loop with fancy data structures. */
 		for (i = 0; i <= tlb_44x_hwater; i++) {
-			struct tlbe *stlbe = &vcpu->arch.shadow_tlb[i];
+			struct kvmppc_44x_tlbe *stlbe = &vcpu->arch.shadow_tlb[i];
 
 			/* Future optimization: clear only userspace mappings. */
 			kvmppc_44x_shadow_release(vcpu, i);
@@ -279,7 +281,7 @@ void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode)
 }
 
 static int tlbe_is_host_safe(const struct kvm_vcpu *vcpu,
-                             const struct tlbe *tlbe)
+                             const struct kvmppc_44x_tlbe *tlbe)
 {
 	gpa_t gpa;
 
@@ -305,7 +307,7 @@ int kvmppc_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws)
 	u64 raddr;
 	u64 asid;
 	u32 flags;
-	struct tlbe *tlbe;
+	struct kvmppc_44x_tlbe *tlbe;
 	unsigned int index;
 
 	index = vcpu->arch.gpr[ra];
diff --git a/arch/powerpc/kvm/44x_tlb.h b/arch/powerpc/kvm/44x_tlb.h
index 2ccd46b6f6b7..e5b0a76798bd 100644
--- a/arch/powerpc/kvm/44x_tlb.h
+++ b/arch/powerpc/kvm/44x_tlb.h
@@ -25,48 +25,50 @@
 
 extern int kvmppc_44x_tlb_index(struct kvm_vcpu *vcpu, gva_t eaddr,
                                 unsigned int pid, unsigned int as);
-extern struct tlbe *kvmppc_44x_dtlb_search(struct kvm_vcpu *vcpu, gva_t eaddr);
-extern struct tlbe *kvmppc_44x_itlb_search(struct kvm_vcpu *vcpu, gva_t eaddr);
+extern struct kvmppc_44x_tlbe *kvmppc_44x_dtlb_search(struct kvm_vcpu *vcpu,
+                                                      gva_t eaddr);
+extern struct kvmppc_44x_tlbe *kvmppc_44x_itlb_search(struct kvm_vcpu *vcpu,
+                                                      gva_t eaddr);
 
 /* TLB helper functions */
-static inline unsigned int get_tlb_size(const struct tlbe *tlbe)
+static inline unsigned int get_tlb_size(const struct kvmppc_44x_tlbe *tlbe)
 {
 	return (tlbe->word0 >> 4) & 0xf;
 }
 
-static inline gva_t get_tlb_eaddr(const struct tlbe *tlbe)
+static inline gva_t get_tlb_eaddr(const struct kvmppc_44x_tlbe *tlbe)
 {
 	return tlbe->word0 & 0xfffffc00;
 }
 
-static inline gva_t get_tlb_bytes(const struct tlbe *tlbe)
+static inline gva_t get_tlb_bytes(const struct kvmppc_44x_tlbe *tlbe)
 {
 	unsigned int pgsize = get_tlb_size(tlbe);
 	return 1 << 10 << (pgsize << 1);
 }
 
-static inline gva_t get_tlb_end(const struct tlbe *tlbe)
+static inline gva_t get_tlb_end(const struct kvmppc_44x_tlbe *tlbe)
 {
 	return get_tlb_eaddr(tlbe) + get_tlb_bytes(tlbe) - 1;
 }
 
-static inline u64 get_tlb_raddr(const struct tlbe *tlbe)
+static inline u64 get_tlb_raddr(const struct kvmppc_44x_tlbe *tlbe)
 {
 	u64 word1 = tlbe->word1;
 	return ((word1 & 0xf) << 32) | (word1 & 0xfffffc00);
 }
 
-static inline unsigned int get_tlb_tid(const struct tlbe *tlbe)
+static inline unsigned int get_tlb_tid(const struct kvmppc_44x_tlbe *tlbe)
 {
 	return tlbe->tid & 0xff;
 }
 
-static inline unsigned int get_tlb_ts(const struct tlbe *tlbe)
+static inline unsigned int get_tlb_ts(const struct kvmppc_44x_tlbe *tlbe)
 {
 	return (tlbe->word0 >> 8) & 0x1;
 }
 
-static inline unsigned int get_tlb_v(const struct tlbe *tlbe)
+static inline unsigned int get_tlb_v(const struct kvmppc_44x_tlbe *tlbe)
 {
 	return (tlbe->word0 >> 9) & 0x1;
 }
@@ -81,7 +83,7 @@ static inline unsigned int get_mmucr_sts(const struct kvm_vcpu *vcpu)
 	return (vcpu->arch.mmucr >> 16) & 0x1;
 }
 
-static inline gpa_t tlb_xlate(struct tlbe *tlbe, gva_t eaddr)
+static inline gpa_t tlb_xlate(struct kvmppc_44x_tlbe *tlbe, gva_t eaddr)
 {
 	unsigned int pgmask = get_tlb_bytes(tlbe) - 1;
 
diff --git a/arch/powerpc/kvm/booke_guest.c b/arch/powerpc/kvm/booke_guest.c
index c0f8532630ec..41bbf4c78f88 100644
--- a/arch/powerpc/kvm/booke_guest.c
+++ b/arch/powerpc/kvm/booke_guest.c
@@ -307,7 +307,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		break;
 
 	case BOOKE_INTERRUPT_DTLB_MISS: {
-		struct tlbe *gtlbe;
+		struct kvmppc_44x_tlbe *gtlbe;
 		unsigned long eaddr = vcpu->arch.fault_dear;
 		gfn_t gfn;
 
@@ -347,7 +347,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	}
 
 	case BOOKE_INTERRUPT_ITLB_MISS: {
-		struct tlbe *gtlbe;
+		struct kvmppc_44x_tlbe *gtlbe;
 		unsigned long eaddr = vcpu->arch.pc;
 		gfn_t gfn;
 
@@ -442,7 +442,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 /* Initial guest state: 16MB mapping 0 -> 0, PC = 0, MSR = 0, R1 = 16MB */
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
-	struct tlbe *tlbe = &vcpu->arch.guest_tlb[0];
+	struct kvmppc_44x_tlbe *tlbe = &vcpu->arch.guest_tlb[0];
 
 	tlbe->tid = 0;
 	tlbe->word0 = PPC44x_TLB_16M | PPC44x_TLB_VALID;
@@ -553,7 +553,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
                                   struct kvm_translation *tr)
 {
-	struct tlbe *gtlbe;
+	struct kvmppc_44x_tlbe *gtlbe;
 	int index;
 	gva_t eaddr;
 	u8 pid;

From d9fbd03d240380826c0ec16f927242be24ff6265 Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Wed, 5 Nov 2008 09:36:13 -0600
Subject: [PATCH 040/140] KVM: ppc: combine booke_guest.c and booke_host.c

The division was somewhat artificial and cumbersome, and had no functional
benefit anyways: we can only guests built for the real host processor.

Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/powerpc/kvm/Kconfig                    |  6 +-
 arch/powerpc/kvm/Makefile                   |  6 +-
 arch/powerpc/kvm/{booke_guest.c => booke.c} | 60 +++++++++++++++
 arch/powerpc/kvm/booke_host.c               | 83 ---------------------
 4 files changed, 66 insertions(+), 89 deletions(-)
 rename arch/powerpc/kvm/{booke_guest.c => booke.c} (90%)
 delete mode 100644 arch/powerpc/kvm/booke_host.c

diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 53aaa66b25e5..ffed96f817f7 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -20,7 +20,7 @@ config KVM
 	select PREEMPT_NOTIFIERS
 	select ANON_INODES
 	# We can only run on Book E hosts so far
-	select KVM_BOOKE_HOST
+	select KVM_BOOKE
 	---help---
 	  Support hosting virtualized guest machines. You will also
 	  need to select one or more of the processor modules below.
@@ -30,8 +30,8 @@ config KVM
 
 	  If unsure, say N.
 
-config KVM_BOOKE_HOST
-	bool "KVM host support for Book E PowerPC processors"
+config KVM_BOOKE
+	bool "KVM support for Book E PowerPC processors"
 	depends on KVM && 44x
 	---help---
 	  Provides host support for KVM on Book E PowerPC processors. Currently
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index 2a5d4397ac4b..a7f857446c8a 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -8,10 +8,10 @@ common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o)
 
 common-objs-$(CONFIG_KVM_TRACE)  += $(addprefix ../../../virt/kvm/, kvm_trace.o)
 
-kvm-objs := $(common-objs-y) powerpc.o emulate.o booke_guest.o
+kvm-objs := $(common-objs-y) powerpc.o emulate.o
 obj-$(CONFIG_KVM) += kvm.o
 
 AFLAGS_booke_interrupts.o := -I$(obj)
 
-kvm-booke-host-objs := booke_host.o booke_interrupts.o 44x_tlb.o
-obj-$(CONFIG_KVM_BOOKE_HOST) += kvm-booke-host.o
+kvm-booke-objs := booke.o booke_interrupts.o 44x_tlb.o
+obj-$(CONFIG_KVM_BOOKE) += kvm-booke.o
diff --git a/arch/powerpc/kvm/booke_guest.c b/arch/powerpc/kvm/booke.c
similarity index 90%
rename from arch/powerpc/kvm/booke_guest.c
rename to arch/powerpc/kvm/booke.c
index 41bbf4c78f88..b1e90a15155a 100644
--- a/arch/powerpc/kvm/booke_guest.c
+++ b/arch/powerpc/kvm/booke.c
@@ -27,9 +27,12 @@
 #include <asm/cputable.h>
 #include <asm/uaccess.h>
 #include <asm/kvm_ppc.h>
+#include <asm/cacheflush.h>
 
 #include "44x_tlb.h"
 
+unsigned long kvmppc_booke_handlers;
+
 #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
 
@@ -577,3 +580,60 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 
 	return 0;
 }
+
+static int kvmppc_booke_init(void)
+{
+	unsigned long ivor[16];
+	unsigned long max_ivor = 0;
+	int i;
+
+	/* We install our own exception handlers by hijacking IVPR. IVPR must
+	 * be 16-bit aligned, so we need a 64KB allocation. */
+	kvmppc_booke_handlers = __get_free_pages(GFP_KERNEL | __GFP_ZERO,
+	                                         VCPU_SIZE_ORDER);
+	if (!kvmppc_booke_handlers)
+		return -ENOMEM;
+
+	/* XXX make sure our handlers are smaller than Linux's */
+
+	/* Copy our interrupt handlers to match host IVORs. That way we don't
+	 * have to swap the IVORs on every guest/host transition. */
+	ivor[0] = mfspr(SPRN_IVOR0);
+	ivor[1] = mfspr(SPRN_IVOR1);
+	ivor[2] = mfspr(SPRN_IVOR2);
+	ivor[3] = mfspr(SPRN_IVOR3);
+	ivor[4] = mfspr(SPRN_IVOR4);
+	ivor[5] = mfspr(SPRN_IVOR5);
+	ivor[6] = mfspr(SPRN_IVOR6);
+	ivor[7] = mfspr(SPRN_IVOR7);
+	ivor[8] = mfspr(SPRN_IVOR8);
+	ivor[9] = mfspr(SPRN_IVOR9);
+	ivor[10] = mfspr(SPRN_IVOR10);
+	ivor[11] = mfspr(SPRN_IVOR11);
+	ivor[12] = mfspr(SPRN_IVOR12);
+	ivor[13] = mfspr(SPRN_IVOR13);
+	ivor[14] = mfspr(SPRN_IVOR14);
+	ivor[15] = mfspr(SPRN_IVOR15);
+
+	for (i = 0; i < 16; i++) {
+		if (ivor[i] > max_ivor)
+			max_ivor = ivor[i];
+
+		memcpy((void *)kvmppc_booke_handlers + ivor[i],
+		       kvmppc_handlers_start + i * kvmppc_handler_len,
+		       kvmppc_handler_len);
+	}
+	flush_icache_range(kvmppc_booke_handlers,
+	                   kvmppc_booke_handlers + max_ivor + kvmppc_handler_len);
+
+	return kvm_init(NULL, sizeof(struct kvm_vcpu), THIS_MODULE);
+}
+
+static void __exit kvmppc_booke_exit(void)
+{
+	free_pages(kvmppc_booke_handlers, VCPU_SIZE_ORDER);
+	kvm_exit();
+}
+
+module_init(kvmppc_booke_init)
+module_exit(kvmppc_booke_exit)
diff --git a/arch/powerpc/kvm/booke_host.c b/arch/powerpc/kvm/booke_host.c
deleted file mode 100644
index b480341bc31e..000000000000
--- a/arch/powerpc/kvm/booke_host.c
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
- *
- * Copyright IBM Corp. 2008
- *
- * Authors: Hollis Blanchard <hollisb@us.ibm.com>
- */
-
-#include <linux/errno.h>
-#include <linux/kvm_host.h>
-#include <linux/module.h>
-#include <asm/cacheflush.h>
-#include <asm/kvm_ppc.h>
-
-unsigned long kvmppc_booke_handlers;
-
-static int kvmppc_booke_init(void)
-{
-	unsigned long ivor[16];
-	unsigned long max_ivor = 0;
-	int i;
-
-	/* We install our own exception handlers by hijacking IVPR. IVPR must
-	 * be 16-bit aligned, so we need a 64KB allocation. */
-	kvmppc_booke_handlers = __get_free_pages(GFP_KERNEL | __GFP_ZERO,
-	                                         VCPU_SIZE_ORDER);
-	if (!kvmppc_booke_handlers)
-		return -ENOMEM;
-
-	/* XXX make sure our handlers are smaller than Linux's */
-
-	/* Copy our interrupt handlers to match host IVORs. That way we don't
-	 * have to swap the IVORs on every guest/host transition. */
-	ivor[0] = mfspr(SPRN_IVOR0);
-	ivor[1] = mfspr(SPRN_IVOR1);
-	ivor[2] = mfspr(SPRN_IVOR2);
-	ivor[3] = mfspr(SPRN_IVOR3);
-	ivor[4] = mfspr(SPRN_IVOR4);
-	ivor[5] = mfspr(SPRN_IVOR5);
-	ivor[6] = mfspr(SPRN_IVOR6);
-	ivor[7] = mfspr(SPRN_IVOR7);
-	ivor[8] = mfspr(SPRN_IVOR8);
-	ivor[9] = mfspr(SPRN_IVOR9);
-	ivor[10] = mfspr(SPRN_IVOR10);
-	ivor[11] = mfspr(SPRN_IVOR11);
-	ivor[12] = mfspr(SPRN_IVOR12);
-	ivor[13] = mfspr(SPRN_IVOR13);
-	ivor[14] = mfspr(SPRN_IVOR14);
-	ivor[15] = mfspr(SPRN_IVOR15);
-
-	for (i = 0; i < 16; i++) {
-		if (ivor[i] > max_ivor)
-			max_ivor = ivor[i];
-
-		memcpy((void *)kvmppc_booke_handlers + ivor[i],
-		       kvmppc_handlers_start + i * kvmppc_handler_len,
-		       kvmppc_handler_len);
-	}
-	flush_icache_range(kvmppc_booke_handlers,
-	                   kvmppc_booke_handlers + max_ivor + kvmppc_handler_len);
-
-	return kvm_init(NULL, sizeof(struct kvm_vcpu), THIS_MODULE);
-}
-
-static void __exit kvmppc_booke_exit(void)
-{
-	free_pages(kvmppc_booke_handlers, VCPU_SIZE_ORDER);
-	kvm_exit();
-}
-
-module_init(kvmppc_booke_init)
-module_exit(kvmppc_booke_exit)

From 9dd921cfea734409a931ccc6eafd7f09850311e9 Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Wed, 5 Nov 2008 09:36:14 -0600
Subject: [PATCH 041/140] KVM: ppc: Refactor powerpc.c to relocate 440-specific
 code

This introduces a set of core-provided hooks. For 440, some of these are
implemented by booke.c, with the rest in (the new) 44x.c.

Note that these hooks are link-time, not run-time. Since it is not possible to
build a single kernel for both e500 and 440 (for example), using function
pointers would only add overhead.

Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/powerpc/include/asm/kvm_host.h |   3 +
 arch/powerpc/include/asm/kvm_ppc.h  |  34 ++++----
 arch/powerpc/kvm/44x.c              | 123 ++++++++++++++++++++++++++++
 arch/powerpc/kvm/44x_tlb.h          |   1 +
 arch/powerpc/kvm/Kconfig            |  11 +--
 arch/powerpc/kvm/Makefile           |   4 +-
 arch/powerpc/kvm/booke.c            |  61 ++++++++++----
 arch/powerpc/kvm/emulate.c          |   2 +-
 arch/powerpc/kvm/powerpc.c          |  98 ++--------------------
 9 files changed, 207 insertions(+), 130 deletions(-)
 create mode 100644 arch/powerpc/kvm/44x.c

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index df733511d671..f5850d7d57a5 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -74,6 +74,9 @@ struct kvmppc_44x_tlbe {
 struct kvm_arch {
 };
 
+/* XXX Can't include mmu-44x.h because it redefines struct mm_context. */
+#define PPC44x_TLB_SIZE 64
+
 struct kvm_vcpu_arch {
 	/* Unmodified copy of the guest's TLB. */
 	struct kvmppc_44x_tlbe guest_tlb[PPC44x_TLB_SIZE];
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 39daeaa82b53..96d5de90ac5a 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -61,23 +61,6 @@ extern void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn,
 extern void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode);
 extern void kvmppc_mmu_switch_pid(struct kvm_vcpu *vcpu, u32 pid);
 
-/* XXX Book E specific */
-extern void kvmppc_tlbe_set_modified(struct kvm_vcpu *vcpu, unsigned int i);
-
-extern void kvmppc_check_and_deliver_interrupts(struct kvm_vcpu *vcpu);
-
-static inline void kvmppc_queue_exception(struct kvm_vcpu *vcpu, int exception)
-{
-	unsigned int priority = exception_priority[exception];
-	set_bit(priority, &vcpu->arch.pending_exceptions);
-}
-
-static inline void kvmppc_clear_exception(struct kvm_vcpu *vcpu, int exception)
-{
-	unsigned int priority = exception_priority[exception];
-	clear_bit(priority, &vcpu->arch.pending_exceptions);
-}
-
 /* Helper function for "full" MSR writes. No need to call this if only EE is
  * changing. */
 static inline void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
@@ -99,6 +82,23 @@ static inline void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 new_pid)
 	}
 }
 
+/* Core-specific hooks */
+
+extern int kvmppc_core_check_processor_compat(void);
+
+extern void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
+extern void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu);
+
+extern void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu);
+extern void kvmppc_core_load_host_debugstate(struct kvm_vcpu *vcpu);
+
+extern void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu);
+extern int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu);
+extern void kvmppc_core_queue_program(struct kvm_vcpu *vcpu);
+extern void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu);
+extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
+                                       struct kvm_interrupt *irq);
+
 extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu);
 
 #endif /* __POWERPC_KVM_PPC_H__ */
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
new file mode 100644
index 000000000000..fcf8c7d0af45
--- /dev/null
+++ b/arch/powerpc/kvm/44x.c
@@ -0,0 +1,123 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/reg.h>
+#include <asm/cputable.h>
+#include <asm/tlbflush.h>
+
+#include "44x_tlb.h"
+
+/* Note: clearing MSR[DE] just means that the debug interrupt will not be
+ * delivered *immediately*. Instead, it simply sets the appropriate DBSR bits.
+ * If those DBSR bits are still set when MSR[DE] is re-enabled, the interrupt
+ * will be delivered as an "imprecise debug event" (which is indicated by
+ * DBSR[IDE].
+ */
+static void kvm44x_disable_debug_interrupts(void)
+{
+	mtmsr(mfmsr() & ~MSR_DE);
+}
+
+void kvmppc_core_load_host_debugstate(struct kvm_vcpu *vcpu)
+{
+	kvm44x_disable_debug_interrupts();
+
+	mtspr(SPRN_IAC1, vcpu->arch.host_iac[0]);
+	mtspr(SPRN_IAC2, vcpu->arch.host_iac[1]);
+	mtspr(SPRN_IAC3, vcpu->arch.host_iac[2]);
+	mtspr(SPRN_IAC4, vcpu->arch.host_iac[3]);
+	mtspr(SPRN_DBCR1, vcpu->arch.host_dbcr1);
+	mtspr(SPRN_DBCR2, vcpu->arch.host_dbcr2);
+	mtspr(SPRN_DBCR0, vcpu->arch.host_dbcr0);
+	mtmsr(vcpu->arch.host_msr);
+}
+
+void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu)
+{
+	struct kvm_guest_debug *dbg = &vcpu->guest_debug;
+	u32 dbcr0 = 0;
+
+	vcpu->arch.host_msr = mfmsr();
+	kvm44x_disable_debug_interrupts();
+
+	/* Save host debug register state. */
+	vcpu->arch.host_iac[0] = mfspr(SPRN_IAC1);
+	vcpu->arch.host_iac[1] = mfspr(SPRN_IAC2);
+	vcpu->arch.host_iac[2] = mfspr(SPRN_IAC3);
+	vcpu->arch.host_iac[3] = mfspr(SPRN_IAC4);
+	vcpu->arch.host_dbcr0 = mfspr(SPRN_DBCR0);
+	vcpu->arch.host_dbcr1 = mfspr(SPRN_DBCR1);
+	vcpu->arch.host_dbcr2 = mfspr(SPRN_DBCR2);
+
+	/* set registers up for guest */
+
+	if (dbg->bp[0]) {
+		mtspr(SPRN_IAC1, dbg->bp[0]);
+		dbcr0 |= DBCR0_IAC1 | DBCR0_IDM;
+	}
+	if (dbg->bp[1]) {
+		mtspr(SPRN_IAC2, dbg->bp[1]);
+		dbcr0 |= DBCR0_IAC2 | DBCR0_IDM;
+	}
+	if (dbg->bp[2]) {
+		mtspr(SPRN_IAC3, dbg->bp[2]);
+		dbcr0 |= DBCR0_IAC3 | DBCR0_IDM;
+	}
+	if (dbg->bp[3]) {
+		mtspr(SPRN_IAC4, dbg->bp[3]);
+		dbcr0 |= DBCR0_IAC4 | DBCR0_IDM;
+	}
+
+	mtspr(SPRN_DBCR0, dbcr0);
+	mtspr(SPRN_DBCR1, 0);
+	mtspr(SPRN_DBCR2, 0);
+}
+
+void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+	int i;
+
+	/* Mark every guest entry in the shadow TLB entry modified, so that they
+	 * will all be reloaded on the next vcpu run (instead of being
+	 * demand-faulted). */
+	for (i = 0; i <= tlb_44x_hwater; i++)
+		kvmppc_tlbe_set_modified(vcpu, i);
+}
+
+void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
+{
+	/* Don't leave guest TLB entries resident when being de-scheduled. */
+	/* XXX It would be nice to differentiate between heavyweight exit and
+	 * sched_out here, since we could avoid the TLB flush for heavyweight
+	 * exits. */
+	_tlbia();
+}
+
+int kvmppc_core_check_processor_compat(void)
+{
+	int r;
+
+	if (strcmp(cur_cpu_spec->platform, "ppc440") == 0)
+		r = 0;
+	else
+		r = -ENOTSUPP;
+
+	return r;
+}
diff --git a/arch/powerpc/kvm/44x_tlb.h b/arch/powerpc/kvm/44x_tlb.h
index e5b0a76798bd..357d79ae5493 100644
--- a/arch/powerpc/kvm/44x_tlb.h
+++ b/arch/powerpc/kvm/44x_tlb.h
@@ -29,6 +29,7 @@ extern struct kvmppc_44x_tlbe *kvmppc_44x_dtlb_search(struct kvm_vcpu *vcpu,
                                                       gva_t eaddr);
 extern struct kvmppc_44x_tlbe *kvmppc_44x_itlb_search(struct kvm_vcpu *vcpu,
                                                       gva_t eaddr);
+extern void kvmppc_tlbe_set_modified(struct kvm_vcpu *vcpu, unsigned int i);
 
 /* TLB helper functions */
 static inline unsigned int get_tlb_size(const struct kvmppc_44x_tlbe *tlbe)
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index ffed96f817f7..37e9b3c52a38 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -16,11 +16,9 @@ if VIRTUALIZATION
 
 config KVM
 	bool "Kernel-based Virtual Machine (KVM) support"
-	depends on 44x && EXPERIMENTAL
+	depends on EXPERIMENTAL
 	select PREEMPT_NOTIFIERS
 	select ANON_INODES
-	# We can only run on Book E hosts so far
-	select KVM_BOOKE
 	---help---
 	  Support hosting virtualized guest machines. You will also
 	  need to select one or more of the processor modules below.
@@ -30,12 +28,11 @@ config KVM
 
 	  If unsure, say N.
 
-config KVM_BOOKE
-	bool "KVM support for Book E PowerPC processors"
+config KVM_440
+	bool "KVM support for PowerPC 440 processors"
 	depends on KVM && 44x
 	---help---
-	  Provides host support for KVM on Book E PowerPC processors. Currently
-	  this works on 440 processors only.
+	  KVM can run unmodified 440 guest kernels on 440 host processors.
 
 config KVM_TRACE
 	bool "KVM trace support"
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index a7f857446c8a..f5e33756f318 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -13,5 +13,5 @@ obj-$(CONFIG_KVM) += kvm.o
 
 AFLAGS_booke_interrupts.o := -I$(obj)
 
-kvm-booke-objs := booke.o booke_interrupts.o 44x_tlb.o
-obj-$(CONFIG_KVM_BOOKE) += kvm-booke.o
+kvm-440-objs := booke.o booke_interrupts.o 44x.o 44x_tlb.o
+obj-$(CONFIG_KVM_440) += kvm-440.o
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index b1e90a15155a..138014acf3cf 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -134,6 +134,40 @@ void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu)
 	}
 }
 
+static void kvmppc_booke_queue_exception(struct kvm_vcpu *vcpu, int exception)
+{
+	unsigned int priority = exception_priority[exception];
+	set_bit(priority, &vcpu->arch.pending_exceptions);
+}
+
+static void kvmppc_booke_clear_exception(struct kvm_vcpu *vcpu, int exception)
+{
+	unsigned int priority = exception_priority[exception];
+	clear_bit(priority, &vcpu->arch.pending_exceptions);
+}
+
+void kvmppc_core_queue_program(struct kvm_vcpu *vcpu)
+{
+	kvmppc_booke_queue_exception(vcpu, BOOKE_INTERRUPT_PROGRAM);
+}
+
+void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu)
+{
+	kvmppc_booke_queue_exception(vcpu, BOOKE_INTERRUPT_DECREMENTER);
+}
+
+int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu)
+{
+	unsigned int priority = exception_priority[BOOKE_INTERRUPT_DECREMENTER];
+	return test_bit(priority, &vcpu->arch.pending_exceptions);
+}
+
+void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
+                                struct kvm_interrupt *irq)
+{
+	kvmppc_booke_queue_exception(vcpu, BOOKE_INTERRUPT_EXTERNAL);
+}
+
 /* Check if we are ready to deliver the interrupt */
 static int kvmppc_can_deliver_interrupt(struct kvm_vcpu *vcpu, int interrupt)
 {
@@ -168,7 +202,7 @@ static int kvmppc_can_deliver_interrupt(struct kvm_vcpu *vcpu, int interrupt)
 	return r;
 }
 
-static void kvmppc_deliver_interrupt(struct kvm_vcpu *vcpu, int interrupt)
+static void kvmppc_booke_deliver_interrupt(struct kvm_vcpu *vcpu, int interrupt)
 {
 	switch (interrupt) {
 	case BOOKE_INTERRUPT_DECREMENTER:
@@ -183,7 +217,7 @@ static void kvmppc_deliver_interrupt(struct kvm_vcpu *vcpu, int interrupt)
 }
 
 /* Check pending exceptions and deliver one, if possible. */
-void kvmppc_check_and_deliver_interrupts(struct kvm_vcpu *vcpu)
+void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
 {
 	unsigned long *pending = &vcpu->arch.pending_exceptions;
 	unsigned int exception;
@@ -193,8 +227,8 @@ void kvmppc_check_and_deliver_interrupts(struct kvm_vcpu *vcpu)
 	while (priority <= BOOKE_MAX_INTERRUPT) {
 		exception = priority_exception[priority];
 		if (kvmppc_can_deliver_interrupt(vcpu, exception)) {
-			kvmppc_clear_exception(vcpu, exception);
-			kvmppc_deliver_interrupt(vcpu, exception);
+			kvmppc_booke_clear_exception(vcpu, exception);
+			kvmppc_booke_deliver_interrupt(vcpu, exception);
 			break;
 		}
 
@@ -251,7 +285,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			/* Program traps generated by user-level software must be handled
 			 * by the guest kernel. */
 			vcpu->arch.esr = vcpu->arch.fault_esr;
-			kvmppc_queue_exception(vcpu, BOOKE_INTERRUPT_PROGRAM);
+			kvmppc_booke_queue_exception(vcpu, BOOKE_INTERRUPT_PROGRAM);
 			r = RESUME_GUEST;
 			break;
 		}
@@ -284,27 +318,27 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		break;
 
 	case BOOKE_INTERRUPT_FP_UNAVAIL:
-		kvmppc_queue_exception(vcpu, exit_nr);
+		kvmppc_booke_queue_exception(vcpu, exit_nr);
 		r = RESUME_GUEST;
 		break;
 
 	case BOOKE_INTERRUPT_DATA_STORAGE:
 		vcpu->arch.dear = vcpu->arch.fault_dear;
 		vcpu->arch.esr = vcpu->arch.fault_esr;
-		kvmppc_queue_exception(vcpu, exit_nr);
+		kvmppc_booke_queue_exception(vcpu, exit_nr);
 		vcpu->stat.dsi_exits++;
 		r = RESUME_GUEST;
 		break;
 
 	case BOOKE_INTERRUPT_INST_STORAGE:
 		vcpu->arch.esr = vcpu->arch.fault_esr;
-		kvmppc_queue_exception(vcpu, exit_nr);
+		kvmppc_booke_queue_exception(vcpu, exit_nr);
 		vcpu->stat.isi_exits++;
 		r = RESUME_GUEST;
 		break;
 
 	case BOOKE_INTERRUPT_SYSCALL:
-		kvmppc_queue_exception(vcpu, exit_nr);
+		kvmppc_booke_queue_exception(vcpu, exit_nr);
 		vcpu->stat.syscall_exits++;
 		r = RESUME_GUEST;
 		break;
@@ -318,7 +352,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		gtlbe = kvmppc_44x_dtlb_search(vcpu, eaddr);
 		if (!gtlbe) {
 			/* The guest didn't have a mapping for it. */
-			kvmppc_queue_exception(vcpu, exit_nr);
+			kvmppc_booke_queue_exception(vcpu, exit_nr);
 			vcpu->arch.dear = vcpu->arch.fault_dear;
 			vcpu->arch.esr = vcpu->arch.fault_esr;
 			vcpu->stat.dtlb_real_miss_exits++;
@@ -360,7 +394,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		gtlbe = kvmppc_44x_itlb_search(vcpu, eaddr);
 		if (!gtlbe) {
 			/* The guest didn't have a mapping for it. */
-			kvmppc_queue_exception(vcpu, exit_nr);
+			kvmppc_booke_queue_exception(vcpu, exit_nr);
 			vcpu->stat.itlb_real_miss_exits++;
 			break;
 		}
@@ -380,8 +414,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			               gtlbe->word2);
 		} else {
 			/* Guest mapped and leaped at non-RAM! */
-			kvmppc_queue_exception(vcpu,
-			                       BOOKE_INTERRUPT_MACHINE_CHECK);
+			kvmppc_booke_queue_exception(vcpu, BOOKE_INTERRUPT_MACHINE_CHECK);
 		}
 
 		break;
@@ -409,7 +442,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
 	local_irq_disable();
 
-	kvmppc_check_and_deliver_interrupts(vcpu);
+	kvmppc_core_deliver_interrupts(vcpu);
 
 	/* Do some exit accounting. */
 	vcpu->stat.sum_exits++;
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index 0ce8ed539bae..c5d2bfcf567a 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -139,7 +139,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	switch (get_op(inst)) {
 	case 3:                                                 /* trap */
 		printk("trap!\n");
-		kvmppc_queue_exception(vcpu, BOOKE_INTERRUPT_PROGRAM);
+		kvmppc_core_queue_program(vcpu);
 		advance = 0;
 		break;
 
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 8bef0efcdfe1..8d0aaf96d838 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -99,14 +99,7 @@ void kvm_arch_hardware_unsetup(void)
 
 void kvm_arch_check_processor_compat(void *rtn)
 {
-	int r;
-
-	if (strcmp(cur_cpu_spec->platform, "ppc440") == 0)
-		r = 0;
-	else
-		r = -ENOTSUPP;
-
-	*(int *)rtn = r;
+	*(int *)rtn = kvmppc_core_check_processor_compat();
 }
 
 struct kvm *kvm_arch_create_vm(void)
@@ -212,16 +205,14 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 {
-	unsigned int priority = exception_priority[BOOKE_INTERRUPT_DECREMENTER];
-
-	return test_bit(priority, &vcpu->arch.pending_exceptions);
+	return kvmppc_core_pending_dec(vcpu);
 }
 
 static void kvmppc_decrementer_func(unsigned long data)
 {
 	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
 
-	kvmppc_queue_exception(vcpu, BOOKE_INTERRUPT_DECREMENTER);
+	kvmppc_core_queue_dec(vcpu);
 
 	if (waitqueue_active(&vcpu->wq)) {
 		wake_up_interruptible(&vcpu->wq);
@@ -242,96 +233,25 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 	kvmppc_core_destroy_mmu(vcpu);
 }
 
-/* Note: clearing MSR[DE] just means that the debug interrupt will not be
- * delivered *immediately*. Instead, it simply sets the appropriate DBSR bits.
- * If those DBSR bits are still set when MSR[DE] is re-enabled, the interrupt
- * will be delivered as an "imprecise debug event" (which is indicated by
- * DBSR[IDE].
- */
-static void kvmppc_disable_debug_interrupts(void)
-{
-	mtmsr(mfmsr() & ~MSR_DE);
-}
-
-static void kvmppc_restore_host_debug_state(struct kvm_vcpu *vcpu)
-{
-	kvmppc_disable_debug_interrupts();
-
-	mtspr(SPRN_IAC1, vcpu->arch.host_iac[0]);
-	mtspr(SPRN_IAC2, vcpu->arch.host_iac[1]);
-	mtspr(SPRN_IAC3, vcpu->arch.host_iac[2]);
-	mtspr(SPRN_IAC4, vcpu->arch.host_iac[3]);
-	mtspr(SPRN_DBCR1, vcpu->arch.host_dbcr1);
-	mtspr(SPRN_DBCR2, vcpu->arch.host_dbcr2);
-	mtspr(SPRN_DBCR0, vcpu->arch.host_dbcr0);
-	mtmsr(vcpu->arch.host_msr);
-}
-
-static void kvmppc_load_guest_debug_registers(struct kvm_vcpu *vcpu)
-{
-	struct kvm_guest_debug *dbg = &vcpu->guest_debug;
-	u32 dbcr0 = 0;
-
-	vcpu->arch.host_msr = mfmsr();
-	kvmppc_disable_debug_interrupts();
-
-	/* Save host debug register state. */
-	vcpu->arch.host_iac[0] = mfspr(SPRN_IAC1);
-	vcpu->arch.host_iac[1] = mfspr(SPRN_IAC2);
-	vcpu->arch.host_iac[2] = mfspr(SPRN_IAC3);
-	vcpu->arch.host_iac[3] = mfspr(SPRN_IAC4);
-	vcpu->arch.host_dbcr0 = mfspr(SPRN_DBCR0);
-	vcpu->arch.host_dbcr1 = mfspr(SPRN_DBCR1);
-	vcpu->arch.host_dbcr2 = mfspr(SPRN_DBCR2);
-
-	/* set registers up for guest */
-
-	if (dbg->bp[0]) {
-		mtspr(SPRN_IAC1, dbg->bp[0]);
-		dbcr0 |= DBCR0_IAC1 | DBCR0_IDM;
-	}
-	if (dbg->bp[1]) {
-		mtspr(SPRN_IAC2, dbg->bp[1]);
-		dbcr0 |= DBCR0_IAC2 | DBCR0_IDM;
-	}
-	if (dbg->bp[2]) {
-		mtspr(SPRN_IAC3, dbg->bp[2]);
-		dbcr0 |= DBCR0_IAC3 | DBCR0_IDM;
-	}
-	if (dbg->bp[3]) {
-		mtspr(SPRN_IAC4, dbg->bp[3]);
-		dbcr0 |= DBCR0_IAC4 | DBCR0_IDM;
-	}
-
-	mtspr(SPRN_DBCR0, dbcr0);
-	mtspr(SPRN_DBCR1, 0);
-	mtspr(SPRN_DBCR2, 0);
-}
-
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
-	int i;
-
 	if (vcpu->guest_debug.enabled)
-		kvmppc_load_guest_debug_registers(vcpu);
+		kvmppc_core_load_guest_debugstate(vcpu);
 
-	/* Mark every guest entry in the shadow TLB entry modified, so that they
-	 * will all be reloaded on the next vcpu run (instead of being
-	 * demand-faulted). */
-	for (i = 0; i <= tlb_44x_hwater; i++)
-		kvmppc_tlbe_set_modified(vcpu, i);
+	kvmppc_core_vcpu_load(vcpu, cpu);
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	if (vcpu->guest_debug.enabled)
-		kvmppc_restore_host_debug_state(vcpu);
+		kvmppc_core_load_host_debugstate(vcpu);
 
 	/* Don't leave guest TLB entries resident when being de-scheduled. */
 	/* XXX It would be nice to differentiate between heavyweight exit and
 	 * sched_out here, since we could avoid the TLB flush for heavyweight
 	 * exits. */
 	_tlbil_all();
+	kvmppc_core_vcpu_put(vcpu);
 }
 
 int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
@@ -460,7 +380,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		vcpu->arch.dcr_needed = 0;
 	}
 
-	kvmppc_check_and_deliver_interrupts(vcpu);
+	kvmppc_core_deliver_interrupts(vcpu);
 
 	local_irq_disable();
 	kvm_guest_enter();
@@ -478,7 +398,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 
 int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq)
 {
-	kvmppc_queue_exception(vcpu, BOOKE_INTERRUPT_EXTERNAL);
+	kvmppc_core_queue_external(vcpu, irq);
 
 	if (waitqueue_active(&vcpu->wq)) {
 		wake_up_interruptible(&vcpu->wq);

From c381a04313e7c0fb04246b1ff711e0b5726de6c0 Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Wed, 5 Nov 2008 09:36:15 -0600
Subject: [PATCH 042/140] ppc: Create disassemble.h to extract instruction
 fields

This is used in a couple places in KVM, but isn't KVM-specific.

However, this patch doesn't modify other in-kernel emulation code:
- xmon uses a direct copy of ppc_opc.c from binutils
- emulate_instruction() doesn't need it because it can use a series
  of mask tests.

Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Acked-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/powerpc/include/asm/disassemble.h | 80 ++++++++++++++++++++++++++
 arch/powerpc/kvm/emulate.c             | 57 +-----------------
 2 files changed, 81 insertions(+), 56 deletions(-)
 create mode 100644 arch/powerpc/include/asm/disassemble.h

diff --git a/arch/powerpc/include/asm/disassemble.h b/arch/powerpc/include/asm/disassemble.h
new file mode 100644
index 000000000000..9b198d1b3b2b
--- /dev/null
+++ b/arch/powerpc/include/asm/disassemble.h
@@ -0,0 +1,80 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ */
+
+#ifndef __ASM_PPC_DISASSEMBLE_H__
+#define __ASM_PPC_DISASSEMBLE_H__
+
+#include <linux/types.h>
+
+static inline unsigned int get_op(u32 inst)
+{
+	return inst >> 26;
+}
+
+static inline unsigned int get_xop(u32 inst)
+{
+	return (inst >> 1) & 0x3ff;
+}
+
+static inline unsigned int get_sprn(u32 inst)
+{
+	return ((inst >> 16) & 0x1f) | ((inst >> 6) & 0x3e0);
+}
+
+static inline unsigned int get_dcrn(u32 inst)
+{
+	return ((inst >> 16) & 0x1f) | ((inst >> 6) & 0x3e0);
+}
+
+static inline unsigned int get_rt(u32 inst)
+{
+	return (inst >> 21) & 0x1f;
+}
+
+static inline unsigned int get_rs(u32 inst)
+{
+	return (inst >> 21) & 0x1f;
+}
+
+static inline unsigned int get_ra(u32 inst)
+{
+	return (inst >> 16) & 0x1f;
+}
+
+static inline unsigned int get_rb(u32 inst)
+{
+	return (inst >> 11) & 0x1f;
+}
+
+static inline unsigned int get_rc(u32 inst)
+{
+	return inst & 0x1;
+}
+
+static inline unsigned int get_ws(u32 inst)
+{
+	return (inst >> 11) & 0x1f;
+}
+
+static inline unsigned int get_d(u32 inst)
+{
+	return inst & 0xffff;
+}
+
+#endif /* __ASM_PPC_DISASSEMBLE_H__ */
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index c5d2bfcf567a..5fd9cf779be5 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -28,62 +28,7 @@
 #include <asm/time.h>
 #include <asm/byteorder.h>
 #include <asm/kvm_ppc.h>
-
-/* Instruction decoding */
-static inline unsigned int get_op(u32 inst)
-{
-	return inst >> 26;
-}
-
-static inline unsigned int get_xop(u32 inst)
-{
-	return (inst >> 1) & 0x3ff;
-}
-
-static inline unsigned int get_sprn(u32 inst)
-{
-	return ((inst >> 16) & 0x1f) | ((inst >> 6) & 0x3e0);
-}
-
-static inline unsigned int get_dcrn(u32 inst)
-{
-	return ((inst >> 16) & 0x1f) | ((inst >> 6) & 0x3e0);
-}
-
-static inline unsigned int get_rt(u32 inst)
-{
-	return (inst >> 21) & 0x1f;
-}
-
-static inline unsigned int get_rs(u32 inst)
-{
-	return (inst >> 21) & 0x1f;
-}
-
-static inline unsigned int get_ra(u32 inst)
-{
-	return (inst >> 16) & 0x1f;
-}
-
-static inline unsigned int get_rb(u32 inst)
-{
-	return (inst >> 11) & 0x1f;
-}
-
-static inline unsigned int get_rc(u32 inst)
-{
-	return inst & 0x1;
-}
-
-static inline unsigned int get_ws(u32 inst)
-{
-	return (inst >> 11) & 0x1f;
-}
-
-static inline unsigned int get_d(u32 inst)
-{
-	return inst & 0xffff;
-}
+#include <asm/disassemble.h>
 
 static void kvmppc_emulate_dec(struct kvm_vcpu *vcpu)
 {

From 75f74f0dbe086c239b4b0cc5ed75b903ea3e663f Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Wed, 5 Nov 2008 09:36:16 -0600
Subject: [PATCH 043/140] KVM: ppc: refactor instruction emulation into generic
 and core-specific pieces

Cores provide 3 emulation hooks, implemented for example in the new
4xx_emulate.c:
kvmppc_core_emulate_op
kvmppc_core_emulate_mtspr
kvmppc_core_emulate_mfspr

Strictly speaking the last two aren't necessary, but provide for more
informative error reporting ("unknown SPR").

Long term I'd like to have instruction decoding autogenerated from tables of
opcodes, and that way we could aggregate universal, Book E, and core-specific
instructions more easily and without redundant switch statements.

Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/powerpc/include/asm/kvm_ppc.h |  29 +--
 arch/powerpc/kvm/44x_emulate.c     | 335 +++++++++++++++++++++++++++++
 arch/powerpc/kvm/44x_tlb.c         |   4 +-
 arch/powerpc/kvm/44x_tlb.h         |   4 +
 arch/powerpc/kvm/Makefile          |   7 +-
 arch/powerpc/kvm/booke.c           |   1 +
 arch/powerpc/kvm/booke.h           |  39 ++++
 arch/powerpc/kvm/emulate.c         | 272 ++---------------------
 8 files changed, 415 insertions(+), 276 deletions(-)
 create mode 100644 arch/powerpc/kvm/44x_emulate.c
 create mode 100644 arch/powerpc/kvm/booke.h

diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 96d5de90ac5a..aecf95d5fede 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -53,35 +53,13 @@ extern int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
 extern int kvmppc_emulate_instruction(struct kvm_run *run,
                                       struct kvm_vcpu *vcpu);
 extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu);
-extern int kvmppc_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws);
-extern int kvmppc_emul_tlbsx(struct kvm_vcpu *vcpu, u8 rt, u8 ra, u8 rb, u8 rc);
+extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
 
 extern void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn,
                            u64 asid, u32 flags);
 extern void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode);
 extern void kvmppc_mmu_switch_pid(struct kvm_vcpu *vcpu, u32 pid);
 
-/* Helper function for "full" MSR writes. No need to call this if only EE is
- * changing. */
-static inline void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
-{
-	if ((new_msr & MSR_PR) != (vcpu->arch.msr & MSR_PR))
-		kvmppc_mmu_priv_switch(vcpu, new_msr & MSR_PR);
-
-	vcpu->arch.msr = new_msr;
-
-	if (vcpu->arch.msr & MSR_WE)
-		kvm_vcpu_block(vcpu);
-}
-
-static inline void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 new_pid)
-{
-	if (vcpu->arch.pid != new_pid) {
-		vcpu->arch.pid = new_pid;
-		vcpu->arch.swap_pid = 1;
-	}
-}
-
 /* Core-specific hooks */
 
 extern int kvmppc_core_check_processor_compat(void);
@@ -99,6 +77,11 @@ extern void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu);
 extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
                                        struct kvm_interrupt *irq);
 
+extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                                  unsigned int op, int *advance);
+extern int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs);
+extern int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt);
+
 extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu);
 
 #endif /* __POWERPC_KVM_PPC_H__ */
diff --git a/arch/powerpc/kvm/44x_emulate.c b/arch/powerpc/kvm/44x_emulate.c
new file mode 100644
index 000000000000..a634c0c4fa7e
--- /dev/null
+++ b/arch/powerpc/kvm/44x_emulate.c
@@ -0,0 +1,335 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ */
+
+#include <asm/kvm_ppc.h>
+#include <asm/dcr.h>
+#include <asm/dcr-regs.h>
+#include <asm/disassemble.h>
+
+#include "booke.h"
+#include "44x_tlb.h"
+
+#define OP_RFI      19
+
+#define XOP_RFI     50
+#define XOP_MFMSR   83
+#define XOP_WRTEE   131
+#define XOP_MTMSR   146
+#define XOP_WRTEEI  163
+#define XOP_MFDCR   323
+#define XOP_MTDCR   451
+#define XOP_TLBSX   914
+#define XOP_ICCCI   966
+#define XOP_TLBWE   978
+
+static inline void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 new_pid)
+{
+	if (vcpu->arch.pid != new_pid) {
+		vcpu->arch.pid = new_pid;
+		vcpu->arch.swap_pid = 1;
+	}
+}
+
+static void kvmppc_emul_rfi(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.pc = vcpu->arch.srr0;
+	kvmppc_set_msr(vcpu, vcpu->arch.srr1);
+}
+
+int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
+                           unsigned int inst, int *advance)
+{
+	int emulated = EMULATE_DONE;
+	int dcrn;
+	int ra;
+	int rb;
+	int rc;
+	int rs;
+	int rt;
+	int ws;
+
+	switch (get_op(inst)) {
+
+	case OP_RFI:
+		switch (get_xop(inst)) {
+		case XOP_RFI:
+			kvmppc_emul_rfi(vcpu);
+			*advance = 0;
+			break;
+
+		default:
+			emulated = EMULATE_FAIL;
+			break;
+		}
+		break;
+
+	case 31:
+		switch (get_xop(inst)) {
+
+		case XOP_MFMSR:
+			rt = get_rt(inst);
+			vcpu->arch.gpr[rt] = vcpu->arch.msr;
+			break;
+
+		case XOP_MTMSR:
+			rs = get_rs(inst);
+			kvmppc_set_msr(vcpu, vcpu->arch.gpr[rs]);
+			break;
+
+		case XOP_WRTEE:
+			rs = get_rs(inst);
+			vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE)
+							 | (vcpu->arch.gpr[rs] & MSR_EE);
+			break;
+
+		case XOP_WRTEEI:
+			vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE)
+							 | (inst & MSR_EE);
+			break;
+
+		case XOP_MFDCR:
+			dcrn = get_dcrn(inst);
+			rt = get_rt(inst);
+
+			/* The guest may access CPR0 registers to determine the timebase
+			 * frequency, and it must know the real host frequency because it
+			 * can directly access the timebase registers.
+			 *
+			 * It would be possible to emulate those accesses in userspace,
+			 * but userspace can really only figure out the end frequency.
+			 * We could decompose that into the factors that compute it, but
+			 * that's tricky math, and it's easier to just report the real
+			 * CPR0 values.
+			 */
+			switch (dcrn) {
+			case DCRN_CPR0_CONFIG_ADDR:
+				vcpu->arch.gpr[rt] = vcpu->arch.cpr0_cfgaddr;
+				break;
+			case DCRN_CPR0_CONFIG_DATA:
+				local_irq_disable();
+				mtdcr(DCRN_CPR0_CONFIG_ADDR,
+					  vcpu->arch.cpr0_cfgaddr);
+				vcpu->arch.gpr[rt] = mfdcr(DCRN_CPR0_CONFIG_DATA);
+				local_irq_enable();
+				break;
+			default:
+				run->dcr.dcrn = dcrn;
+				run->dcr.data =  0;
+				run->dcr.is_write = 0;
+				vcpu->arch.io_gpr = rt;
+				vcpu->arch.dcr_needed = 1;
+				emulated = EMULATE_DO_DCR;
+			}
+
+			break;
+
+		case XOP_MTDCR:
+			dcrn = get_dcrn(inst);
+			rs = get_rs(inst);
+
+			/* emulate some access in kernel */
+			switch (dcrn) {
+			case DCRN_CPR0_CONFIG_ADDR:
+				vcpu->arch.cpr0_cfgaddr = vcpu->arch.gpr[rs];
+				break;
+			default:
+				run->dcr.dcrn = dcrn;
+				run->dcr.data = vcpu->arch.gpr[rs];
+				run->dcr.is_write = 1;
+				vcpu->arch.dcr_needed = 1;
+				emulated = EMULATE_DO_DCR;
+			}
+
+			break;
+
+		case XOP_TLBWE:
+			ra = get_ra(inst);
+			rs = get_rs(inst);
+			ws = get_ws(inst);
+			emulated = kvmppc_44x_emul_tlbwe(vcpu, ra, rs, ws);
+			break;
+
+		case XOP_TLBSX:
+			rt = get_rt(inst);
+			ra = get_ra(inst);
+			rb = get_rb(inst);
+			rc = get_rc(inst);
+			emulated = kvmppc_44x_emul_tlbsx(vcpu, rt, ra, rb, rc);
+			break;
+
+		case XOP_ICCCI:
+			break;
+
+		default:
+			emulated = EMULATE_FAIL;
+		}
+
+		break;
+
+	default:
+		emulated = EMULATE_FAIL;
+	}
+
+	return emulated;
+}
+
+int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
+{
+	switch (sprn) {
+	case SPRN_MMUCR:
+		vcpu->arch.mmucr = vcpu->arch.gpr[rs]; break;
+	case SPRN_PID:
+		kvmppc_set_pid(vcpu, vcpu->arch.gpr[rs]); break;
+	case SPRN_CCR0:
+		vcpu->arch.ccr0 = vcpu->arch.gpr[rs]; break;
+	case SPRN_CCR1:
+		vcpu->arch.ccr1 = vcpu->arch.gpr[rs]; break;
+	case SPRN_DEAR:
+		vcpu->arch.dear = vcpu->arch.gpr[rs]; break;
+	case SPRN_ESR:
+		vcpu->arch.esr = vcpu->arch.gpr[rs]; break;
+	case SPRN_DBCR0:
+		vcpu->arch.dbcr0 = vcpu->arch.gpr[rs]; break;
+	case SPRN_DBCR1:
+		vcpu->arch.dbcr1 = vcpu->arch.gpr[rs]; break;
+	case SPRN_TSR:
+		vcpu->arch.tsr &= ~vcpu->arch.gpr[rs]; break;
+	case SPRN_TCR:
+		vcpu->arch.tcr = vcpu->arch.gpr[rs];
+		kvmppc_emulate_dec(vcpu);
+		break;
+
+	/* Note: SPRG4-7 are user-readable. These values are
+	 * loaded into the real SPRGs when resuming the
+	 * guest. */
+	case SPRN_SPRG4:
+		vcpu->arch.sprg4 = vcpu->arch.gpr[rs]; break;
+	case SPRN_SPRG5:
+		vcpu->arch.sprg5 = vcpu->arch.gpr[rs]; break;
+	case SPRN_SPRG6:
+		vcpu->arch.sprg6 = vcpu->arch.gpr[rs]; break;
+	case SPRN_SPRG7:
+		vcpu->arch.sprg7 = vcpu->arch.gpr[rs]; break;
+
+	case SPRN_IVPR:
+		vcpu->arch.ivpr = vcpu->arch.gpr[rs]; break;
+	case SPRN_IVOR0:
+		vcpu->arch.ivor[0] = vcpu->arch.gpr[rs]; break;
+	case SPRN_IVOR1:
+		vcpu->arch.ivor[1] = vcpu->arch.gpr[rs]; break;
+	case SPRN_IVOR2:
+		vcpu->arch.ivor[2] = vcpu->arch.gpr[rs]; break;
+	case SPRN_IVOR3:
+		vcpu->arch.ivor[3] = vcpu->arch.gpr[rs]; break;
+	case SPRN_IVOR4:
+		vcpu->arch.ivor[4] = vcpu->arch.gpr[rs]; break;
+	case SPRN_IVOR5:
+		vcpu->arch.ivor[5] = vcpu->arch.gpr[rs]; break;
+	case SPRN_IVOR6:
+		vcpu->arch.ivor[6] = vcpu->arch.gpr[rs]; break;
+	case SPRN_IVOR7:
+		vcpu->arch.ivor[7] = vcpu->arch.gpr[rs]; break;
+	case SPRN_IVOR8:
+		vcpu->arch.ivor[8] = vcpu->arch.gpr[rs]; break;
+	case SPRN_IVOR9:
+		vcpu->arch.ivor[9] = vcpu->arch.gpr[rs]; break;
+	case SPRN_IVOR10:
+		vcpu->arch.ivor[10] = vcpu->arch.gpr[rs]; break;
+	case SPRN_IVOR11:
+		vcpu->arch.ivor[11] = vcpu->arch.gpr[rs]; break;
+	case SPRN_IVOR12:
+		vcpu->arch.ivor[12] = vcpu->arch.gpr[rs]; break;
+	case SPRN_IVOR13:
+		vcpu->arch.ivor[13] = vcpu->arch.gpr[rs]; break;
+	case SPRN_IVOR14:
+		vcpu->arch.ivor[14] = vcpu->arch.gpr[rs]; break;
+	case SPRN_IVOR15:
+		vcpu->arch.ivor[15] = vcpu->arch.gpr[rs]; break;
+
+	default:
+		return EMULATE_FAIL;
+	}
+
+	return EMULATE_DONE;
+}
+
+int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
+{
+	switch (sprn) {
+	/* 440 */
+	case SPRN_MMUCR:
+		vcpu->arch.gpr[rt] = vcpu->arch.mmucr; break;
+	case SPRN_CCR0:
+		vcpu->arch.gpr[rt] = vcpu->arch.ccr0; break;
+	case SPRN_CCR1:
+		vcpu->arch.gpr[rt] = vcpu->arch.ccr1; break;
+
+	/* Book E */
+	case SPRN_PID:
+		vcpu->arch.gpr[rt] = vcpu->arch.pid; break;
+	case SPRN_IVPR:
+		vcpu->arch.gpr[rt] = vcpu->arch.ivpr; break;
+	case SPRN_DEAR:
+		vcpu->arch.gpr[rt] = vcpu->arch.dear; break;
+	case SPRN_ESR:
+		vcpu->arch.gpr[rt] = vcpu->arch.esr; break;
+	case SPRN_DBCR0:
+		vcpu->arch.gpr[rt] = vcpu->arch.dbcr0; break;
+	case SPRN_DBCR1:
+		vcpu->arch.gpr[rt] = vcpu->arch.dbcr1; break;
+
+	case SPRN_IVOR0:
+		vcpu->arch.gpr[rt] = vcpu->arch.ivor[0]; break;
+	case SPRN_IVOR1:
+		vcpu->arch.gpr[rt] = vcpu->arch.ivor[1]; break;
+	case SPRN_IVOR2:
+		vcpu->arch.gpr[rt] = vcpu->arch.ivor[2]; break;
+	case SPRN_IVOR3:
+		vcpu->arch.gpr[rt] = vcpu->arch.ivor[3]; break;
+	case SPRN_IVOR4:
+		vcpu->arch.gpr[rt] = vcpu->arch.ivor[4]; break;
+	case SPRN_IVOR5:
+		vcpu->arch.gpr[rt] = vcpu->arch.ivor[5]; break;
+	case SPRN_IVOR6:
+		vcpu->arch.gpr[rt] = vcpu->arch.ivor[6]; break;
+	case SPRN_IVOR7:
+		vcpu->arch.gpr[rt] = vcpu->arch.ivor[7]; break;
+	case SPRN_IVOR8:
+		vcpu->arch.gpr[rt] = vcpu->arch.ivor[8]; break;
+	case SPRN_IVOR9:
+		vcpu->arch.gpr[rt] = vcpu->arch.ivor[9]; break;
+	case SPRN_IVOR10:
+		vcpu->arch.gpr[rt] = vcpu->arch.ivor[10]; break;
+	case SPRN_IVOR11:
+		vcpu->arch.gpr[rt] = vcpu->arch.ivor[11]; break;
+	case SPRN_IVOR12:
+		vcpu->arch.gpr[rt] = vcpu->arch.ivor[12]; break;
+	case SPRN_IVOR13:
+		vcpu->arch.gpr[rt] = vcpu->arch.ivor[13]; break;
+	case SPRN_IVOR14:
+		vcpu->arch.gpr[rt] = vcpu->arch.ivor[14]; break;
+	case SPRN_IVOR15:
+		vcpu->arch.gpr[rt] = vcpu->arch.ivor[15]; break;
+	default:
+		return EMULATE_FAIL;
+	}
+
+	return EMULATE_DONE;
+}
+
diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index 5152fe5b2a9b..bb6da134cadb 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -301,7 +301,7 @@ static int tlbe_is_host_safe(const struct kvm_vcpu *vcpu,
 	return 1;
 }
 
-int kvmppc_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws)
+int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws)
 {
 	u64 eaddr;
 	u64 raddr;
@@ -363,7 +363,7 @@ int kvmppc_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws)
 	return EMULATE_DONE;
 }
 
-int kvmppc_emul_tlbsx(struct kvm_vcpu *vcpu, u8 rt, u8 ra, u8 rb, u8 rc)
+int kvmppc_44x_emul_tlbsx(struct kvm_vcpu *vcpu, u8 rt, u8 ra, u8 rb, u8 rc)
 {
 	u32 ea;
 	int index;
diff --git a/arch/powerpc/kvm/44x_tlb.h b/arch/powerpc/kvm/44x_tlb.h
index 357d79ae5493..b1029af3de20 100644
--- a/arch/powerpc/kvm/44x_tlb.h
+++ b/arch/powerpc/kvm/44x_tlb.h
@@ -31,6 +31,10 @@ extern struct kvmppc_44x_tlbe *kvmppc_44x_itlb_search(struct kvm_vcpu *vcpu,
                                                       gva_t eaddr);
 extern void kvmppc_tlbe_set_modified(struct kvm_vcpu *vcpu, unsigned int i);
 
+extern int kvmppc_44x_emul_tlbsx(struct kvm_vcpu *vcpu, u8 rt, u8 ra, u8 rb,
+                                 u8 rc);
+extern int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws);
+
 /* TLB helper functions */
 static inline unsigned int get_tlb_size(const struct kvmppc_44x_tlbe *tlbe)
 {
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index f5e33756f318..f045fad0f4f1 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -13,5 +13,10 @@ obj-$(CONFIG_KVM) += kvm.o
 
 AFLAGS_booke_interrupts.o := -I$(obj)
 
-kvm-440-objs := booke.o booke_interrupts.o 44x.o 44x_tlb.o
+kvm-440-objs := \
+	booke.o \
+	booke_interrupts.o \
+	44x.o \
+	44x_tlb.o \
+	44x_emulate.o
 obj-$(CONFIG_KVM_440) += kvm-440.o
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 138014acf3cf..ea630095e280 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -29,6 +29,7 @@
 #include <asm/kvm_ppc.h>
 #include <asm/cacheflush.h>
 
+#include "booke.h"
 #include "44x_tlb.h"
 
 unsigned long kvmppc_booke_handlers;
diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h
new file mode 100644
index 000000000000..f694a4b2dafa
--- /dev/null
+++ b/arch/powerpc/kvm/booke.h
@@ -0,0 +1,39 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ */
+
+#ifndef __KVM_BOOKE_H__
+#define __KVM_BOOKE_H__
+
+#include <linux/types.h>
+#include <linux/kvm_host.h>
+
+/* Helper function for "full" MSR writes. No need to call this if only EE is
+ * changing. */
+static inline void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
+{
+	if ((new_msr & MSR_PR) != (vcpu->arch.msr & MSR_PR))
+		kvmppc_mmu_priv_switch(vcpu, new_msr & MSR_PR);
+
+	vcpu->arch.msr = new_msr;
+
+	if (vcpu->arch.msr & MSR_WE)
+		kvm_vcpu_block(vcpu);
+}
+
+#endif /* __KVM_BOOKE_H__ */
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index 5fd9cf779be5..30a49f8c49b2 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -23,14 +23,13 @@
 #include <linux/string.h>
 #include <linux/kvm_host.h>
 
-#include <asm/dcr.h>
-#include <asm/dcr-regs.h>
+#include <asm/reg.h>
 #include <asm/time.h>
 #include <asm/byteorder.h>
 #include <asm/kvm_ppc.h>
 #include <asm/disassemble.h>
 
-static void kvmppc_emulate_dec(struct kvm_vcpu *vcpu)
+void kvmppc_emulate_dec(struct kvm_vcpu *vcpu)
 {
 	if (vcpu->arch.tcr & TCR_DIE) {
 		/* The decrementer ticks at the same rate as the timebase, so
@@ -46,12 +45,6 @@ static void kvmppc_emulate_dec(struct kvm_vcpu *vcpu)
 	}
 }
 
-static void kvmppc_emul_rfi(struct kvm_vcpu *vcpu)
-{
-	vcpu->arch.pc = vcpu->arch.srr0;
-	kvmppc_set_msr(vcpu, vcpu->arch.srr1);
-}
-
 /* XXX to do:
  * lhax
  * lhaux
@@ -66,18 +59,17 @@ static void kvmppc_emul_rfi(struct kvm_vcpu *vcpu)
  *
  * XXX is_bigendian should depend on MMU mapping or MSR[LE]
  */
+/* XXX Should probably auto-generate instruction decoding for a particular core
+ * from opcode tables in the future. */
 int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 {
 	u32 inst = vcpu->arch.last_inst;
 	u32 ea;
 	int ra;
 	int rb;
-	int rc;
 	int rs;
 	int rt;
-	int ws;
 	int sprn;
-	int dcrn;
 	enum emulation_result emulated = EMULATE_DONE;
 	int advance = 1;
 
@@ -88,19 +80,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 		advance = 0;
 		break;
 
-	case 19:
-		switch (get_xop(inst)) {
-		case 50:                                        /* rfi */
-			kvmppc_emul_rfi(vcpu);
-			advance = 0;
-			break;
-
-		default:
-			emulated = EMULATE_FAIL;
-			break;
-		}
-		break;
-
 	case 31:
 		switch (get_xop(inst)) {
 
@@ -109,27 +88,11 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1);
 			break;
 
-		case 83:                                        /* mfmsr */
-			rt = get_rt(inst);
-			vcpu->arch.gpr[rt] = vcpu->arch.msr;
-			break;
-
 		case 87:                                        /* lbzx */
 			rt = get_rt(inst);
 			emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1);
 			break;
 
-		case 131:                                       /* wrtee */
-			rs = get_rs(inst);
-			vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE)
-			                 | (vcpu->arch.gpr[rs] & MSR_EE);
-			break;
-
-		case 146:                                       /* mtmsr */
-			rs = get_rs(inst);
-			kvmppc_set_msr(vcpu, vcpu->arch.gpr[rs]);
-			break;
-
 		case 151:                                       /* stwx */
 			rs = get_rs(inst);
 			emulated = kvmppc_handle_store(run, vcpu,
@@ -137,11 +100,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			                               4, 1);
 			break;
 
-		case 163:                                       /* wrteei */
-			vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE)
-			                 | (inst & MSR_EE);
-			break;
-
 		case 215:                                       /* stbx */
 			rs = get_rs(inst);
 			emulated = kvmppc_handle_store(run, vcpu,
@@ -182,42 +140,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			vcpu->arch.gpr[ra] = ea;
 			break;
 
-		case 323:                                       /* mfdcr */
-			dcrn = get_dcrn(inst);
-			rt = get_rt(inst);
-
-			/* The guest may access CPR0 registers to determine the timebase
-			 * frequency, and it must know the real host frequency because it
-			 * can directly access the timebase registers.
-			 *
-			 * It would be possible to emulate those accesses in userspace,
-			 * but userspace can really only figure out the end frequency.
-			 * We could decompose that into the factors that compute it, but
-			 * that's tricky math, and it's easier to just report the real
-			 * CPR0 values.
-			 */
-			switch (dcrn) {
-			case DCRN_CPR0_CONFIG_ADDR:
-				vcpu->arch.gpr[rt] = vcpu->arch.cpr0_cfgaddr;
-				break;
-			case DCRN_CPR0_CONFIG_DATA:
-				local_irq_disable();
-				mtdcr(DCRN_CPR0_CONFIG_ADDR,
-				      vcpu->arch.cpr0_cfgaddr);
-				vcpu->arch.gpr[rt] = mfdcr(DCRN_CPR0_CONFIG_DATA);
-				local_irq_enable();
-				break;
-			default:
-				run->dcr.dcrn = dcrn;
-				run->dcr.data =  0;
-				run->dcr.is_write = 0;
-				vcpu->arch.io_gpr = rt;
-				vcpu->arch.dcr_needed = 1;
-				emulated = EMULATE_DO_DCR;
-			}
-
-			break;
-
 		case 339:                                       /* mfspr */
 			sprn = get_sprn(inst);
 			rt = get_rt(inst);
@@ -227,26 +149,8 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 				vcpu->arch.gpr[rt] = vcpu->arch.srr0; break;
 			case SPRN_SRR1:
 				vcpu->arch.gpr[rt] = vcpu->arch.srr1; break;
-			case SPRN_MMUCR:
-				vcpu->arch.gpr[rt] = vcpu->arch.mmucr; break;
-			case SPRN_PID:
-				vcpu->arch.gpr[rt] = vcpu->arch.pid; break;
-			case SPRN_IVPR:
-				vcpu->arch.gpr[rt] = vcpu->arch.ivpr; break;
-			case SPRN_CCR0:
-				vcpu->arch.gpr[rt] = vcpu->arch.ccr0; break;
-			case SPRN_CCR1:
-				vcpu->arch.gpr[rt] = vcpu->arch.ccr1; break;
 			case SPRN_PVR:
 				vcpu->arch.gpr[rt] = vcpu->arch.pvr; break;
-			case SPRN_DEAR:
-				vcpu->arch.gpr[rt] = vcpu->arch.dear; break;
-			case SPRN_ESR:
-				vcpu->arch.gpr[rt] = vcpu->arch.esr; break;
-			case SPRN_DBCR0:
-				vcpu->arch.gpr[rt] = vcpu->arch.dbcr0; break;
-			case SPRN_DBCR1:
-				vcpu->arch.gpr[rt] = vcpu->arch.dbcr1; break;
 
 			/* Note: mftb and TBRL/TBWL are user-accessible, so
 			 * the guest can always access the real TB anyways.
@@ -267,42 +171,12 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			/* Note: SPRG4-7 are user-readable, so we don't get
 			 * a trap. */
 
-			case SPRN_IVOR0:
-				vcpu->arch.gpr[rt] = vcpu->arch.ivor[0]; break;
-			case SPRN_IVOR1:
-				vcpu->arch.gpr[rt] = vcpu->arch.ivor[1]; break;
-			case SPRN_IVOR2:
-				vcpu->arch.gpr[rt] = vcpu->arch.ivor[2]; break;
-			case SPRN_IVOR3:
-				vcpu->arch.gpr[rt] = vcpu->arch.ivor[3]; break;
-			case SPRN_IVOR4:
-				vcpu->arch.gpr[rt] = vcpu->arch.ivor[4]; break;
-			case SPRN_IVOR5:
-				vcpu->arch.gpr[rt] = vcpu->arch.ivor[5]; break;
-			case SPRN_IVOR6:
-				vcpu->arch.gpr[rt] = vcpu->arch.ivor[6]; break;
-			case SPRN_IVOR7:
-				vcpu->arch.gpr[rt] = vcpu->arch.ivor[7]; break;
-			case SPRN_IVOR8:
-				vcpu->arch.gpr[rt] = vcpu->arch.ivor[8]; break;
-			case SPRN_IVOR9:
-				vcpu->arch.gpr[rt] = vcpu->arch.ivor[9]; break;
-			case SPRN_IVOR10:
-				vcpu->arch.gpr[rt] = vcpu->arch.ivor[10]; break;
-			case SPRN_IVOR11:
-				vcpu->arch.gpr[rt] = vcpu->arch.ivor[11]; break;
-			case SPRN_IVOR12:
-				vcpu->arch.gpr[rt] = vcpu->arch.ivor[12]; break;
-			case SPRN_IVOR13:
-				vcpu->arch.gpr[rt] = vcpu->arch.ivor[13]; break;
-			case SPRN_IVOR14:
-				vcpu->arch.gpr[rt] = vcpu->arch.ivor[14]; break;
-			case SPRN_IVOR15:
-				vcpu->arch.gpr[rt] = vcpu->arch.ivor[15]; break;
-
 			default:
-				printk("mfspr: unknown spr %x\n", sprn);
-				vcpu->arch.gpr[rt] = 0;
+				emulated = kvmppc_core_emulate_mfspr(vcpu, sprn, rt);
+				if (emulated == EMULATE_FAIL) {
+					printk("mfspr: unknown spr %x\n", sprn);
+					vcpu->arch.gpr[rt] = 0;
+				}
 				break;
 			}
 			break;
@@ -332,25 +206,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			vcpu->arch.gpr[ra] = ea;
 			break;
 
-		case 451:                                       /* mtdcr */
-			dcrn = get_dcrn(inst);
-			rs = get_rs(inst);
-
-			/* emulate some access in kernel */
-			switch (dcrn) {
-			case DCRN_CPR0_CONFIG_ADDR:
-				vcpu->arch.cpr0_cfgaddr = vcpu->arch.gpr[rs];
-				break;
-			default:
-				run->dcr.dcrn = dcrn;
-				run->dcr.data = vcpu->arch.gpr[rs];
-				run->dcr.is_write = 1;
-				vcpu->arch.dcr_needed = 1;
-				emulated = EMULATE_DO_DCR;
-			}
-
-			break;
-
 		case 467:                                       /* mtspr */
 			sprn = get_sprn(inst);
 			rs = get_rs(inst);
@@ -359,22 +214,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 				vcpu->arch.srr0 = vcpu->arch.gpr[rs]; break;
 			case SPRN_SRR1:
 				vcpu->arch.srr1 = vcpu->arch.gpr[rs]; break;
-			case SPRN_MMUCR:
-				vcpu->arch.mmucr = vcpu->arch.gpr[rs]; break;
-			case SPRN_PID:
-				kvmppc_set_pid(vcpu, vcpu->arch.gpr[rs]); break;
-			case SPRN_CCR0:
-				vcpu->arch.ccr0 = vcpu->arch.gpr[rs]; break;
-			case SPRN_CCR1:
-				vcpu->arch.ccr1 = vcpu->arch.gpr[rs]; break;
-			case SPRN_DEAR:
-				vcpu->arch.dear = vcpu->arch.gpr[rs]; break;
-			case SPRN_ESR:
-				vcpu->arch.esr = vcpu->arch.gpr[rs]; break;
-			case SPRN_DBCR0:
-				vcpu->arch.dbcr0 = vcpu->arch.gpr[rs]; break;
-			case SPRN_DBCR1:
-				vcpu->arch.dbcr1 = vcpu->arch.gpr[rs]; break;
 
 			/* XXX We need to context-switch the timebase for
 			 * watchdog and FIT. */
@@ -386,14 +225,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 				kvmppc_emulate_dec(vcpu);
 				break;
 
-			case SPRN_TSR:
-				vcpu->arch.tsr &= ~vcpu->arch.gpr[rs]; break;
-
-			case SPRN_TCR:
-				vcpu->arch.tcr = vcpu->arch.gpr[rs];
-				kvmppc_emulate_dec(vcpu);
-				break;
-
 			case SPRN_SPRG0:
 				vcpu->arch.sprg0 = vcpu->arch.gpr[rs]; break;
 			case SPRN_SPRG1:
@@ -403,56 +234,10 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			case SPRN_SPRG3:
 				vcpu->arch.sprg3 = vcpu->arch.gpr[rs]; break;
 
-			/* Note: SPRG4-7 are user-readable. These values are
-			 * loaded into the real SPRGs when resuming the
-			 * guest. */
-			case SPRN_SPRG4:
-				vcpu->arch.sprg4 = vcpu->arch.gpr[rs]; break;
-			case SPRN_SPRG5:
-				vcpu->arch.sprg5 = vcpu->arch.gpr[rs]; break;
-			case SPRN_SPRG6:
-				vcpu->arch.sprg6 = vcpu->arch.gpr[rs]; break;
-			case SPRN_SPRG7:
-				vcpu->arch.sprg7 = vcpu->arch.gpr[rs]; break;
-
-			case SPRN_IVPR:
-				vcpu->arch.ivpr = vcpu->arch.gpr[rs]; break;
-			case SPRN_IVOR0:
-				vcpu->arch.ivor[0] = vcpu->arch.gpr[rs]; break;
-			case SPRN_IVOR1:
-				vcpu->arch.ivor[1] = vcpu->arch.gpr[rs]; break;
-			case SPRN_IVOR2:
-				vcpu->arch.ivor[2] = vcpu->arch.gpr[rs]; break;
-			case SPRN_IVOR3:
-				vcpu->arch.ivor[3] = vcpu->arch.gpr[rs]; break;
-			case SPRN_IVOR4:
-				vcpu->arch.ivor[4] = vcpu->arch.gpr[rs]; break;
-			case SPRN_IVOR5:
-				vcpu->arch.ivor[5] = vcpu->arch.gpr[rs]; break;
-			case SPRN_IVOR6:
-				vcpu->arch.ivor[6] = vcpu->arch.gpr[rs]; break;
-			case SPRN_IVOR7:
-				vcpu->arch.ivor[7] = vcpu->arch.gpr[rs]; break;
-			case SPRN_IVOR8:
-				vcpu->arch.ivor[8] = vcpu->arch.gpr[rs]; break;
-			case SPRN_IVOR9:
-				vcpu->arch.ivor[9] = vcpu->arch.gpr[rs]; break;
-			case SPRN_IVOR10:
-				vcpu->arch.ivor[10] = vcpu->arch.gpr[rs]; break;
-			case SPRN_IVOR11:
-				vcpu->arch.ivor[11] = vcpu->arch.gpr[rs]; break;
-			case SPRN_IVOR12:
-				vcpu->arch.ivor[12] = vcpu->arch.gpr[rs]; break;
-			case SPRN_IVOR13:
-				vcpu->arch.ivor[13] = vcpu->arch.gpr[rs]; break;
-			case SPRN_IVOR14:
-				vcpu->arch.ivor[14] = vcpu->arch.gpr[rs]; break;
-			case SPRN_IVOR15:
-				vcpu->arch.ivor[15] = vcpu->arch.gpr[rs]; break;
-
 			default:
-				printk("mtspr: unknown spr %x\n", sprn);
-				emulated = EMULATE_FAIL;
+				emulated = kvmppc_core_emulate_mtspr(vcpu, sprn, rs);
+				if (emulated == EMULATE_FAIL)
+					printk("mtspr: unknown spr %x\n", sprn);
 				break;
 			}
 			break;
@@ -483,21 +268,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			                               4, 0);
 			break;
 
-		case 978:                                       /* tlbwe */
-			ra = get_ra(inst);
-			rs = get_rs(inst);
-			ws = get_ws(inst);
-			emulated = kvmppc_emul_tlbwe(vcpu, ra, rs, ws);
-			break;
-
-		case 914:                                       /* tlbsx */
-			rt = get_rt(inst);
-			ra = get_ra(inst);
-			rb = get_rb(inst);
-			rc = get_rc(inst);
-			emulated = kvmppc_emul_tlbsx(vcpu, rt, ra, rb, rc);
-			break;
-
 		case 790:                                       /* lhbrx */
 			rt = get_rt(inst);
 			emulated = kvmppc_handle_load(run, vcpu, rt, 2, 0);
@@ -513,14 +283,9 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			                               2, 0);
 			break;
 
-		case 966:                                       /* iccci */
-			break;
-
 		default:
-			printk("unknown: op %d xop %d\n", get_op(inst),
-				get_xop(inst));
+			/* Attempt core-specific emulation below. */
 			emulated = EMULATE_FAIL;
-			break;
 		}
 		break;
 
@@ -603,9 +368,16 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 		break;
 
 	default:
-		printk("unknown op %d\n", get_op(inst));
 		emulated = EMULATE_FAIL;
-		break;
+	}
+
+	if (emulated == EMULATE_FAIL) {
+		emulated = kvmppc_core_emulate_op(run, vcpu, inst, &advance);
+		if (emulated == EMULATE_FAIL) {
+			advance = 0;
+			printk(KERN_ERR "Couldn't emulate instruction 0x%08x "
+			       "(op %d xop %d)\n", inst, get_op(inst), get_xop(inst));
+		}
 	}
 
 	KVMTRACE_3D(PPC_INSTR, vcpu, inst, vcpu->arch.pc, emulated, entryexit);

From 5cbb5106f50b4515815cd32cf944958c0d4da83f Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Wed, 5 Nov 2008 09:36:17 -0600
Subject: [PATCH 044/140] KVM: ppc: Move the last bits of 44x code out of
 booke.c

Needed to port to other Book E processors.

Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/powerpc/include/asm/kvm_ppc.h |  3 ++
 arch/powerpc/kvm/44x.c             | 53 ++++++++++++++++++++++++++++++
 arch/powerpc/kvm/booke.c           | 46 ++------------------------
 3 files changed, 58 insertions(+), 44 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index aecf95d5fede..d59332575b4d 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -62,7 +62,10 @@ extern void kvmppc_mmu_switch_pid(struct kvm_vcpu *vcpu, u32 pid);
 
 /* Core-specific hooks */
 
+extern int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu);
 extern int kvmppc_core_check_processor_compat(void);
+extern int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu,
+                                      struct kvm_translation *tr);
 
 extern void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
 extern void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu);
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index fcf8c7d0af45..f5d7028eeb09 100644
--- a/arch/powerpc/kvm/44x.c
+++ b/arch/powerpc/kvm/44x.c
@@ -121,3 +121,56 @@ int kvmppc_core_check_processor_compat(void)
 
 	return r;
 }
+
+int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_44x_tlbe *tlbe = &vcpu->arch.guest_tlb[0];
+
+	tlbe->tid = 0;
+	tlbe->word0 = PPC44x_TLB_16M | PPC44x_TLB_VALID;
+	tlbe->word1 = 0;
+	tlbe->word2 = PPC44x_TLB_SX | PPC44x_TLB_SW | PPC44x_TLB_SR;
+
+	tlbe++;
+	tlbe->tid = 0;
+	tlbe->word0 = 0xef600000 | PPC44x_TLB_4K | PPC44x_TLB_VALID;
+	tlbe->word1 = 0xef600000;
+	tlbe->word2 = PPC44x_TLB_SX | PPC44x_TLB_SW | PPC44x_TLB_SR
+	              | PPC44x_TLB_I | PPC44x_TLB_G;
+
+	/* Since the guest can directly access the timebase, it must know the
+	 * real timebase frequency. Accordingly, it must see the state of
+	 * CCR1[TCS]. */
+	vcpu->arch.ccr1 = mfspr(SPRN_CCR1);
+
+	return 0;
+}
+
+/* 'linear_address' is actually an encoding of AS|PID|EADDR . */
+int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu,
+                               struct kvm_translation *tr)
+{
+	struct kvmppc_44x_tlbe *gtlbe;
+	int index;
+	gva_t eaddr;
+	u8 pid;
+	u8 as;
+
+	eaddr = tr->linear_address;
+	pid = (tr->linear_address >> 32) & 0xff;
+	as = (tr->linear_address >> 40) & 0x1;
+
+	index = kvmppc_44x_tlb_index(vcpu, eaddr, pid, as);
+	if (index == -1) {
+		tr->valid = 0;
+		return 0;
+	}
+
+	gtlbe = &vcpu->arch.guest_tlb[index];
+
+	tr->physical_address = tlb_xlate(gtlbe, eaddr);
+	/* XXX what does "writeable" and "usermode" even mean? */
+	tr->valid = 1;
+
+	return 0;
+}
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index ea630095e280..c619d1b912c5 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -479,20 +479,6 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 /* Initial guest state: 16MB mapping 0 -> 0, PC = 0, MSR = 0, R1 = 16MB */
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
-	struct kvmppc_44x_tlbe *tlbe = &vcpu->arch.guest_tlb[0];
-
-	tlbe->tid = 0;
-	tlbe->word0 = PPC44x_TLB_16M | PPC44x_TLB_VALID;
-	tlbe->word1 = 0;
-	tlbe->word2 = PPC44x_TLB_SX | PPC44x_TLB_SW | PPC44x_TLB_SR;
-
-	tlbe++;
-	tlbe->tid = 0;
-	tlbe->word0 = 0xef600000 | PPC44x_TLB_4K | PPC44x_TLB_VALID;
-	tlbe->word1 = 0xef600000;
-	tlbe->word2 = PPC44x_TLB_SX | PPC44x_TLB_SW | PPC44x_TLB_SR
-	              | PPC44x_TLB_I | PPC44x_TLB_G;
-
 	vcpu->arch.pc = 0;
 	vcpu->arch.msr = 0;
 	vcpu->arch.gpr[1] = (16<<20) - 8; /* -8 for the callee-save LR slot */
@@ -503,12 +489,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	 * before it's programmed its own IVPR. */
 	vcpu->arch.ivpr = 0x55550000;
 
-	/* Since the guest can directly access the timebase, it must know the
-	 * real timebase frequency. Accordingly, it must see the state of
-	 * CCR1[TCS]. */
-	vcpu->arch.ccr1 = mfspr(SPRN_CCR1);
-
-	return 0;
+	return kvmppc_core_vcpu_setup(vcpu);
 }
 
 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
@@ -586,33 +567,10 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 	return -ENOTSUPP;
 }
 
-/* 'linear_address' is actually an encoding of AS|PID|EADDR . */
 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
                                   struct kvm_translation *tr)
 {
-	struct kvmppc_44x_tlbe *gtlbe;
-	int index;
-	gva_t eaddr;
-	u8 pid;
-	u8 as;
-
-	eaddr = tr->linear_address;
-	pid = (tr->linear_address >> 32) & 0xff;
-	as = (tr->linear_address >> 40) & 0x1;
-
-	index = kvmppc_44x_tlb_index(vcpu, eaddr, pid, as);
-	if (index == -1) {
-		tr->valid = 0;
-		return 0;
-	}
-
-	gtlbe = &vcpu->arch.guest_tlb[index];
-
-	tr->physical_address = tlb_xlate(gtlbe, eaddr);
-	/* XXX what does "writeable" and "usermode" even mean? */
-	tr->valid = 1;
-
-	return 0;
+	return kvmppc_core_vcpu_translate(vcpu, tr);
 }
 
 static int kvmppc_booke_init(void)

From db93f5745d836f81cef0b4101a7c2685eeb55efb Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Wed, 5 Nov 2008 09:36:18 -0600
Subject: [PATCH 045/140] KVM: ppc: create struct kvm_vcpu_44x and introduce
 container_of() accessor

This patch doesn't yet move all 44x-specific data into the new structure, but
is the first step down that path. In the future we may also want to create a
struct kvm_vcpu_booke.

Based on patch from Liu Yu <yu.liu@freescale.com>.

Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/powerpc/include/asm/kvm_44x.h  | 47 ++++++++++++++++++++++
 arch/powerpc/include/asm/kvm_host.h | 13 ------
 arch/powerpc/include/asm/kvm_ppc.h  |  6 +++
 arch/powerpc/kernel/asm-offsets.c   | 14 ++++---
 arch/powerpc/kvm/44x.c              | 62 ++++++++++++++++++++++++++++-
 arch/powerpc/kvm/44x_tlb.c          | 37 +++++++++++------
 arch/powerpc/kvm/booke.c            |  9 ++---
 arch/powerpc/kvm/booke_interrupts.S |  6 +--
 arch/powerpc/kvm/powerpc.c          | 23 +----------
 9 files changed, 154 insertions(+), 63 deletions(-)
 create mode 100644 arch/powerpc/include/asm/kvm_44x.h

diff --git a/arch/powerpc/include/asm/kvm_44x.h b/arch/powerpc/include/asm/kvm_44x.h
new file mode 100644
index 000000000000..dece09350712
--- /dev/null
+++ b/arch/powerpc/include/asm/kvm_44x.h
@@ -0,0 +1,47 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ */
+
+#ifndef __ASM_44X_H__
+#define __ASM_44X_H__
+
+#include <linux/kvm_host.h>
+
+/* XXX Can't include mmu-44x.h because it redefines struct mm_context. */
+#define PPC44x_TLB_SIZE 64
+
+struct kvmppc_vcpu_44x {
+	/* Unmodified copy of the guest's TLB. */
+	struct kvmppc_44x_tlbe guest_tlb[PPC44x_TLB_SIZE];
+	/* TLB that's actually used when the guest is running. */
+	struct kvmppc_44x_tlbe shadow_tlb[PPC44x_TLB_SIZE];
+	/* Pages which are referenced in the shadow TLB. */
+	struct page *shadow_pages[PPC44x_TLB_SIZE];
+
+	/* Track which TLB entries we've modified in the current exit. */
+	u8 shadow_tlb_mod[PPC44x_TLB_SIZE];
+
+	struct kvm_vcpu vcpu;
+};
+
+static inline struct kvmppc_vcpu_44x *to_44x(struct kvm_vcpu *vcpu)
+{
+	return container_of(vcpu, struct kvmppc_vcpu_44x, vcpu);
+}
+
+#endif /* __ASM_44X_H__ */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index f5850d7d57a5..765d8ec8b7d6 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -74,20 +74,7 @@ struct kvmppc_44x_tlbe {
 struct kvm_arch {
 };
 
-/* XXX Can't include mmu-44x.h because it redefines struct mm_context. */
-#define PPC44x_TLB_SIZE 64
-
 struct kvm_vcpu_arch {
-	/* Unmodified copy of the guest's TLB. */
-	struct kvmppc_44x_tlbe guest_tlb[PPC44x_TLB_SIZE];
-	/* TLB that's actually used when the guest is running. */
-	struct kvmppc_44x_tlbe shadow_tlb[PPC44x_TLB_SIZE];
-	/* Pages which are referenced in the shadow TLB. */
-	struct page *shadow_pages[PPC44x_TLB_SIZE];
-
-	/* Track which TLB entries we've modified in the current exit. */
-	u8 shadow_tlb_mod[PPC44x_TLB_SIZE];
-
 	u32 host_stack;
 	u32 host_pid;
 	u32 host_dbcr0;
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index d59332575b4d..976ecc4b322e 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -62,6 +62,9 @@ extern void kvmppc_mmu_switch_pid(struct kvm_vcpu *vcpu, u32 pid);
 
 /* Core-specific hooks */
 
+extern struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm,
+                                                unsigned int id);
+extern void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu);
 extern int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu);
 extern int kvmppc_core_check_processor_compat(void);
 extern int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu,
@@ -85,6 +88,9 @@ extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 extern int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs);
 extern int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt);
 
+extern int kvmppc_booke_init(void);
+extern void kvmppc_booke_exit(void);
+
 extern void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu);
 
 #endif /* __POWERPC_KVM_PPC_H__ */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 0264c97e02b5..393c7f36a1e8 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -23,9 +23,6 @@
 #include <linux/mm.h>
 #include <linux/suspend.h>
 #include <linux/hrtimer.h>
-#ifdef CONFIG_KVM
-#include <linux/kvm_host.h>
-#endif
 #ifdef CONFIG_PPC64
 #include <linux/time.h>
 #include <linux/hardirq.h>
@@ -51,6 +48,9 @@
 #ifdef CONFIG_PPC_ISERIES
 #include <asm/iseries/alpaca.h>
 #endif
+#ifdef CONFIG_KVM
+#include <asm/kvm_44x.h>
+#endif
 
 #if defined(CONFIG_BOOKE) || defined(CONFIG_40x)
 #include "head_booke.h"
@@ -359,10 +359,14 @@ int main(void)
 #ifdef CONFIG_KVM
 	DEFINE(TLBE_BYTES, sizeof(struct kvmppc_44x_tlbe));
 
+	DEFINE(VCPU_TO_44X, offsetof(struct kvmppc_vcpu_44x, vcpu));
+	DEFINE(VCPU44x_SHADOW_TLB,
+	       offsetof(struct kvmppc_vcpu_44x, shadow_tlb));
+	DEFINE(VCPU44x_SHADOW_MOD,
+	       offsetof(struct kvmppc_vcpu_44x, shadow_tlb_mod));
+
 	DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack));
 	DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
-	DEFINE(VCPU_SHADOW_TLB, offsetof(struct kvm_vcpu, arch.shadow_tlb));
-	DEFINE(VCPU_SHADOW_MOD, offsetof(struct kvm_vcpu, arch.shadow_tlb_mod));
 	DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
 	DEFINE(VCPU_LR, offsetof(struct kvm_vcpu, arch.lr));
 	DEFINE(VCPU_CR, offsetof(struct kvm_vcpu, arch.cr));
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index f5d7028eeb09..22054b164b5a 100644
--- a/arch/powerpc/kvm/44x.c
+++ b/arch/powerpc/kvm/44x.c
@@ -18,9 +18,13 @@
  */
 
 #include <linux/kvm_host.h>
+#include <linux/err.h>
+
 #include <asm/reg.h>
 #include <asm/cputable.h>
 #include <asm/tlbflush.h>
+#include <asm/kvm_44x.h>
+#include <asm/kvm_ppc.h>
 
 #include "44x_tlb.h"
 
@@ -124,7 +128,8 @@ int kvmppc_core_check_processor_compat(void)
 
 int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
 {
-	struct kvmppc_44x_tlbe *tlbe = &vcpu->arch.guest_tlb[0];
+	struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
+	struct kvmppc_44x_tlbe *tlbe = &vcpu_44x->guest_tlb[0];
 
 	tlbe->tid = 0;
 	tlbe->word0 = PPC44x_TLB_16M | PPC44x_TLB_VALID;
@@ -150,6 +155,7 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
 int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu,
                                struct kvm_translation *tr)
 {
+	struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
 	struct kvmppc_44x_tlbe *gtlbe;
 	int index;
 	gva_t eaddr;
@@ -166,7 +172,7 @@ int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu,
 		return 0;
 	}
 
-	gtlbe = &vcpu->arch.guest_tlb[index];
+	gtlbe = &vcpu_44x->guest_tlb[index];
 
 	tr->physical_address = tlb_xlate(gtlbe, eaddr);
 	/* XXX what does "writeable" and "usermode" even mean? */
@@ -174,3 +180,55 @@ int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu,
 
 	return 0;
 }
+
+struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id)
+{
+	struct kvmppc_vcpu_44x *vcpu_44x;
+	struct kvm_vcpu *vcpu;
+	int err;
+
+	vcpu_44x = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+	if (!vcpu_44x) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	vcpu = &vcpu_44x->vcpu;
+	err = kvm_vcpu_init(vcpu, kvm, id);
+	if (err)
+		goto free_vcpu;
+
+	return vcpu;
+
+free_vcpu:
+	kmem_cache_free(kvm_vcpu_cache, vcpu_44x);
+out:
+	return ERR_PTR(err);
+}
+
+void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
+
+	kvm_vcpu_uninit(vcpu);
+	kmem_cache_free(kvm_vcpu_cache, vcpu_44x);
+}
+
+static int kvmppc_44x_init(void)
+{
+	int r;
+
+	r = kvmppc_booke_init();
+	if (r)
+		return r;
+
+	return kvm_init(NULL, sizeof(struct kvmppc_vcpu_44x), THIS_MODULE);
+}
+
+static void kvmppc_44x_exit(void)
+{
+	kvmppc_booke_exit();
+}
+
+module_init(kvmppc_44x_init);
+module_exit(kvmppc_44x_exit);
diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index bb6da134cadb..8b65fbd6c57d 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -24,6 +24,7 @@
 #include <linux/highmem.h>
 #include <asm/mmu-44x.h>
 #include <asm/kvm_ppc.h>
+#include <asm/kvm_44x.h>
 
 #include "44x_tlb.h"
 
@@ -43,7 +44,7 @@ void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu)
 			"nr", "tid", "word0", "word1", "word2");
 
 	for (i = 0; i < PPC44x_TLB_SIZE; i++) {
-		tlbe = &vcpu->arch.guest_tlb[i];
+		tlbe = &vcpu_44x->guest_tlb[i];
 		if (tlbe->word0 & PPC44x_TLB_VALID)
 			printk(" G%2d |  %02X | %08X | %08X | %08X |\n",
 			       i, tlbe->tid, tlbe->word0, tlbe->word1,
@@ -51,7 +52,7 @@ void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu)
 	}
 
 	for (i = 0; i < PPC44x_TLB_SIZE; i++) {
-		tlbe = &vcpu->arch.shadow_tlb[i];
+		tlbe = &vcpu_44x->shadow_tlb[i];
 		if (tlbe->word0 & PPC44x_TLB_VALID)
 			printk(" S%2d | %02X | %08X | %08X | %08X |\n",
 			       i, tlbe->tid, tlbe->word0, tlbe->word1,
@@ -82,11 +83,12 @@ static u32 kvmppc_44x_tlb_shadow_attrib(u32 attrib, int usermode)
 int kvmppc_44x_tlb_index(struct kvm_vcpu *vcpu, gva_t eaddr, unsigned int pid,
                          unsigned int as)
 {
+	struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
 	int i;
 
 	/* XXX Replace loop with fancy data structures. */
 	for (i = 0; i < PPC44x_TLB_SIZE; i++) {
-		struct kvmppc_44x_tlbe *tlbe = &vcpu->arch.guest_tlb[i];
+		struct kvmppc_44x_tlbe *tlbe = &vcpu_44x->guest_tlb[i];
 		unsigned int tid;
 
 		if (eaddr < get_tlb_eaddr(tlbe))
@@ -114,25 +116,27 @@ int kvmppc_44x_tlb_index(struct kvm_vcpu *vcpu, gva_t eaddr, unsigned int pid,
 struct kvmppc_44x_tlbe *kvmppc_44x_itlb_search(struct kvm_vcpu *vcpu,
                                                gva_t eaddr)
 {
+	struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
 	unsigned int as = !!(vcpu->arch.msr & MSR_IS);
 	unsigned int index;
 
 	index = kvmppc_44x_tlb_index(vcpu, eaddr, vcpu->arch.pid, as);
 	if (index == -1)
 		return NULL;
-	return &vcpu->arch.guest_tlb[index];
+	return &vcpu_44x->guest_tlb[index];
 }
 
 struct kvmppc_44x_tlbe *kvmppc_44x_dtlb_search(struct kvm_vcpu *vcpu,
                                                gva_t eaddr)
 {
+	struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
 	unsigned int as = !!(vcpu->arch.msr & MSR_DS);
 	unsigned int index;
 
 	index = kvmppc_44x_tlb_index(vcpu, eaddr, vcpu->arch.pid, as);
 	if (index == -1)
 		return NULL;
-	return &vcpu->arch.guest_tlb[index];
+	return &vcpu_44x->guest_tlb[index];
 }
 
 static int kvmppc_44x_tlbe_is_writable(struct kvmppc_44x_tlbe *tlbe)
@@ -143,8 +147,9 @@ static int kvmppc_44x_tlbe_is_writable(struct kvmppc_44x_tlbe *tlbe)
 static void kvmppc_44x_shadow_release(struct kvm_vcpu *vcpu,
                                       unsigned int index)
 {
-	struct kvmppc_44x_tlbe *stlbe = &vcpu->arch.shadow_tlb[index];
-	struct page *page = vcpu->arch.shadow_pages[index];
+	struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
+	struct kvmppc_44x_tlbe *stlbe = &vcpu_44x->shadow_tlb[index];
+	struct page *page = vcpu_44x->shadow_pages[index];
 
 	if (get_tlb_v(stlbe)) {
 		if (kvmppc_44x_tlbe_is_writable(stlbe))
@@ -164,7 +169,9 @@ void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu)
 
 void kvmppc_tlbe_set_modified(struct kvm_vcpu *vcpu, unsigned int i)
 {
-    vcpu->arch.shadow_tlb_mod[i] = 1;
+	struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
+
+	vcpu_44x->shadow_tlb_mod[i] = 1;
 }
 
 /* Caller must ensure that the specified guest TLB entry is safe to insert into
@@ -172,6 +179,7 @@ void kvmppc_tlbe_set_modified(struct kvm_vcpu *vcpu, unsigned int i)
 void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
                     u32 flags)
 {
+	struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
 	struct page *new_page;
 	struct kvmppc_44x_tlbe *stlbe;
 	hpa_t hpaddr;
@@ -182,7 +190,7 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
 	victim = kvmppc_tlb_44x_pos++;
 	if (kvmppc_tlb_44x_pos > tlb_44x_hwater)
 		kvmppc_tlb_44x_pos = 0;
-	stlbe = &vcpu->arch.shadow_tlb[victim];
+	stlbe = &vcpu_44x->shadow_tlb[victim];
 
 	/* Get reference to new page. */
 	new_page = gfn_to_page(vcpu->kvm, gfn);
@@ -196,7 +204,7 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
 	/* Drop reference to old page. */
 	kvmppc_44x_shadow_release(vcpu, victim);
 
-	vcpu->arch.shadow_pages[victim] = new_page;
+	vcpu_44x->shadow_pages[victim] = new_page;
 
 	/* XXX Make sure (va, size) doesn't overlap any other
 	 * entries. 440x6 user manual says the result would be
@@ -224,12 +232,13 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
 static void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
                                   gva_t eend, u32 asid)
 {
+	struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
 	unsigned int pid = !(asid & 0xff);
 	int i;
 
 	/* XXX Replace loop with fancy data structures. */
 	for (i = 0; i <= tlb_44x_hwater; i++) {
-		struct kvmppc_44x_tlbe *stlbe = &vcpu->arch.shadow_tlb[i];
+		struct kvmppc_44x_tlbe *stlbe = &vcpu_44x->shadow_tlb[i];
 		unsigned int tid;
 
 		if (!get_tlb_v(stlbe))
@@ -259,12 +268,13 @@ static void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
  * switching address spaces. */
 void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode)
 {
+	struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
 	int i;
 
 	if (vcpu->arch.swap_pid) {
 		/* XXX Replace loop with fancy data structures. */
 		for (i = 0; i <= tlb_44x_hwater; i++) {
-			struct kvmppc_44x_tlbe *stlbe = &vcpu->arch.shadow_tlb[i];
+			struct kvmppc_44x_tlbe *stlbe = &vcpu_44x->shadow_tlb[i];
 
 			/* Future optimization: clear only userspace mappings. */
 			kvmppc_44x_shadow_release(vcpu, i);
@@ -303,6 +313,7 @@ static int tlbe_is_host_safe(const struct kvm_vcpu *vcpu,
 
 int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws)
 {
+	struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
 	u64 eaddr;
 	u64 raddr;
 	u64 asid;
@@ -317,7 +328,7 @@ int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws)
 		return EMULATE_FAIL;
 	}
 
-	tlbe = &vcpu->arch.guest_tlb[index];
+	tlbe = &vcpu_44x->guest_tlb[index];
 
 	/* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */
 	if (tlbe->word0 & PPC44x_TLB_VALID) {
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index c619d1b912c5..883e9db5f00c 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -573,7 +573,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 	return kvmppc_core_vcpu_translate(vcpu, tr);
 }
 
-static int kvmppc_booke_init(void)
+int kvmppc_booke_init(void)
 {
 	unsigned long ivor[16];
 	unsigned long max_ivor = 0;
@@ -618,14 +618,11 @@ static int kvmppc_booke_init(void)
 	flush_icache_range(kvmppc_booke_handlers,
 	                   kvmppc_booke_handlers + max_ivor + kvmppc_handler_len);
 
-	return kvm_init(NULL, sizeof(struct kvm_vcpu), THIS_MODULE);
+	return 0;
 }
 
-static void __exit kvmppc_booke_exit(void)
+void __exit kvmppc_booke_exit(void)
 {
 	free_pages(kvmppc_booke_handlers, VCPU_SIZE_ORDER);
 	kvm_exit();
 }
-
-module_init(kvmppc_booke_init)
-module_exit(kvmppc_booke_exit)
diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S
index 95e165baf85f..8d6929b7fdb6 100644
--- a/arch/powerpc/kvm/booke_interrupts.S
+++ b/arch/powerpc/kvm/booke_interrupts.S
@@ -349,8 +349,8 @@ lightweight_exit:
 	lis	r5, tlb_44x_hwater@ha
 	lwz	r5, tlb_44x_hwater@l(r5)
 	mtctr	r5
-	addi	r9, r4, VCPU_SHADOW_TLB
-	addi	r5, r4, VCPU_SHADOW_MOD
+	addi	r9, r4, -VCPU_TO_44X + VCPU44x_SHADOW_TLB
+	addi	r5, r4, -VCPU_TO_44X + VCPU44x_SHADOW_MOD
 	li	r3, 0
 1:
 	lbzx	r7, r3, r5
@@ -377,7 +377,7 @@ lightweight_exit:
 	/* Clear bitmap of modified TLB entries */
 	li	r5, PPC44x_TLB_SIZE>>2
 	mtctr	r5
-	addi	r5, r4, VCPU_SHADOW_MOD - 4
+	addi	r5, r4, -VCPU_TO_44X + VCPU44x_SHADOW_MOD - 4
 	li	r6, 0
 1:
 	stwu	r6, 4(r5)
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 8d0aaf96d838..237f3ba68d27 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -171,31 +171,12 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
 
 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
 {
-	struct kvm_vcpu *vcpu;
-	int err;
-
-	vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
-	if (!vcpu) {
-		err = -ENOMEM;
-		goto out;
-	}
-
-	err = kvm_vcpu_init(vcpu, kvm, id);
-	if (err)
-		goto free_vcpu;
-
-	return vcpu;
-
-free_vcpu:
-	kmem_cache_free(kvm_vcpu_cache, vcpu);
-out:
-	return ERR_PTR(err);
+	return kvmppc_core_vcpu_create(kvm, id);
 }
 
 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 {
-	kvm_vcpu_uninit(vcpu);
-	kmem_cache_free(kvm_vcpu_cache, vcpu);
+	kvmppc_core_vcpu_free(vcpu);
 }
 
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)

From 5cf8ca22146fa106f3bb865631ec04f5b499508f Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Wed, 5 Nov 2008 09:36:19 -0600
Subject: [PATCH 046/140] KVM: ppc: adjust vcpu types to support 64-bit cores

However, some of these fields could be split into separate per-core structures
in the future.

Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/powerpc/include/asm/kvm_host.h | 50 ++++++++++++++---------------
 arch/powerpc/kvm/booke.c            | 10 +++---
 arch/powerpc/kvm/emulate.c          |  2 +-
 arch/powerpc/kvm/powerpc.c          |  4 +--
 4 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 765d8ec8b7d6..a4a7d5ef6cf8 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -84,32 +84,32 @@ struct kvm_vcpu_arch {
 	u32 host_msr;
 
 	u64 fpr[32];
-	u32 gpr[32];
+	ulong gpr[32];
 
-	u32 pc;
+	ulong pc;
 	u32 cr;
-	u32 ctr;
-	u32 lr;
-	u32 xer;
+	ulong ctr;
+	ulong lr;
+	ulong xer;
 
-	u32 msr;
+	ulong msr;
 	u32 mmucr;
-	u32 sprg0;
-	u32 sprg1;
-	u32 sprg2;
-	u32 sprg3;
-	u32 sprg4;
-	u32 sprg5;
-	u32 sprg6;
-	u32 sprg7;
-	u32 srr0;
-	u32 srr1;
-	u32 csrr0;
-	u32 csrr1;
-	u32 dsrr0;
-	u32 dsrr1;
-	u32 dear;
-	u32 esr;
+	ulong sprg0;
+	ulong sprg1;
+	ulong sprg2;
+	ulong sprg3;
+	ulong sprg4;
+	ulong sprg5;
+	ulong sprg6;
+	ulong sprg7;
+	ulong srr0;
+	ulong srr1;
+	ulong csrr0;
+	ulong csrr1;
+	ulong dsrr0;
+	ulong dsrr1;
+	ulong dear;
+	ulong esr;
 	u32 dec;
 	u32 decar;
 	u32 tbl;
@@ -117,7 +117,7 @@ struct kvm_vcpu_arch {
 	u32 tcr;
 	u32 tsr;
 	u32 ivor[16];
-	u32 ivpr;
+	ulong ivpr;
 	u32 pir;
 
 	u32 shadow_pid;
@@ -131,8 +131,8 @@ struct kvm_vcpu_arch {
 	u32 dbcr1;
 
 	u32 last_inst;
-	u32 fault_dear;
-	u32 fault_esr;
+	ulong fault_dear;
+	ulong fault_esr;
 	gpa_t paddr_accessed;
 
 	u8 io_gpr; /* GPR used as IO source/target */
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 883e9db5f00c..b23cd54603f4 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -120,14 +120,14 @@ void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu)
 {
 	int i;
 
-	printk("pc:   %08x msr:  %08x\n", vcpu->arch.pc, vcpu->arch.msr);
-	printk("lr:   %08x ctr:  %08x\n", vcpu->arch.lr, vcpu->arch.ctr);
-	printk("srr0: %08x srr1: %08x\n", vcpu->arch.srr0, vcpu->arch.srr1);
+	printk("pc:   %08lx msr:  %08lx\n", vcpu->arch.pc, vcpu->arch.msr);
+	printk("lr:   %08lx ctr:  %08lx\n", vcpu->arch.lr, vcpu->arch.ctr);
+	printk("srr0: %08lx srr1: %08lx\n", vcpu->arch.srr0, vcpu->arch.srr1);
 
 	printk("exceptions: %08lx\n", vcpu->arch.pending_exceptions);
 
 	for (i = 0; i < 32; i += 4) {
-		printk("gpr%02d: %08x %08x %08x %08x\n", i,
+		printk("gpr%02d: %08lx %08lx %08lx %08lx\n", i,
 		       vcpu->arch.gpr[i],
 		       vcpu->arch.gpr[i+1],
 		       vcpu->arch.gpr[i+2],
@@ -305,7 +305,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			break;
 		case EMULATE_FAIL:
 			/* XXX Deliver Program interrupt to guest. */
-			printk(KERN_CRIT "%s: emulation at %x failed (%08x)\n",
+			printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n",
 			       __func__, vcpu->arch.pc, vcpu->arch.last_inst);
 			/* For debugging, encode the failing instruction and
 			 * report it to userspace. */
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index 30a49f8c49b2..814f1e687857 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -380,7 +380,7 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 		}
 	}
 
-	KVMTRACE_3D(PPC_INSTR, vcpu, inst, vcpu->arch.pc, emulated, entryexit);
+	KVMTRACE_3D(PPC_INSTR, vcpu, inst, (int)vcpu->arch.pc, emulated, entryexit);
 
 	if (advance)
 		vcpu->arch.pc += 4; /* Advance past emulated instruction. */
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 237f3ba68d27..7ad150e0fbbf 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -256,14 +256,14 @@ int kvm_arch_vcpu_ioctl_debug_guest(struct kvm_vcpu *vcpu,
 static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu,
                                      struct kvm_run *run)
 {
-	u32 *gpr = &vcpu->arch.gpr[vcpu->arch.io_gpr];
+	ulong *gpr = &vcpu->arch.gpr[vcpu->arch.io_gpr];
 	*gpr = run->dcr.data;
 }
 
 static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,
                                       struct kvm_run *run)
 {
-	u32 *gpr = &vcpu->arch.gpr[vcpu->arch.io_gpr];
+	ulong *gpr = &vcpu->arch.gpr[vcpu->arch.io_gpr];
 
 	if (run->mmio.len > sizeof(*gpr)) {
 		printk(KERN_ERR "bad MMIO length: %d\n", run->mmio.len);

From b8fd68ac8db1f926fdb2c7f196598a279461de53 Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Wed, 5 Nov 2008 09:36:20 -0600
Subject: [PATCH 047/140] KVM: ppc: fix set regs to take care of msr change

When changing some msr bits e.g. problem state we need to take special
care of that. We call the function in our mtmsr emulation (not needed for
wrtee[i]), but we don't call kvmppc_set_msr if we change msr via set_regs
ioctl.
It's a corner case we never hit so far, but I assume it should be
kvmppc_set_msr in our arch set regs function (I found it because it is also
a corner case when using pv support which would miss the update otherwise).

Signed-off-by: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/powerpc/kvm/booke.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index b23cd54603f4..dec3f50a494f 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -528,7 +528,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	vcpu->arch.ctr = regs->ctr;
 	vcpu->arch.lr = regs->lr;
 	vcpu->arch.xer = regs->xer;
-	vcpu->arch.msr = regs->msr;
+	kvmppc_set_msr(vcpu, regs->msr);
 	vcpu->arch.srr0 = regs->srr0;
 	vcpu->arch.srr1 = regs->srr1;
 	vcpu->arch.sprg0 = regs->sprg0;

From 1b6766c7f3533c5d03668e11dd5617ae4a52e5a8 Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Wed, 5 Nov 2008 09:36:21 -0600
Subject: [PATCH 048/140] KVM: ppc: optimize kvm stat handling

Currently we use an unnecessary if&switch to detect some cases.
To be honest we don't need the ligh_exits counter anyway, because we can
calculate it out of others. Sum_exits can also be calculated, so we can
remove that too.
MMIO, DCR  and INTR can be counted on other places without these
additional control structures (The INTR case was never hit anyway).

The handling of BOOKE_INTERRUPT_EXTERNAL/BOOKE_INTERRUPT_DECREMENTER is
similar, but we can avoid the additional if when copying 3 lines of code.
I thought about a goto there to prevent duplicate lines, but rewriting three
lines should be better style than a goto cross switch/case statements (its
also not enough code to justify a new inline function).

Signed-off-by: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/powerpc/kvm/booke.c | 32 +++++++++-----------------------
 1 file changed, 9 insertions(+), 23 deletions(-)

diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index dec3f50a494f..b285e3d32466 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -38,11 +38,9 @@ unsigned long kvmppc_booke_handlers;
 #define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
 
 struct kvm_stats_debugfs_item debugfs_entries[] = {
-	{ "exits",      VCPU_STAT(sum_exits) },
 	{ "mmio",       VCPU_STAT(mmio_exits) },
 	{ "dcr",        VCPU_STAT(dcr_exits) },
 	{ "sig",        VCPU_STAT(signal_exits) },
-	{ "light",      VCPU_STAT(light_exits) },
 	{ "itlb_r",     VCPU_STAT(itlb_real_miss_exits) },
 	{ "itlb_v",     VCPU_STAT(itlb_virt_miss_exits) },
 	{ "dtlb_r",     VCPU_STAT(dtlb_real_miss_exits) },
@@ -263,6 +261,12 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		break;
 
 	case BOOKE_INTERRUPT_EXTERNAL:
+		vcpu->stat.ext_intr_exits++;
+		if (need_resched())
+			cond_resched();
+		r = RESUME_GUEST;
+		break;
+
 	case BOOKE_INTERRUPT_DECREMENTER:
 		/* Since we switched IVPR back to the host's value, the host
 		 * handled this interrupt the moment we enabled interrupts.
@@ -272,12 +276,9 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		 * we do reschedule the host will fault over it. Perhaps we
 		 * should politely restore the host's entries to minimize
 		 * misses before ceding control. */
+		vcpu->stat.dec_exits++;
 		if (need_resched())
 			cond_resched();
-		if (exit_nr == BOOKE_INTERRUPT_DECREMENTER)
-			vcpu->stat.dec_exits++;
-		else
-			vcpu->stat.ext_intr_exits++;
 		r = RESUME_GUEST;
 		break;
 
@@ -301,6 +302,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			break;
 		case EMULATE_DO_DCR:
 			run->exit_reason = KVM_EXIT_DCR;
+			vcpu->stat.dcr_exits++;
 			r = RESUME_HOST;
 			break;
 		case EMULATE_FAIL:
@@ -379,6 +381,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			/* Guest has mapped and accessed a page which is not
 			 * actually RAM. */
 			r = kvmppc_emulate_mmio(run, vcpu);
+			vcpu->stat.mmio_exits++;
 		}
 
 		break;
@@ -445,8 +448,6 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
 	kvmppc_core_deliver_interrupts(vcpu);
 
-	/* Do some exit accounting. */
-	vcpu->stat.sum_exits++;
 	if (!(r & RESUME_HOST)) {
 		/* To avoid clobbering exit_reason, only check for signals if
 		 * we aren't already exiting to userspace for some other
@@ -454,22 +455,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		if (signal_pending(current)) {
 			run->exit_reason = KVM_EXIT_INTR;
 			r = (-EINTR << 2) | RESUME_HOST | (r & RESUME_FLAG_NV);
-
 			vcpu->stat.signal_exits++;
-		} else {
-			vcpu->stat.light_exits++;
-		}
-	} else {
-		switch (run->exit_reason) {
-		case KVM_EXIT_MMIO:
-			vcpu->stat.mmio_exits++;
-			break;
-		case KVM_EXIT_DCR:
-			vcpu->stat.dcr_exits++;
-			break;
-		case KVM_EXIT_INTR:
-			vcpu->stat.signal_exits++;
-			break;
 		}
 	}
 

From 9ab80843c01ac25139e635d018467e528729a317 Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Wed, 5 Nov 2008 09:36:22 -0600
Subject: [PATCH 049/140] KVM: ppc: optimize find first bit

Since we use a unsigned long here anyway we can use the optimized __ffs.

Signed-off-by: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/powerpc/kvm/booke.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index b285e3d32466..0f064719162c 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -222,7 +222,7 @@ void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
 	unsigned int exception;
 	unsigned int priority;
 
-	priority = find_first_bit(pending, BITS_PER_BYTE * sizeof(*pending));
+	priority = __ffs(*pending);
 	while (priority <= BOOKE_MAX_INTERRUPT) {
 		exception = priority_exception[priority];
 		if (kvmppc_can_deliver_interrupt(vcpu, exception)) {

From d4cf3892e50b8e35341086a4fe2bb8a3989b55d4 Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Wed, 5 Nov 2008 09:36:23 -0600
Subject: [PATCH 050/140] KVM: ppc: optimize irq delivery path

In kvmppc_deliver_interrupt is just one case left in the switch and it is a
rare one (less than 8%) when looking at the exit numbers. Therefore we can
at least drop the switch/case and if an if. I inserted an unlikely too, but
that's open for discussion.

In kvmppc_can_deliver_interrupt all frequent cases are in the default case.
I know compilers are smart but we can make it easier for them. By writing
down all options and removing the default case combined with the fact that
ithe values are constants 0..15 should allow the compiler to write an easy
jump table.
Modifying kvmppc_can_deliver_interrupt pointed me to the fact that gcc seems
to be unable to reduce priority_exception[x] to a build time constant.
Therefore I changed the usage of the translation arrays in the interrupt
delivery path completely. It is now using priority without translation to irq
on the full irq delivery path.
To be able to do that ivpr regs are stored by their priority now.

Additionally the decision made in kvmppc_can_deliver_interrupt is already
sufficient to get the value of interrupt_msr_mask[x]. Therefore we can replace
the 16x4byte array used here with a single 4byte variable (might still be one
miss, but the chance to find this in cache should be better than the right
entry of the whole array).

Signed-off-by: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/powerpc/include/asm/kvm_ppc.h |   3 -
 arch/powerpc/kvm/44x_emulate.c     | 100 ++++++++++------
 arch/powerpc/kvm/booke.c           | 179 +++++++++--------------------
 arch/powerpc/kvm/booke.h           |  18 +++
 4 files changed, 142 insertions(+), 158 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 976ecc4b322e..844f683302f6 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -36,9 +36,6 @@ enum emulation_result {
 	EMULATE_FAIL,         /* can't emulate this instruction */
 };
 
-extern const unsigned char exception_priority[];
-extern const unsigned char priority_exception[];
-
 extern int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
 extern char kvmppc_handlers_start[];
 extern unsigned long kvmppc_handler_len;
diff --git a/arch/powerpc/kvm/44x_emulate.c b/arch/powerpc/kvm/44x_emulate.c
index a634c0c4fa7e..9bc50cebf9ec 100644
--- a/arch/powerpc/kvm/44x_emulate.c
+++ b/arch/powerpc/kvm/44x_emulate.c
@@ -228,39 +228,56 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 		vcpu->arch.sprg7 = vcpu->arch.gpr[rs]; break;
 
 	case SPRN_IVPR:
-		vcpu->arch.ivpr = vcpu->arch.gpr[rs]; break;
+		vcpu->arch.ivpr = vcpu->arch.gpr[rs];
+		break;
 	case SPRN_IVOR0:
-		vcpu->arch.ivor[0] = vcpu->arch.gpr[rs]; break;
+		vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL] = vcpu->arch.gpr[rs];
+		break;
 	case SPRN_IVOR1:
-		vcpu->arch.ivor[1] = vcpu->arch.gpr[rs]; break;
+		vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK] = vcpu->arch.gpr[rs];
+		break;
 	case SPRN_IVOR2:
-		vcpu->arch.ivor[2] = vcpu->arch.gpr[rs]; break;
+		vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE] = vcpu->arch.gpr[rs];
+		break;
 	case SPRN_IVOR3:
-		vcpu->arch.ivor[3] = vcpu->arch.gpr[rs]; break;
+		vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE] = vcpu->arch.gpr[rs];
+		break;
 	case SPRN_IVOR4:
-		vcpu->arch.ivor[4] = vcpu->arch.gpr[rs]; break;
+		vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL] = vcpu->arch.gpr[rs];
+		break;
 	case SPRN_IVOR5:
-		vcpu->arch.ivor[5] = vcpu->arch.gpr[rs]; break;
+		vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT] = vcpu->arch.gpr[rs];
+		break;
 	case SPRN_IVOR6:
-		vcpu->arch.ivor[6] = vcpu->arch.gpr[rs]; break;
+		vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM] = vcpu->arch.gpr[rs];
+		break;
 	case SPRN_IVOR7:
-		vcpu->arch.ivor[7] = vcpu->arch.gpr[rs]; break;
+		vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL] = vcpu->arch.gpr[rs];
+		break;
 	case SPRN_IVOR8:
-		vcpu->arch.ivor[8] = vcpu->arch.gpr[rs]; break;
+		vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL] = vcpu->arch.gpr[rs];
+		break;
 	case SPRN_IVOR9:
-		vcpu->arch.ivor[9] = vcpu->arch.gpr[rs]; break;
+		vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL] = vcpu->arch.gpr[rs];
+		break;
 	case SPRN_IVOR10:
-		vcpu->arch.ivor[10] = vcpu->arch.gpr[rs]; break;
+		vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER] = vcpu->arch.gpr[rs];
+		break;
 	case SPRN_IVOR11:
-		vcpu->arch.ivor[11] = vcpu->arch.gpr[rs]; break;
+		vcpu->arch.ivor[BOOKE_IRQPRIO_FIT] = vcpu->arch.gpr[rs];
+		break;
 	case SPRN_IVOR12:
-		vcpu->arch.ivor[12] = vcpu->arch.gpr[rs]; break;
+		vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG] = vcpu->arch.gpr[rs];
+		break;
 	case SPRN_IVOR13:
-		vcpu->arch.ivor[13] = vcpu->arch.gpr[rs]; break;
+		vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS] = vcpu->arch.gpr[rs];
+		break;
 	case SPRN_IVOR14:
-		vcpu->arch.ivor[14] = vcpu->arch.gpr[rs]; break;
+		vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS] = vcpu->arch.gpr[rs];
+		break;
 	case SPRN_IVOR15:
-		vcpu->arch.ivor[15] = vcpu->arch.gpr[rs]; break;
+		vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = vcpu->arch.gpr[rs];
+		break;
 
 	default:
 		return EMULATE_FAIL;
@@ -295,37 +312,54 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
 		vcpu->arch.gpr[rt] = vcpu->arch.dbcr1; break;
 
 	case SPRN_IVOR0:
-		vcpu->arch.gpr[rt] = vcpu->arch.ivor[0]; break;
+		vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL];
+		break;
 	case SPRN_IVOR1:
-		vcpu->arch.gpr[rt] = vcpu->arch.ivor[1]; break;
+		vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK];
+		break;
 	case SPRN_IVOR2:
-		vcpu->arch.gpr[rt] = vcpu->arch.ivor[2]; break;
+		vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE];
+		break;
 	case SPRN_IVOR3:
-		vcpu->arch.gpr[rt] = vcpu->arch.ivor[3]; break;
+		vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE];
+		break;
 	case SPRN_IVOR4:
-		vcpu->arch.gpr[rt] = vcpu->arch.ivor[4]; break;
+		vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL];
+		break;
 	case SPRN_IVOR5:
-		vcpu->arch.gpr[rt] = vcpu->arch.ivor[5]; break;
+		vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT];
+		break;
 	case SPRN_IVOR6:
-		vcpu->arch.gpr[rt] = vcpu->arch.ivor[6]; break;
+		vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM];
+		break;
 	case SPRN_IVOR7:
-		vcpu->arch.gpr[rt] = vcpu->arch.ivor[7]; break;
+		vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL];
+		break;
 	case SPRN_IVOR8:
-		vcpu->arch.gpr[rt] = vcpu->arch.ivor[8]; break;
+		vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL];
+		break;
 	case SPRN_IVOR9:
-		vcpu->arch.gpr[rt] = vcpu->arch.ivor[9]; break;
+		vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL];
+		break;
 	case SPRN_IVOR10:
-		vcpu->arch.gpr[rt] = vcpu->arch.ivor[10]; break;
+		vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER];
+		break;
 	case SPRN_IVOR11:
-		vcpu->arch.gpr[rt] = vcpu->arch.ivor[11]; break;
+		vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_FIT];
+		break;
 	case SPRN_IVOR12:
-		vcpu->arch.gpr[rt] = vcpu->arch.ivor[12]; break;
+		vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG];
+		break;
 	case SPRN_IVOR13:
-		vcpu->arch.gpr[rt] = vcpu->arch.ivor[13]; break;
+		vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS];
+		break;
 	case SPRN_IVOR14:
-		vcpu->arch.gpr[rt] = vcpu->arch.ivor[14]; break;
+		vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS];
+		break;
 	case SPRN_IVOR15:
-		vcpu->arch.gpr[rt] = vcpu->arch.ivor[15]; break;
+		vcpu->arch.gpr[rt] = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG];
+		break;
+
 	default:
 		return EMULATE_FAIL;
 	}
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 0f064719162c..ec59a6768ec3 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -55,64 +55,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ NULL }
 };
 
-static const u32 interrupt_msr_mask[16] = {
-	[BOOKE_INTERRUPT_CRITICAL]      = MSR_ME,
-	[BOOKE_INTERRUPT_MACHINE_CHECK] = 0,
-	[BOOKE_INTERRUPT_DATA_STORAGE]  = MSR_CE|MSR_ME|MSR_DE,
-	[BOOKE_INTERRUPT_INST_STORAGE]  = MSR_CE|MSR_ME|MSR_DE,
-	[BOOKE_INTERRUPT_EXTERNAL]      = MSR_CE|MSR_ME|MSR_DE,
-	[BOOKE_INTERRUPT_ALIGNMENT]     = MSR_CE|MSR_ME|MSR_DE,
-	[BOOKE_INTERRUPT_PROGRAM]       = MSR_CE|MSR_ME|MSR_DE,
-	[BOOKE_INTERRUPT_FP_UNAVAIL]    = MSR_CE|MSR_ME|MSR_DE,
-	[BOOKE_INTERRUPT_SYSCALL]       = MSR_CE|MSR_ME|MSR_DE,
-	[BOOKE_INTERRUPT_AP_UNAVAIL]    = MSR_CE|MSR_ME|MSR_DE,
-	[BOOKE_INTERRUPT_DECREMENTER]   = MSR_CE|MSR_ME|MSR_DE,
-	[BOOKE_INTERRUPT_FIT]           = MSR_CE|MSR_ME|MSR_DE,
-	[BOOKE_INTERRUPT_WATCHDOG]      = MSR_ME,
-	[BOOKE_INTERRUPT_DTLB_MISS]     = MSR_CE|MSR_ME|MSR_DE,
-	[BOOKE_INTERRUPT_ITLB_MISS]     = MSR_CE|MSR_ME|MSR_DE,
-	[BOOKE_INTERRUPT_DEBUG]         = MSR_ME,
-};
-
-const unsigned char exception_priority[] = {
-	[BOOKE_INTERRUPT_DATA_STORAGE] = 0,
-	[BOOKE_INTERRUPT_INST_STORAGE] = 1,
-	[BOOKE_INTERRUPT_ALIGNMENT] = 2,
-	[BOOKE_INTERRUPT_PROGRAM] = 3,
-	[BOOKE_INTERRUPT_FP_UNAVAIL] = 4,
-	[BOOKE_INTERRUPT_SYSCALL] = 5,
-	[BOOKE_INTERRUPT_AP_UNAVAIL] = 6,
-	[BOOKE_INTERRUPT_DTLB_MISS] = 7,
-	[BOOKE_INTERRUPT_ITLB_MISS] = 8,
-	[BOOKE_INTERRUPT_MACHINE_CHECK] = 9,
-	[BOOKE_INTERRUPT_DEBUG] = 10,
-	[BOOKE_INTERRUPT_CRITICAL] = 11,
-	[BOOKE_INTERRUPT_WATCHDOG] = 12,
-	[BOOKE_INTERRUPT_EXTERNAL] = 13,
-	[BOOKE_INTERRUPT_FIT] = 14,
-	[BOOKE_INTERRUPT_DECREMENTER] = 15,
-};
-
-const unsigned char priority_exception[] = {
-	BOOKE_INTERRUPT_DATA_STORAGE,
-	BOOKE_INTERRUPT_INST_STORAGE,
-	BOOKE_INTERRUPT_ALIGNMENT,
-	BOOKE_INTERRUPT_PROGRAM,
-	BOOKE_INTERRUPT_FP_UNAVAIL,
-	BOOKE_INTERRUPT_SYSCALL,
-	BOOKE_INTERRUPT_AP_UNAVAIL,
-	BOOKE_INTERRUPT_DTLB_MISS,
-	BOOKE_INTERRUPT_ITLB_MISS,
-	BOOKE_INTERRUPT_MACHINE_CHECK,
-	BOOKE_INTERRUPT_DEBUG,
-	BOOKE_INTERRUPT_CRITICAL,
-	BOOKE_INTERRUPT_WATCHDOG,
-	BOOKE_INTERRUPT_EXTERNAL,
-	BOOKE_INTERRUPT_FIT,
-	BOOKE_INTERRUPT_DECREMENTER,
-};
-
-
 /* TODO: use vcpu_printf() */
 void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu)
 {
@@ -133,103 +75,96 @@ void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu)
 	}
 }
 
-static void kvmppc_booke_queue_exception(struct kvm_vcpu *vcpu, int exception)
+static void kvmppc_booke_queue_irqprio(struct kvm_vcpu *vcpu,
+                                       unsigned int priority)
 {
-	unsigned int priority = exception_priority[exception];
 	set_bit(priority, &vcpu->arch.pending_exceptions);
 }
 
-static void kvmppc_booke_clear_exception(struct kvm_vcpu *vcpu, int exception)
-{
-	unsigned int priority = exception_priority[exception];
-	clear_bit(priority, &vcpu->arch.pending_exceptions);
-}
-
 void kvmppc_core_queue_program(struct kvm_vcpu *vcpu)
 {
-	kvmppc_booke_queue_exception(vcpu, BOOKE_INTERRUPT_PROGRAM);
+	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_PROGRAM);
 }
 
 void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu)
 {
-	kvmppc_booke_queue_exception(vcpu, BOOKE_INTERRUPT_DECREMENTER);
+	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DECREMENTER);
 }
 
 int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu)
 {
-	unsigned int priority = exception_priority[BOOKE_INTERRUPT_DECREMENTER];
-	return test_bit(priority, &vcpu->arch.pending_exceptions);
+	return test_bit(BOOKE_IRQPRIO_DECREMENTER, &vcpu->arch.pending_exceptions);
 }
 
 void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
                                 struct kvm_interrupt *irq)
 {
-	kvmppc_booke_queue_exception(vcpu, BOOKE_INTERRUPT_EXTERNAL);
+	kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_EXTERNAL);
 }
 
-/* Check if we are ready to deliver the interrupt */
-static int kvmppc_can_deliver_interrupt(struct kvm_vcpu *vcpu, int interrupt)
+/* Deliver the interrupt of the corresponding priority, if possible. */
+static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
+                                        unsigned int priority)
 {
-	int r;
+	int allowed = 0;
+	ulong msr_mask;
 
-	switch (interrupt) {
-	case BOOKE_INTERRUPT_CRITICAL:
-		r = vcpu->arch.msr & MSR_CE;
+	switch (priority) {
+	case BOOKE_IRQPRIO_PROGRAM:
+	case BOOKE_IRQPRIO_DTLB_MISS:
+	case BOOKE_IRQPRIO_ITLB_MISS:
+	case BOOKE_IRQPRIO_SYSCALL:
+	case BOOKE_IRQPRIO_DATA_STORAGE:
+	case BOOKE_IRQPRIO_INST_STORAGE:
+	case BOOKE_IRQPRIO_FP_UNAVAIL:
+	case BOOKE_IRQPRIO_AP_UNAVAIL:
+	case BOOKE_IRQPRIO_ALIGNMENT:
+		allowed = 1;
+		msr_mask = MSR_CE|MSR_ME|MSR_DE;
 		break;
-	case BOOKE_INTERRUPT_MACHINE_CHECK:
-		r = vcpu->arch.msr & MSR_ME;
+	case BOOKE_IRQPRIO_CRITICAL:
+	case BOOKE_IRQPRIO_WATCHDOG:
+		allowed = vcpu->arch.msr & MSR_CE;
+		msr_mask = MSR_ME;
 		break;
-	case BOOKE_INTERRUPT_EXTERNAL:
-		r = vcpu->arch.msr & MSR_EE;
+	case BOOKE_IRQPRIO_MACHINE_CHECK:
+		allowed = vcpu->arch.msr & MSR_ME;
+		msr_mask = 0;
 		break;
-	case BOOKE_INTERRUPT_DECREMENTER:
-		r = vcpu->arch.msr & MSR_EE;
+	case BOOKE_IRQPRIO_EXTERNAL:
+	case BOOKE_IRQPRIO_DECREMENTER:
+	case BOOKE_IRQPRIO_FIT:
+		allowed = vcpu->arch.msr & MSR_EE;
+		msr_mask = MSR_CE|MSR_ME|MSR_DE;
 		break;
-	case BOOKE_INTERRUPT_FIT:
-		r = vcpu->arch.msr & MSR_EE;
-		break;
-	case BOOKE_INTERRUPT_WATCHDOG:
-		r = vcpu->arch.msr & MSR_CE;
-		break;
-	case BOOKE_INTERRUPT_DEBUG:
-		r = vcpu->arch.msr & MSR_DE;
-		break;
-	default:
-		r = 1;
-	}
-
-	return r;
-}
-
-static void kvmppc_booke_deliver_interrupt(struct kvm_vcpu *vcpu, int interrupt)
-{
-	switch (interrupt) {
-	case BOOKE_INTERRUPT_DECREMENTER:
-		vcpu->arch.tsr |= TSR_DIS;
+	case BOOKE_IRQPRIO_DEBUG:
+		allowed = vcpu->arch.msr & MSR_DE;
+		msr_mask = MSR_ME;
 		break;
 	}
 
-	vcpu->arch.srr0 = vcpu->arch.pc;
-	vcpu->arch.srr1 = vcpu->arch.msr;
-	vcpu->arch.pc = vcpu->arch.ivpr | vcpu->arch.ivor[interrupt];
-	kvmppc_set_msr(vcpu, vcpu->arch.msr & interrupt_msr_mask[interrupt]);
+	if (allowed) {
+		vcpu->arch.srr0 = vcpu->arch.pc;
+		vcpu->arch.srr1 = vcpu->arch.msr;
+		vcpu->arch.pc = vcpu->arch.ivpr | vcpu->arch.ivor[priority];
+		kvmppc_set_msr(vcpu, vcpu->arch.msr & msr_mask);
+
+		clear_bit(priority, &vcpu->arch.pending_exceptions);
+	}
+
+	return allowed;
 }
 
 /* Check pending exceptions and deliver one, if possible. */
 void kvmppc_core_deliver_interrupts(struct kvm_vcpu *vcpu)
 {
 	unsigned long *pending = &vcpu->arch.pending_exceptions;
-	unsigned int exception;
 	unsigned int priority;
 
 	priority = __ffs(*pending);
 	while (priority <= BOOKE_MAX_INTERRUPT) {
-		exception = priority_exception[priority];
-		if (kvmppc_can_deliver_interrupt(vcpu, exception)) {
-			kvmppc_booke_clear_exception(vcpu, exception);
-			kvmppc_booke_deliver_interrupt(vcpu, exception);
+		if (kvmppc_booke_irqprio_deliver(vcpu, priority))
 			break;
-		}
 
 		priority = find_next_bit(pending,
 		                         BITS_PER_BYTE * sizeof(*pending),
@@ -287,7 +222,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			/* Program traps generated by user-level software must be handled
 			 * by the guest kernel. */
 			vcpu->arch.esr = vcpu->arch.fault_esr;
-			kvmppc_booke_queue_exception(vcpu, BOOKE_INTERRUPT_PROGRAM);
+			kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_PROGRAM);
 			r = RESUME_GUEST;
 			break;
 		}
@@ -321,27 +256,27 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		break;
 
 	case BOOKE_INTERRUPT_FP_UNAVAIL:
-		kvmppc_booke_queue_exception(vcpu, exit_nr);
+		kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_FP_UNAVAIL);
 		r = RESUME_GUEST;
 		break;
 
 	case BOOKE_INTERRUPT_DATA_STORAGE:
 		vcpu->arch.dear = vcpu->arch.fault_dear;
 		vcpu->arch.esr = vcpu->arch.fault_esr;
-		kvmppc_booke_queue_exception(vcpu, exit_nr);
+		kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DATA_STORAGE);
 		vcpu->stat.dsi_exits++;
 		r = RESUME_GUEST;
 		break;
 
 	case BOOKE_INTERRUPT_INST_STORAGE:
 		vcpu->arch.esr = vcpu->arch.fault_esr;
-		kvmppc_booke_queue_exception(vcpu, exit_nr);
+		kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_INST_STORAGE);
 		vcpu->stat.isi_exits++;
 		r = RESUME_GUEST;
 		break;
 
 	case BOOKE_INTERRUPT_SYSCALL:
-		kvmppc_booke_queue_exception(vcpu, exit_nr);
+		kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SYSCALL);
 		vcpu->stat.syscall_exits++;
 		r = RESUME_GUEST;
 		break;
@@ -355,7 +290,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		gtlbe = kvmppc_44x_dtlb_search(vcpu, eaddr);
 		if (!gtlbe) {
 			/* The guest didn't have a mapping for it. */
-			kvmppc_booke_queue_exception(vcpu, exit_nr);
+			kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DTLB_MISS);
 			vcpu->arch.dear = vcpu->arch.fault_dear;
 			vcpu->arch.esr = vcpu->arch.fault_esr;
 			vcpu->stat.dtlb_real_miss_exits++;
@@ -398,7 +333,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		gtlbe = kvmppc_44x_itlb_search(vcpu, eaddr);
 		if (!gtlbe) {
 			/* The guest didn't have a mapping for it. */
-			kvmppc_booke_queue_exception(vcpu, exit_nr);
+			kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ITLB_MISS);
 			vcpu->stat.itlb_real_miss_exits++;
 			break;
 		}
@@ -418,7 +353,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			               gtlbe->word2);
 		} else {
 			/* Guest mapped and leaped at non-RAM! */
-			kvmppc_booke_queue_exception(vcpu, BOOKE_INTERRUPT_MACHINE_CHECK);
+			kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_MACHINE_CHECK);
 		}
 
 		break;
diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h
index f694a4b2dafa..48d905fd60ab 100644
--- a/arch/powerpc/kvm/booke.h
+++ b/arch/powerpc/kvm/booke.h
@@ -23,6 +23,24 @@
 #include <linux/types.h>
 #include <linux/kvm_host.h>
 
+/* interrupt priortity ordering */
+#define BOOKE_IRQPRIO_DATA_STORAGE 0
+#define BOOKE_IRQPRIO_INST_STORAGE 1
+#define BOOKE_IRQPRIO_ALIGNMENT 2
+#define BOOKE_IRQPRIO_PROGRAM 3
+#define BOOKE_IRQPRIO_FP_UNAVAIL 4
+#define BOOKE_IRQPRIO_SYSCALL 5
+#define BOOKE_IRQPRIO_AP_UNAVAIL 6
+#define BOOKE_IRQPRIO_DTLB_MISS 7
+#define BOOKE_IRQPRIO_ITLB_MISS 8
+#define BOOKE_IRQPRIO_MACHINE_CHECK 9
+#define BOOKE_IRQPRIO_DEBUG 10
+#define BOOKE_IRQPRIO_CRITICAL 11
+#define BOOKE_IRQPRIO_WATCHDOG 12
+#define BOOKE_IRQPRIO_EXTERNAL 13
+#define BOOKE_IRQPRIO_FIT 14
+#define BOOKE_IRQPRIO_DECREMENTER 15
+
 /* Helper function for "full" MSR writes. No need to call this if only EE is
  * changing. */
 static inline void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)

From fcfdbd266a41d3e41d17666de410a24995fde03a Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Wed, 5 Nov 2008 09:36:24 -0600
Subject: [PATCH 051/140] KVM: ppc: improve trap emulation

set ESR[PTR] when emulating a guest trap. This allows Linux guests to
properly handle WARN_ON() (i.e. detect that it's a non-fatal trap).

Also remove debugging printk in trap emulation.

Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/powerpc/kvm/emulate.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index 814f1e687857..4c30fa0c31ea 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -74,8 +74,8 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	int advance = 1;
 
 	switch (get_op(inst)) {
-	case 3:                                                 /* trap */
-		printk("trap!\n");
+	case 3:                                             /* trap */
+		vcpu->arch.esr |= ESR_PTR;
 		kvmppc_core_queue_program(vcpu);
 		advance = 0;
 		break;

From 0853d2c1d849ef69884d2447d90d04007590b72b Mon Sep 17 00:00:00 2001
From: Nitin A Kamble <nitin.a.kamble@intel.com>
Date: Wed, 5 Nov 2008 15:37:36 -0800
Subject: [PATCH 052/140] KVM: Fix cpuid leaf 0xb loop termination

For cpuid leaf 0xb the bits 8-15 in ECX register define the end of counting
leaf.      The previous code was using bits 0-7 for this purpose, which is
a bug.

Signed-off-by: Nitin A Kamble <nitin.a.kamble@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9a4a39cfe6ef..2889a0f359ea 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1276,7 +1276,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
 		/* read more entries until level_type is zero */
 		for (i = 1; *nent < maxnent; ++i) {
-			level_type = entry[i - 1].ecx & 0xff;
+			level_type = entry[i - 1].ecx & 0xff00;
 			if (!level_type)
 				break;
 			do_cpuid_1_ent(&entry[i], function, i);

From 0fdf8e59faa5c60e9d77c8e14abe3a0f8bfcf586 Mon Sep 17 00:00:00 2001
From: Nitin A Kamble <nitin.a.kamble@intel.com>
Date: Wed, 5 Nov 2008 15:56:21 -0800
Subject: [PATCH 053/140] KVM: Fix cpuid iteration on multiple leaves per eac

The code to traverse the cpuid data array list for counting type of leaves is
currently broken.

This patches fixes the 2 things in it.

 1. Set the 1st counting entry's flag KVM_CPUID_FLAG_STATE_READ_NEXT. Without
    it the code will never find a valid entry.

 2. Also the stop condition in the for loop while looking for the next unflaged
    entry is broken. It needs to stop when it find one matching entry;
    and in the case of count of 1, it will be the same entry found in this
    iteration.

Signed-Off-By: Nitin A Kamble <nitin.a.kamble@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2889a0f359ea..7a2aeba0bfbd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1246,6 +1246,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		int t, times = entry->eax & 0xff;
 
 		entry->flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
+		entry->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;
 		for (t = 1; t < times && *nent < maxnent; ++t) {
 			do_cpuid_1_ent(&entry[t], function, 0);
 			entry[t].flags |= KVM_CPUID_FLAG_STATEFUL_FUNC;
@@ -2801,7 +2802,7 @@ static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
 
 	e->flags &= ~KVM_CPUID_FLAG_STATE_READ_NEXT;
 	/* when no next entry is found, the current entry[i] is reselected */
-	for (j = i + 1; j == i; j = (j + 1) % nent) {
+	for (j = i + 1; ; j = (j + 1) % nent) {
 		struct kvm_cpuid_entry2 *ej = &vcpu->arch.cpuid_entries[j];
 		if (ej->function == e->function) {
 			ej->flags |= KVM_CPUID_FLAG_STATE_READ_NEXT;

From 78749809222be5083e21bfe697b44ab797e5c0a8 Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Fri, 7 Nov 2008 13:32:12 -0600
Subject: [PATCH 054/140] KVM: ensure that memslot userspace addresses are
 page-aligned

Bad page translation and silent guest failure ensue if the userspace address is
not page-aligned.  I hit this problem using large (host) pages with qemu,
because qemu currently has a hardcoded 4096-byte alignment for guest memory
allocations.

Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a65baa9039d5..0a0a9595ba3b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -715,6 +715,8 @@ int __kvm_set_memory_region(struct kvm *kvm,
 		goto out;
 	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
 		goto out;
+	if (mem->userspace_addr & (PAGE_SIZE - 1))
+		goto out;
 	if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
 		goto out;
 	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)

From 74ef740da64fd82a14dbab6d7f43d798ecc1b6cc Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Fri, 7 Nov 2008 13:15:13 -0600
Subject: [PATCH 055/140] KVM: ppc: fix Kconfig constraints

Make sure that CONFIG_KVM cannot be selected without processor support
(currently, 440 is the only processor implementation available).

Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/powerpc/kvm/Kconfig | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 37e9b3c52a38..e4ab1c7fd925 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -15,25 +15,23 @@ menuconfig VIRTUALIZATION
 if VIRTUALIZATION
 
 config KVM
-	bool "Kernel-based Virtual Machine (KVM) support"
-	depends on EXPERIMENTAL
+	bool
 	select PREEMPT_NOTIFIERS
 	select ANON_INODES
+
+config KVM_440
+	bool "KVM support for PowerPC 440 processors"
+	depends on EXPERIMENTAL && 44x
+	select KVM
 	---help---
-	  Support hosting virtualized guest machines. You will also
-	  need to select one or more of the processor modules below.
+	  Support running unmodified 440 guest kernels in virtual machines on
+	  440 host processors.
 
 	  This module provides access to the hardware capabilities through
 	  a character device node named /dev/kvm.
 
 	  If unsure, say N.
 
-config KVM_440
-	bool "KVM support for PowerPC 440 processors"
-	depends on KVM && 44x
-	---help---
-	  KVM can run unmodified 440 guest kernels on 440 host processors.
-
 config KVM_TRACE
 	bool "KVM trace support"
 	depends on KVM && MARKERS && SYSFS

From 30ed5bb685ab03c9bdf812502900b65087d61490 Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Fri, 24 Oct 2008 11:47:57 +0800
Subject: [PATCH 056/140] KVM: ia64: Remove some macro definitions in
 asm-offsets.c.

Use kernel's corresponding macro instead.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/asm-offsets.c | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/arch/ia64/kvm/asm-offsets.c b/arch/ia64/kvm/asm-offsets.c
index 4e3dc13a619c..0c3564a7a033 100644
--- a/arch/ia64/kvm/asm-offsets.c
+++ b/arch/ia64/kvm/asm-offsets.c
@@ -24,19 +24,10 @@
 
 #include <linux/autoconf.h>
 #include <linux/kvm_host.h>
+#include <linux/kbuild.h>
 
 #include "vcpu.h"
 
-#define task_struct kvm_vcpu
-
-#define DEFINE(sym, val) \
-	asm volatile("\n->" #sym " (%0) " #val : : "i" (val))
-
-#define BLANK() asm volatile("\n->" : :)
-
-#define OFFSET(_sym, _str, _mem) \
-    DEFINE(_sym, offsetof(_str, _mem));
-
 void foo(void)
 {
 	DEFINE(VMM_TASK_SIZE, sizeof(struct kvm_vcpu));

From e7cacd40d20849f69c908f1290c714145073685a Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Tue, 11 Nov 2008 15:30:40 +0800
Subject: [PATCH 057/140] KVM: Fix kernel allocated memory slot

Commit 7fd49de9773fdcb7b75e823b21c1c5dc1e218c14 "KVM: ensure that memslot
userspace addresses are page-aligned" broke kernel space allocated memory
slot, for the userspace_addr is invalid.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 0a0a9595ba3b..4727c08da2e9 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -715,7 +715,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
 		goto out;
 	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
 		goto out;
-	if (mem->userspace_addr & (PAGE_SIZE - 1))
+	if (user_alloc && (mem->userspace_addr & (PAGE_SIZE - 1)))
 		goto out;
 	if (mem->slot >= KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS)
 		goto out;

From bf5d4025c9fe8a64c5905c00bf4292319d634903 Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Mon, 10 Nov 2008 14:57:34 -0600
Subject: [PATCH 058/140] KVM: ppc: use MMUCR accessor to obtain TID

We have an accessor; might as well use it.

Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/powerpc/kvm/44x_tlb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index 8b65fbd6c57d..260fa8bc4608 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -339,7 +339,7 @@ int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws)
 
 	switch (ws) {
 	case PPC44x_TLB_PAGEID:
-		tlbe->tid = vcpu->arch.mmucr & 0xff;
+		tlbe->tid = get_mmucr_stid(vcpu);
 		tlbe->word0 = vcpu->arch.gpr[rs];
 		break;
 

From df9b856c454e331bc394c80903fcdea19cae2a33 Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Mon, 10 Nov 2008 14:57:35 -0600
Subject: [PATCH 059/140] KVM: ppc: use prefetchable mappings for guest memory

Bare metal Linux on 440 can "overmap" RAM in the kernel linear map, so that it
can use large (256MB) mappings even if memory isn't a multiple of 256MB. To
prevent the hardware prefetcher from loading from an invalid physical address
through that mapping, it's marked Guarded.

However, KVM must ensure that all guest mappings are backed by real physical
RAM (since a deliberate access through a guarded mapping could still cause a
machine check). Accordingly, we don't need to make our mappings guarded, so
let's allow prefetching as the designers intended.

Curiously this patch didn't affect performance at all on the quick test I
tried, but it's clearly the right thing to do anyways and may improve other
workloads.

Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/powerpc/kvm/44x_tlb.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index 260fa8bc4608..6fadbd696021 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -28,6 +28,8 @@
 
 #include "44x_tlb.h"
 
+#define PPC44x_TLB_UATTR_MASK \
+	(PPC44x_TLB_U0|PPC44x_TLB_U1|PPC44x_TLB_U2|PPC44x_TLB_U3)
 #define PPC44x_TLB_USER_PERM_MASK (PPC44x_TLB_UX|PPC44x_TLB_UR|PPC44x_TLB_UW)
 #define PPC44x_TLB_SUPER_PERM_MASK (PPC44x_TLB_SX|PPC44x_TLB_SR|PPC44x_TLB_SW)
 
@@ -63,8 +65,8 @@ void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu)
 
 static u32 kvmppc_44x_tlb_shadow_attrib(u32 attrib, int usermode)
 {
-	/* Mask off reserved bits. */
-	attrib &= PPC44x_TLB_PERM_MASK|PPC44x_TLB_ATTR_MASK;
+	/* We only care about the guest's permission and user bits. */
+	attrib &= PPC44x_TLB_PERM_MASK|PPC44x_TLB_UATTR_MASK;
 
 	if (!usermode) {
 		/* Guest is in supervisor mode, so we need to translate guest
@@ -76,6 +78,9 @@ static u32 kvmppc_44x_tlb_shadow_attrib(u32 attrib, int usermode)
 	/* Make sure host can always access this memory. */
 	attrib |= PPC44x_TLB_SX|PPC44x_TLB_SR|PPC44x_TLB_SW;
 
+	/* WIMGE = 0b00100 */
+	attrib |= PPC44x_TLB_M;
+
 	return attrib;
 }
 

From fe4e771d5c37f0949047faf95d16a512b21406bf Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Mon, 10 Nov 2008 14:57:36 -0600
Subject: [PATCH 060/140] KVM: ppc: fix userspace mapping invalidation on
 context switch

We used to defer invalidating userspace TLB entries until jumping out of the
kernel. This was causing MMU weirdness most easily triggered by using a pipe in
the guest, e.g. "dmesg | tail". I believe the problem was that after the guest
kernel changed the PID (part of context switch), the old process's mappings
were still present, and so copy_to_user() on the "return to new process" path
ended up using stale mappings.

Testing with large pages (64K) exposed the problem, probably because with 4K
pages, pressure on the TLB faulted all process A's mappings out before the
guest kernel could insert any for process B.

Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/powerpc/include/asm/kvm_44x.h |  2 ++
 arch/powerpc/kvm/44x_emulate.c     |  9 +--------
 arch/powerpc/kvm/44x_tlb.c         | 31 ++++++++++++++++--------------
 3 files changed, 20 insertions(+), 22 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_44x.h b/arch/powerpc/include/asm/kvm_44x.h
index dece09350712..72e593914adb 100644
--- a/arch/powerpc/include/asm/kvm_44x.h
+++ b/arch/powerpc/include/asm/kvm_44x.h
@@ -44,4 +44,6 @@ static inline struct kvmppc_vcpu_44x *to_44x(struct kvm_vcpu *vcpu)
 	return container_of(vcpu, struct kvmppc_vcpu_44x, vcpu);
 }
 
+void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 new_pid);
+
 #endif /* __ASM_44X_H__ */
diff --git a/arch/powerpc/kvm/44x_emulate.c b/arch/powerpc/kvm/44x_emulate.c
index 9bc50cebf9ec..9ef79c78ede9 100644
--- a/arch/powerpc/kvm/44x_emulate.c
+++ b/arch/powerpc/kvm/44x_emulate.c
@@ -21,6 +21,7 @@
 #include <asm/dcr.h>
 #include <asm/dcr-regs.h>
 #include <asm/disassemble.h>
+#include <asm/kvm_44x.h>
 
 #include "booke.h"
 #include "44x_tlb.h"
@@ -38,14 +39,6 @@
 #define XOP_ICCCI   966
 #define XOP_TLBWE   978
 
-static inline void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 new_pid)
-{
-	if (vcpu->arch.pid != new_pid) {
-		vcpu->arch.pid = new_pid;
-		vcpu->arch.swap_pid = 1;
-	}
-}
-
 static void kvmppc_emul_rfi(struct kvm_vcpu *vcpu)
 {
 	vcpu->arch.pc = vcpu->arch.srr0;
diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index 6fadbd696021..ee2461860bcf 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -268,31 +268,34 @@ static void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
 	}
 }
 
-/* Invalidate all mappings on the privilege switch after PID has been changed.
- * The guest always runs with PID=1, so we must clear the entire TLB when
- * switching address spaces. */
 void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode)
+{
+	vcpu->arch.shadow_pid = !usermode;
+}
+
+void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 new_pid)
 {
 	struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
 	int i;
 
-	if (vcpu->arch.swap_pid) {
-		/* XXX Replace loop with fancy data structures. */
-		for (i = 0; i <= tlb_44x_hwater; i++) {
-			struct kvmppc_44x_tlbe *stlbe = &vcpu_44x->shadow_tlb[i];
+	if (unlikely(vcpu->arch.pid == new_pid))
+		return;
 
-			/* Future optimization: clear only userspace mappings. */
+	vcpu->arch.pid = new_pid;
+
+	/* Guest userspace runs with TID=0 mappings and PID=0, to make sure it
+	 * can't access guest kernel mappings (TID=1). When we switch to a new
+	 * guest PID, which will also use host PID=0, we must discard the old guest
+	 * userspace mappings. */
+	for (i = 0; i < ARRAY_SIZE(vcpu_44x->shadow_tlb); i++) {
+		struct kvmppc_44x_tlbe *stlbe = &vcpu_44x->shadow_tlb[i];
+
+		if (get_tlb_tid(stlbe) == 0) {
 			kvmppc_44x_shadow_release(vcpu, i);
 			stlbe->word0 = 0;
 			kvmppc_tlbe_set_modified(vcpu, i);
-			KVMTRACE_5D(STLB_INVAL, vcpu, i,
-			            stlbe->tid, stlbe->word0, stlbe->word1,
-			            stlbe->word2, handler);
 		}
-		vcpu->arch.swap_pid = 0;
 	}
-
-	vcpu->arch.shadow_pid = !usermode;
 }
 
 static int tlbe_is_host_safe(const struct kvm_vcpu *vcpu,

From 13673a90f1cf88296f726265cc7cf3ec76ecba30 Mon Sep 17 00:00:00 2001
From: Eduardo Habkost <ehabkost@redhat.com>
Date: Mon, 17 Nov 2008 19:03:13 -0200
Subject: [PATCH 061/140] KVM: VMX: move vmx.h to include/asm

vmx.h will be used by core code that is independent of KVM, so I am
moving it outside the arch/x86/kvm directory.

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/{kvm => include/asm}/vmx.h | 0
 arch/x86/kvm/mmu.c                  | 2 +-
 arch/x86/kvm/vmx.c                  | 2 +-
 3 files changed, 2 insertions(+), 2 deletions(-)
 rename arch/x86/{kvm => include/asm}/vmx.h (100%)

diff --git a/arch/x86/kvm/vmx.h b/arch/x86/include/asm/vmx.h
similarity index 100%
rename from arch/x86/kvm/vmx.h
rename to arch/x86/include/asm/vmx.h
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8904e8ada978..fa3486d64078 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -17,7 +17,6 @@
  *
  */
 
-#include "vmx.h"
 #include "mmu.h"
 
 #include <linux/kvm_host.h>
@@ -33,6 +32,7 @@
 #include <asm/page.h>
 #include <asm/cmpxchg.h>
 #include <asm/io.h>
+#include <asm/vmx.h>
 
 /*
  * When setting this variable to true it enables Two-Dimensional-Paging
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 427dbc14fae9..ec71f6464cfa 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -16,7 +16,6 @@
  */
 
 #include "irq.h"
-#include "vmx.h"
 #include "mmu.h"
 
 #include <linux/kvm_host.h>
@@ -31,6 +30,7 @@
 
 #include <asm/io.h>
 #include <asm/desc.h>
+#include <asm/vmx.h>
 
 #define __ex(x) __kvm_handle_fault_on_reboot(x)
 

From c2cedf7be2017e3264c93a4c0d75b1d96d0d7104 Mon Sep 17 00:00:00 2001
From: Eduardo Habkost <ehabkost@redhat.com>
Date: Mon, 17 Nov 2008 19:03:14 -0200
Subject: [PATCH 062/140] KVM: SVM: move svm.h to include/asm

svm.h will be used by core code that is independent of KVM, so I am
moving it outside the arch/x86/kvm directory.

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/{kvm => include/asm}/svm.h | 0
 arch/x86/kvm/kvm_svm.h              | 2 +-
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename arch/x86/{kvm => include/asm}/svm.h (100%)

diff --git a/arch/x86/kvm/svm.h b/arch/x86/include/asm/svm.h
similarity index 100%
rename from arch/x86/kvm/svm.h
rename to arch/x86/include/asm/svm.h
diff --git a/arch/x86/kvm/kvm_svm.h b/arch/x86/kvm/kvm_svm.h
index 65ef0fc2c036..8e5ee99551f6 100644
--- a/arch/x86/kvm/kvm_svm.h
+++ b/arch/x86/kvm/kvm_svm.h
@@ -7,7 +7,7 @@
 #include <linux/kvm_host.h>
 #include <asm/msr.h>
 
-#include "svm.h"
+#include <asm/svm.h>
 
 static const u32 host_save_user_msrs[] = {
 #ifdef CONFIG_X86_64

From eca70fc5671b226966dfb7ee9953d59199288566 Mon Sep 17 00:00:00 2001
From: Eduardo Habkost <ehabkost@redhat.com>
Date: Mon, 17 Nov 2008 19:03:15 -0200
Subject: [PATCH 063/140] KVM: VMX: move ASM_VMX_* definitions from
 asm/kvm_host.h to asm/vmx.h

Those definitions will be used by code outside KVM, so move it outside
of a KVM-specific source file.

Those definitions are used only on kvm/vmx.c, that already includes
asm/vmx.h, so they can be moved safely.

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 12 ------------
 arch/x86/include/asm/vmx.h      | 15 +++++++++++++++
 2 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 99e3cc149d21..f58f7ebdea81 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -714,18 +714,6 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
 	kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
 }
 
-#define ASM_VMX_VMCLEAR_RAX       ".byte 0x66, 0x0f, 0xc7, 0x30"
-#define ASM_VMX_VMLAUNCH          ".byte 0x0f, 0x01, 0xc2"
-#define ASM_VMX_VMRESUME          ".byte 0x0f, 0x01, 0xc3"
-#define ASM_VMX_VMPTRLD_RAX       ".byte 0x0f, 0xc7, 0x30"
-#define ASM_VMX_VMREAD_RDX_RAX    ".byte 0x0f, 0x78, 0xd0"
-#define ASM_VMX_VMWRITE_RAX_RDX   ".byte 0x0f, 0x79, 0xd0"
-#define ASM_VMX_VMWRITE_RSP_RDX   ".byte 0x0f, 0x79, 0xd4"
-#define ASM_VMX_VMXOFF            ".byte 0x0f, 0x01, 0xc4"
-#define ASM_VMX_VMXON_RAX         ".byte 0xf3, 0x0f, 0xc7, 0x30"
-#define ASM_VMX_INVEPT		  ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
-#define ASM_VMX_INVVPID		  ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
-
 #define MSR_IA32_TIME_STAMP_COUNTER		0x010
 
 #define TSS_IOPB_BASE_OFFSET 0x66
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 3db236c26fa4..d0238e6151d8 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -364,4 +364,19 @@ enum vmcs_field {
 
 #define VMX_EPT_IDENTITY_PAGETABLE_ADDR		0xfffbc000ul
 
+
+#define ASM_VMX_VMCLEAR_RAX       ".byte 0x66, 0x0f, 0xc7, 0x30"
+#define ASM_VMX_VMLAUNCH          ".byte 0x0f, 0x01, 0xc2"
+#define ASM_VMX_VMRESUME          ".byte 0x0f, 0x01, 0xc3"
+#define ASM_VMX_VMPTRLD_RAX       ".byte 0x0f, 0xc7, 0x30"
+#define ASM_VMX_VMREAD_RDX_RAX    ".byte 0x0f, 0x78, 0xd0"
+#define ASM_VMX_VMWRITE_RAX_RDX   ".byte 0x0f, 0x79, 0xd0"
+#define ASM_VMX_VMWRITE_RSP_RDX   ".byte 0x0f, 0x79, 0xd4"
+#define ASM_VMX_VMXOFF            ".byte 0x0f, 0x01, 0xc4"
+#define ASM_VMX_VMXON_RAX         ".byte 0xf3, 0x0f, 0xc7, 0x30"
+#define ASM_VMX_INVEPT		  ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
+#define ASM_VMX_INVVPID		  ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
+
+
+
 #endif

From 6210e37b122583643da335c0389f74098713e5ca Mon Sep 17 00:00:00 2001
From: Eduardo Habkost <ehabkost@redhat.com>
Date: Mon, 17 Nov 2008 19:03:16 -0200
Subject: [PATCH 064/140] KVM: VMX: move cpu_has_kvm_support() to an inline on
 asm/virtext.h

It will be used by core code on kdump and reboot, to disable
vmx if needed.

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/virtext.h | 31 +++++++++++++++++++++++++++++++
 arch/x86/kvm/vmx.c             |  4 ++--
 2 files changed, 33 insertions(+), 2 deletions(-)
 create mode 100644 arch/x86/include/asm/virtext.h

diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
new file mode 100644
index 000000000000..298b6a06110d
--- /dev/null
+++ b/arch/x86/include/asm/virtext.h
@@ -0,0 +1,31 @@
+/* CPU virtualization extensions handling
+ *
+ * This should carry the code for handling CPU virtualization extensions
+ * that needs to live in the kernel core.
+ *
+ * Author: Eduardo Habkost <ehabkost@redhat.com>
+ *
+ * Copyright (C) 2008, Red Hat Inc.
+ *
+ * Contains code from KVM, Copyright (C) 2006 Qumranet, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ */
+#ifndef _ASM_X86_VIRTEX_H
+#define _ASM_X86_VIRTEX_H
+
+#include <asm/processor.h>
+#include <asm/system.h>
+
+/*
+ * VMX functions:
+ */
+
+static inline int cpu_has_vmx(void)
+{
+	unsigned long ecx = cpuid_ecx(1);
+	return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
+}
+
+#endif /* _ASM_X86_VIRTEX_H */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ec71f6464cfa..defaeeb3f75c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -31,6 +31,7 @@
 #include <asm/io.h>
 #include <asm/desc.h>
 #include <asm/vmx.h>
+#include <asm/virtext.h>
 
 #define __ex(x) __kvm_handle_fault_on_reboot(x)
 
@@ -1044,8 +1045,7 @@ static int vmx_get_irq(struct kvm_vcpu *vcpu)
 
 static __init int cpu_has_kvm_support(void)
 {
-	unsigned long ecx = cpuid_ecx(1);
-	return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
+	return cpu_has_vmx();
 }
 
 static __init int vmx_disabled_by_bios(void)

From 1e9931146c748420343aeefadb3bb17bd1c14a37 Mon Sep 17 00:00:00 2001
From: Eduardo Habkost <ehabkost@redhat.com>
Date: Mon, 17 Nov 2008 19:03:17 -0200
Subject: [PATCH 065/140] x86: asm/virtext.h: add cpu_vmxoff() inline function

Unfortunately we can't use exactly the same code from vmx
hardware_disable(), because the KVM function uses the
__kvm_handle_fault_on_reboot() tricks.

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/virtext.h | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
index 298b6a06110d..7dee5b59930e 100644
--- a/arch/x86/include/asm/virtext.h
+++ b/arch/x86/include/asm/virtext.h
@@ -18,6 +18,8 @@
 #include <asm/processor.h>
 #include <asm/system.h>
 
+#include <asm/vmx.h>
+
 /*
  * VMX functions:
  */
@@ -28,4 +30,17 @@ static inline int cpu_has_vmx(void)
 	return test_bit(5, &ecx); /* CPUID.1:ECX.VMX[bit 5] -> VT */
 }
 
+
+/** Disable VMX on the current CPU
+ *
+ * vmxoff causes a undefined-opcode exception if vmxon was not run
+ * on the CPU previously. Only call this function if you know VMX
+ * is enabled.
+ */
+static inline void cpu_vmxoff(void)
+{
+	asm volatile (ASM_VMX_VMXOFF : : : "cc");
+	write_cr4(read_cr4() & ~X86_CR4_VMXE);
+}
+
 #endif /* _ASM_X86_VIRTEX_H */

From 710ff4a855d0f3bf74b5b4a20328e2858a8a2968 Mon Sep 17 00:00:00 2001
From: Eduardo Habkost <ehabkost@redhat.com>
Date: Mon, 17 Nov 2008 19:03:18 -0200
Subject: [PATCH 066/140] KVM: VMX: extract kvm_cpu_vmxoff() from
 hardware_disable()

Along with some comments on why it is different from the core cpu_vmxoff()
function.

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index defaeeb3f75c..f5958a7823f4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1091,11 +1091,20 @@ static void vmclear_local_vcpus(void)
 		__vcpu_clear(vmx);
 }
 
+
+/* Just like cpu_vmxoff(), but with the __kvm_handle_fault_on_reboot()
+ * tricks.
+ */
+static void kvm_cpu_vmxoff(void)
+{
+	asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
+	write_cr4(read_cr4() & ~X86_CR4_VMXE);
+}
+
 static void hardware_disable(void *garbage)
 {
 	vmclear_local_vcpus();
-	asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
-	write_cr4(read_cr4() & ~X86_CR4_VMXE);
+	kvm_cpu_vmxoff();
 }
 
 static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,

From 6aa07a0d77f6aafbe69e4e8609ffaf2b7ee1b591 Mon Sep 17 00:00:00 2001
From: Eduardo Habkost <ehabkost@redhat.com>
Date: Mon, 17 Nov 2008 19:03:19 -0200
Subject: [PATCH 067/140] x86: cpu_emergency_vmxoff() function

Add cpu_emergency_vmxoff() and its friends: cpu_vmx_enabled() and
__cpu_emergency_vmxoff().

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/virtext.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
index 7dee5b59930e..6bcf0acb4ef1 100644
--- a/arch/x86/include/asm/virtext.h
+++ b/arch/x86/include/asm/virtext.h
@@ -43,4 +43,27 @@ static inline void cpu_vmxoff(void)
 	write_cr4(read_cr4() & ~X86_CR4_VMXE);
 }
 
+static inline int cpu_vmx_enabled(void)
+{
+	return read_cr4() & X86_CR4_VMXE;
+}
+
+/** Disable VMX if it is enabled on the current CPU
+ *
+ * You shouldn't call this if cpu_has_vmx() returns 0.
+ */
+static inline void __cpu_emergency_vmxoff(void)
+{
+	if (cpu_vmx_enabled())
+		cpu_vmxoff();
+}
+
+/** Disable VMX if it is supported and enabled on the current CPU
+ */
+static inline void cpu_emergency_vmxoff(void)
+{
+	if (cpu_has_vmx())
+		__cpu_emergency_vmxoff();
+}
+
 #endif /* _ASM_X86_VIRTEX_H */

From 63d1142f8f69e39468bc6079ab2239e902828134 Mon Sep 17 00:00:00 2001
From: Eduardo Habkost <ehabkost@redhat.com>
Date: Mon, 17 Nov 2008 19:03:20 -0200
Subject: [PATCH 068/140] KVM: SVM: move has_svm() code to asm/virtext.h

Use a trick to keep the printk()s on has_svm() working as before. gcc
will take care of not generating code for the 'msg' stuff when the
function is called with a NULL msg argument.

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/virtext.h | 41 ++++++++++++++++++++++++++++++++++
 arch/x86/kvm/svm.c             | 19 +++++-----------
 2 files changed, 46 insertions(+), 14 deletions(-)

diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
index 6bcf0acb4ef1..6f0d409c3682 100644
--- a/arch/x86/include/asm/virtext.h
+++ b/arch/x86/include/asm/virtext.h
@@ -19,6 +19,7 @@
 #include <asm/system.h>
 
 #include <asm/vmx.h>
+#include <asm/svm.h>
 
 /*
  * VMX functions:
@@ -66,4 +67,44 @@ static inline void cpu_emergency_vmxoff(void)
 		__cpu_emergency_vmxoff();
 }
 
+
+
+
+/*
+ * SVM functions:
+ */
+
+/** Check if the CPU has SVM support
+ *
+ * You can use the 'msg' arg to get a message describing the problem,
+ * if the function returns zero. Simply pass NULL if you are not interested
+ * on the messages; gcc should take care of not generating code for
+ * the messages on this case.
+ */
+static inline int cpu_has_svm(const char **msg)
+{
+	uint32_t eax, ebx, ecx, edx;
+
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
+		if (msg)
+			*msg = "not amd";
+		return 0;
+	}
+
+	cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
+	if (eax < SVM_CPUID_FUNC) {
+		if (msg)
+			*msg = "can't execute cpuid_8000000a";
+		return 0;
+	}
+
+	cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
+	if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {
+		if (msg)
+			*msg = "svm not available";
+		return 0;
+	}
+	return 1;
+}
+
 #endif /* _ASM_X86_VIRTEX_H */
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f0ad4d4217e4..0667c6d13d30 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -28,6 +28,8 @@
 
 #include <asm/desc.h>
 
+#include <asm/virtext.h>
+
 #define __ex(x) __kvm_handle_fault_on_reboot(x)
 
 MODULE_AUTHOR("Qumranet");
@@ -245,24 +247,13 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 
 static int has_svm(void)
 {
-	uint32_t eax, ebx, ecx, edx;
+	const char *msg;
 
-	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
-		printk(KERN_INFO "has_svm: not amd\n");
+	if (!cpu_has_svm(&msg)) {
+		printk(KERN_INFO "has_svn: %s\n", msg);
 		return 0;
 	}
 
-	cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
-	if (eax < SVM_CPUID_FUNC) {
-		printk(KERN_INFO "has_svm: can't execute cpuid_8000000a\n");
-		return 0;
-	}
-
-	cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
-	if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {
-		printk(KERN_DEBUG "has_svm: svm not available\n");
-		return 0;
-	}
 	return 1;
 }
 

From 2c8dceebb238680d5577500f8283397d41ca5590 Mon Sep 17 00:00:00 2001
From: Eduardo Habkost <ehabkost@redhat.com>
Date: Mon, 17 Nov 2008 19:03:21 -0200
Subject: [PATCH 069/140] KVM: SVM: move svm_hardware_disable() code to
 asm/virtext.h

Create cpu_svm_disable() function.

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/virtext.h | 14 ++++++++++++++
 arch/x86/kvm/svm.c             |  6 +-----
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
index 6f0d409c3682..2cfe363729c3 100644
--- a/arch/x86/include/asm/virtext.h
+++ b/arch/x86/include/asm/virtext.h
@@ -107,4 +107,18 @@ static inline int cpu_has_svm(const char **msg)
 	return 1;
 }
 
+
+/** Disable SVM on the current CPU
+ *
+ * You should call this only if cpu_has_svm() returned true.
+ */
+static inline void cpu_svm_disable(void)
+{
+	uint64_t efer;
+
+	wrmsrl(MSR_VM_HSAVE_PA, 0);
+	rdmsrl(MSR_EFER, efer);
+	wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
+}
+
 #endif /* _ASM_X86_VIRTEX_H */
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 0667c6d13d30..1452851ae258 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -259,11 +259,7 @@ static int has_svm(void)
 
 static void svm_hardware_disable(void *garbage)
 {
-	uint64_t efer;
-
-	wrmsrl(MSR_VM_HSAVE_PA, 0);
-	rdmsrl(MSR_EFER, efer);
-	wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
+	cpu_svm_disable();
 }
 
 static void svm_hardware_enable(void *garbage)

From 0f3e9eeba0ea212bbea88790729d054b700ab91e Mon Sep 17 00:00:00 2001
From: Eduardo Habkost <ehabkost@redhat.com>
Date: Mon, 17 Nov 2008 19:03:22 -0200
Subject: [PATCH 070/140] x86: cpu_emergency_svm_disable() function

This function can be used by the reboot or kdump code to forcibly
disable SVM on the CPU.

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/virtext.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
index 2cfe363729c3..593636275238 100644
--- a/arch/x86/include/asm/virtext.h
+++ b/arch/x86/include/asm/virtext.h
@@ -121,4 +121,12 @@ static inline void cpu_svm_disable(void)
 	wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
 }
 
+/** Makes sure SVM is disabled, if it is supported on the CPU
+ */
+static inline void cpu_emergency_svm_disable(void)
+{
+	if (cpu_has_svm(NULL))
+		cpu_svm_disable();
+}
+
 #endif /* _ASM_X86_VIRTEX_H */

From 2340b62f77c782c305e6ae7748675a638436d1ef Mon Sep 17 00:00:00 2001
From: Eduardo Habkost <ehabkost@redhat.com>
Date: Mon, 17 Nov 2008 19:03:23 -0200
Subject: [PATCH 071/140] kdump: forcibly disable VMX and SVM on
 machine_crash_shutdown()

We need to disable virtualization extensions on all CPUs before booting
the kdump kernel, otherwise the kdump kernel booting will fail, and
rebooting after the kdump kernel did its task may also fail.

We do it using cpu_emergency_vmxoff() and cpu_emergency_svm_disable(),
that should always work, because those functions check if the CPUs
support SVM or VMX before doing their tasks.

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kernel/crash.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index d84a852e4cd7..c689d19e35ab 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -26,6 +26,7 @@
 #include <linux/kdebug.h>
 #include <asm/smp.h>
 #include <asm/reboot.h>
+#include <asm/virtext.h>
 
 #include <mach_ipi.h>
 
@@ -49,6 +50,15 @@ static void kdump_nmi_callback(int cpu, struct die_args *args)
 #endif
 	crash_save_cpu(regs, cpu);
 
+	/* Disable VMX or SVM if needed.
+	 *
+	 * We need to disable virtualization on all CPUs.
+	 * Having VMX or SVM enabled on any CPU may break rebooting
+	 * after the kdump kernel has finished its task.
+	 */
+	cpu_emergency_vmxoff();
+	cpu_emergency_svm_disable();
+
 	disable_local_APIC();
 }
 
@@ -80,6 +90,14 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
 	local_irq_disable();
 
 	kdump_nmi_shootdown_cpus();
+
+	/* Booting kdump kernel with VMX or SVM enabled won't work,
+	 * because (among other limitations) we can't disable paging
+	 * with the virt flags.
+	 */
+	cpu_emergency_vmxoff();
+	cpu_emergency_svm_disable();
+
 	lapic_shutdown();
 #if defined(CONFIG_X86_IO_APIC)
 	disable_IO_APIC();

From d176720d34c72f7a8474a12204add93e54fe3ef1 Mon Sep 17 00:00:00 2001
From: Eduardo Habkost <ehabkost@redhat.com>
Date: Mon, 17 Nov 2008 19:03:24 -0200
Subject: [PATCH 072/140] x86: disable VMX on all CPUs on reboot

On emergency_restart, we may need to use an NMI to disable virtualization
on all CPUs. We do that using nmi_shootdown_cpus() if VMX is enabled.

Note: With this patch, we will run the NMI stuff only when the CPU where
emergency_restart() was called has VMX enabled. This should work on most
cases because KVM enables VMX on all CPUs, but we may miss the small
window where KVM is doing that. Also, I don't know if all code using
VMX out there always enable VMX on all CPUs like KVM does. We have two
other alternatives for that:

a) Have an API that all code that enables VMX on any CPU should use
   to tell the kernel core that it is going to enable VMX on the CPUs.
b) Always call nmi_shootdown_cpus() if the CPU supports VMX. This is
   a bit intrusive and more risky, as it would run nmi_shootdown_cpus()
   on emergency_reboot() even on systems where virtualization is never
   enabled.

Finding a proper point to hook the nmi_shootdown_cpus() call isn't
trivial, as the non-emergency machine_restart() (that doesn't need the
NMI tricks) uses machine_emergency_restart() directly.

The solution to make this work without adding a new function or argument
to machine_ops was setting a 'reboot_emergency' flag that tells if
native_machine_emergency_restart() needs to do the virt cleanup or not.

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kernel/reboot.c | 62 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 60 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 61f718df6eec..72e0e4e712d6 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -12,6 +12,7 @@
 #include <asm/proto.h>
 #include <asm/reboot_fixups.h>
 #include <asm/reboot.h>
+#include <asm/virtext.h>
 
 #ifdef CONFIG_X86_32
 # include <linux/dmi.h>
@@ -39,6 +40,12 @@ int reboot_force;
 static int reboot_cpu = -1;
 #endif
 
+/* This is set if we need to go through the 'emergency' path.
+ * When machine_emergency_restart() is called, we may be on
+ * an inconsistent state and won't be able to do a clean cleanup
+ */
+static int reboot_emergency;
+
 /* This is set by the PCI code if either type 1 or type 2 PCI is detected */
 bool port_cf9_safe = false;
 
@@ -368,6 +375,48 @@ static inline void kb_wait(void)
 	}
 }
 
+static void vmxoff_nmi(int cpu, struct die_args *args)
+{
+	cpu_emergency_vmxoff();
+}
+
+/* Use NMIs as IPIs to tell all CPUs to disable virtualization
+ */
+static void emergency_vmx_disable_all(void)
+{
+	/* Just make sure we won't change CPUs while doing this */
+	local_irq_disable();
+
+	/* We need to disable VMX on all CPUs before rebooting, otherwise
+	 * we risk hanging up the machine, because the CPU ignore INIT
+	 * signals when VMX is enabled.
+	 *
+	 * We can't take any locks and we may be on an inconsistent
+	 * state, so we use NMIs as IPIs to tell the other CPUs to disable
+	 * VMX and halt.
+	 *
+	 * For safety, we will avoid running the nmi_shootdown_cpus()
+	 * stuff unnecessarily, but we don't have a way to check
+	 * if other CPUs have VMX enabled. So we will call it only if the
+	 * CPU we are running on has VMX enabled.
+	 *
+	 * We will miss cases where VMX is not enabled on all CPUs. This
+	 * shouldn't do much harm because KVM always enable VMX on all
+	 * CPUs anyway. But we can miss it on the small window where KVM
+	 * is still enabling VMX.
+	 */
+	if (cpu_has_vmx() && cpu_vmx_enabled()) {
+		/* Disable VMX on this CPU.
+		 */
+		cpu_vmxoff();
+
+		/* Halt and disable VMX on the other CPUs */
+		nmi_shootdown_cpus(vmxoff_nmi);
+
+	}
+}
+
+
 void __attribute__((weak)) mach_reboot_fixups(void)
 {
 }
@@ -376,6 +425,9 @@ static void native_machine_emergency_restart(void)
 {
 	int i;
 
+	if (reboot_emergency)
+		emergency_vmx_disable_all();
+
 	/* Tell the BIOS if we want cold or warm reboot */
 	*((unsigned short *)__va(0x472)) = reboot_mode;
 
@@ -482,13 +534,19 @@ void native_machine_shutdown(void)
 #endif
 }
 
+static void __machine_emergency_restart(int emergency)
+{
+	reboot_emergency = emergency;
+	machine_ops.emergency_restart();
+}
+
 static void native_machine_restart(char *__unused)
 {
 	printk("machine restart\n");
 
 	if (!reboot_force)
 		machine_shutdown();
-	machine_emergency_restart();
+	__machine_emergency_restart(0);
 }
 
 static void native_machine_halt(void)
@@ -532,7 +590,7 @@ void machine_shutdown(void)
 
 void machine_emergency_restart(void)
 {
-	machine_ops.emergency_restart();
+	__machine_emergency_restart(1);
 }
 
 void machine_restart(char *cmd)

From 7d637978151511148912fe2ea2bac9f9c64f5c35 Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Fri, 21 Nov 2008 20:58:11 +0800
Subject: [PATCH 073/140] KVM: ia64: Define printk function for kvm-intel
 module

kvm-intel module is relocated to an isolated address space
with kernel, so it can't call host kernel's printk for debug
purpose. In the module, we implement the printk to output debug
info of vmm.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/include/asm/kvm_host.h |  4 ++++
 arch/ia64/kvm/Makefile           |  2 +-
 arch/ia64/kvm/kvm-ia64.c         |  8 ++++++++
 arch/ia64/kvm/kvm_lib.c          | 15 +++++++++++++++
 arch/ia64/kvm/vmm.c              | 26 ++++++++++++++++++++++++++
 5 files changed, 54 insertions(+), 1 deletion(-)
 create mode 100644 arch/ia64/kvm/kvm_lib.c

diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index 678e2646a500..0560f3fae538 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -39,6 +39,7 @@
 #define EXIT_REASON_EXTERNAL_INTERRUPT	6
 #define EXIT_REASON_IPI			7
 #define EXIT_REASON_PTC_G		8
+#define EXIT_REASON_DEBUG		20
 
 /*Define vmm address space and vm data space.*/
 #define KVM_VMM_SIZE (__IA64_UL_CONST(16)<<20)
@@ -126,6 +127,8 @@
 			KVM_MEM_DIRTY_LOG_SIZE) / sizeof(struct kvm_vcpu_data)
 #define KVM_MAX_MEM_SIZE (KVM_P2M_SIZE >> 3 << PAGE_SHIFT)
 
+#define VMM_LOG_LEN 256
+
 #include <linux/types.h>
 #include <linux/mm.h>
 #include <linux/kvm.h>
@@ -437,6 +440,7 @@ struct kvm_vcpu_arch {
 
 	unsigned long opcode;
 	unsigned long cause;
+	char log_buf[VMM_LOG_LEN];
 	union context host;
 	union context guest;
 };
diff --git a/arch/ia64/kvm/Makefile b/arch/ia64/kvm/Makefile
index 92cef66ca268..76464dc312e6 100644
--- a/arch/ia64/kvm/Makefile
+++ b/arch/ia64/kvm/Makefile
@@ -60,7 +60,7 @@ obj-$(CONFIG_KVM) += kvm.o
 
 CFLAGS_vcpu.o += -mfixed-range=f2-f5,f12-f127
 kvm-intel-objs = vmm.o vmm_ivt.o trampoline.o vcpu.o optvfault.o mmio.o \
-	vtlb.o process.o
+	vtlb.o process.o kvm_lib.o
 #Add link memcpy and memset to avoid possible structure assignment error
 kvm-intel-objs += memcpy.o memset.o
 obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 70eb829767f4..b4d24e2cce40 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -474,6 +474,13 @@ static int handle_external_interrupt(struct kvm_vcpu *vcpu,
 	return 1;
 }
 
+static int handle_vcpu_debug(struct kvm_vcpu *vcpu,
+				struct kvm_run *kvm_run)
+{
+	printk("VMM: %s", vcpu->arch.log_buf);
+	return 1;
+}
+
 static int (*kvm_vti_exit_handlers[])(struct kvm_vcpu *vcpu,
 		struct kvm_run *kvm_run) = {
 	[EXIT_REASON_VM_PANIC]              = handle_vm_error,
@@ -485,6 +492,7 @@ static int (*kvm_vti_exit_handlers[])(struct kvm_vcpu *vcpu,
 	[EXIT_REASON_EXTERNAL_INTERRUPT]    = handle_external_interrupt,
 	[EXIT_REASON_IPI]		    = handle_ipi,
 	[EXIT_REASON_PTC_G]		    = handle_global_purge,
+	[EXIT_REASON_DEBUG]		    = handle_vcpu_debug,
 
 };
 
diff --git a/arch/ia64/kvm/kvm_lib.c b/arch/ia64/kvm/kvm_lib.c
new file mode 100644
index 000000000000..a85cb611ecd7
--- /dev/null
+++ b/arch/ia64/kvm/kvm_lib.c
@@ -0,0 +1,15 @@
+/*
+ * kvm_lib.c: Compile some libraries for kvm-intel module.
+ *
+ *	Just include kernel's library, and disable symbols export.
+ * 	Copyright (C) 2008, Intel Corporation.
+ *  	Xiantao Zhang  (xiantao.zhang@intel.com)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+#undef CONFIG_MODULES
+#include "../../../lib/vsprintf.c"
+#include "../../../lib/ctype.c"
diff --git a/arch/ia64/kvm/vmm.c b/arch/ia64/kvm/vmm.c
index 2275bf4e681a..957779593c2f 100644
--- a/arch/ia64/kvm/vmm.c
+++ b/arch/ia64/kvm/vmm.c
@@ -62,5 +62,31 @@ void vmm_spin_unlock(spinlock_t *lock)
 {
 	_vmm_raw_spin_unlock(lock);
 }
+
+static void vcpu_debug_exit(struct kvm_vcpu *vcpu)
+{
+	struct exit_ctl_data *p = &vcpu->arch.exit_data;
+	long psr;
+
+	local_irq_save(psr);
+	p->exit_reason = EXIT_REASON_DEBUG;
+	vmm_transition(vcpu);
+	local_irq_restore(psr);
+}
+
+asmlinkage int printk(const char *fmt, ...)
+{
+	struct kvm_vcpu *vcpu = current_vcpu;
+	va_list args;
+	int r;
+
+	memset(vcpu->arch.log_buf, 0, VMM_LOG_LEN);
+	va_start(args, fmt);
+	r = vsnprintf(vcpu->arch.log_buf, VMM_LOG_LEN, fmt, args);
+	va_end(args);
+	vcpu_debug_exit(vcpu);
+	return r;
+}
+
 module_init(kvm_vmm_init)
 module_exit(kvm_vmm_exit)

From 5e2be19832ccf93bf731a1758ec9fabf48414584 Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Fri, 21 Nov 2008 10:46:12 +0800
Subject: [PATCH 074/140] KVM: ia64: Add some debug points to provide crash
 infomation

Use printk infrastructure to print out some debug info once VM crashes.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/mmio.c    | 38 +++++++++--------------
 arch/ia64/kvm/process.c |  9 +++---
 arch/ia64/kvm/vcpu.c    | 69 ++++++++++++++++++++++++++++++++++++++---
 arch/ia64/kvm/vcpu.h    |  2 +-
 arch/ia64/kvm/vmm.c     |  1 +
 5 files changed, 87 insertions(+), 32 deletions(-)

diff --git a/arch/ia64/kvm/mmio.c b/arch/ia64/kvm/mmio.c
index 7f1a858bc69f..21f63fffc379 100644
--- a/arch/ia64/kvm/mmio.c
+++ b/arch/ia64/kvm/mmio.c
@@ -66,31 +66,25 @@ void lsapic_write(struct kvm_vcpu *v, unsigned long addr,
 
 	switch (addr) {
 	case PIB_OFST_INTA:
-		/*panic_domain(NULL, "Undefined write on PIB INTA\n");*/
-		panic_vm(v);
+		panic_vm(v, "Undefined write on PIB INTA\n");
 		break;
 	case PIB_OFST_XTP:
 		if (length == 1) {
 			vlsapic_write_xtp(v, val);
 		} else {
-			/*panic_domain(NULL,
-			"Undefined write on PIB XTP\n");*/
-			panic_vm(v);
+			panic_vm(v, "Undefined write on PIB XTP\n");
 		}
 		break;
 	default:
 		if (PIB_LOW_HALF(addr)) {
-			/*lower half */
+			/*Lower half */
 			if (length != 8)
-				/*panic_domain(NULL,
-				"Can't LHF write with size %ld!\n",
-				length);*/
-				panic_vm(v);
+				panic_vm(v, "Can't LHF write with size %ld!\n",
+						length);
 			else
 				vlsapic_write_ipi(v, addr, val);
-		} else {   /*	upper half
-				printk("IPI-UHF write %lx\n",addr);*/
-			panic_vm(v);
+		} else {   /*Upper half */
+			panic_vm(v, "IPI-UHF write %lx\n", addr);
 		}
 		break;
 	}
@@ -108,22 +102,18 @@ unsigned long lsapic_read(struct kvm_vcpu *v, unsigned long addr,
 		if (length == 1) /* 1 byte load */
 			; /* There is no i8259, there is no INTA access*/
 		else
-			/*panic_domain(NULL,"Undefined read on PIB INTA\n"); */
-			panic_vm(v);
+			panic_vm(v, "Undefined read on PIB INTA\n");
 
 		break;
 	case PIB_OFST_XTP:
 		if (length == 1) {
 			result = VLSAPIC_XTP(v);
-			/* printk("read xtp %lx\n", result); */
 		} else {
-			/*panic_domain(NULL,
-			"Undefined read on PIB XTP\n");*/
-			panic_vm(v);
+			panic_vm(v, "Undefined read on PIB XTP\n");
 		}
 		break;
 	default:
-		panic_vm(v);
+		panic_vm(v, "Undefined addr access for lsapic!\n");
 		break;
 	}
 	return result;
@@ -162,7 +152,7 @@ static void mmio_access(struct kvm_vcpu *vcpu, u64 src_pa, u64 *dest,
 			/* it's necessary to ensure zero extending */
 			*dest = p->u.ioreq.data & (~0UL >> (64-(s*8)));
 	} else
-		panic_vm(vcpu);
+		panic_vm(vcpu, "Unhandled mmio access returned!\n");
 out:
 	local_irq_restore(psr);
 	return ;
@@ -324,7 +314,9 @@ void emulate_io_inst(struct kvm_vcpu *vcpu, u64 padr, u64 ma)
 		return;
 	} else {
 		inst_type = -1;
-		panic_vm(vcpu);
+		panic_vm(vcpu, "Unsupported MMIO access instruction! \
+				Bunld[0]=0x%lx, Bundle[1]=0x%lx\n",
+				bundle.i64[0], bundle.i64[1]);
 	}
 
 	size = 1 << size;
@@ -335,7 +327,7 @@ void emulate_io_inst(struct kvm_vcpu *vcpu, u64 padr, u64 ma)
 		if (inst_type == SL_INTEGER)
 			vcpu_set_gr(vcpu, inst.M1.r1, data, 0);
 		else
-			panic_vm(vcpu);
+			panic_vm(vcpu, "Unsupported instruction type!\n");
 
 	}
 	vcpu_increment_iip(vcpu);
diff --git a/arch/ia64/kvm/process.c b/arch/ia64/kvm/process.c
index 800817307b7b..cefc349ce354 100644
--- a/arch/ia64/kvm/process.c
+++ b/arch/ia64/kvm/process.c
@@ -527,7 +527,8 @@ void reflect_interruption(u64 ifa, u64 isr, u64 iim,
 	vector = vec2off[vec];
 
 	if (!(vpsr & IA64_PSR_IC) && (vector != IA64_DATA_NESTED_TLB_VECTOR)) {
-		panic_vm(vcpu);
+		panic_vm(vcpu, "Interruption with vector :0x%lx occurs "
+						"with psr.ic = 0\n", vector);
 		return;
 	}
 
@@ -586,7 +587,7 @@ static void set_pal_call_result(struct kvm_vcpu *vcpu)
 		vcpu_set_gr(vcpu, 10, p->u.pal_data.ret.v1, 0);
 		vcpu_set_gr(vcpu, 11, p->u.pal_data.ret.v2, 0);
 	} else
-		panic_vm(vcpu);
+		panic_vm(vcpu, "Mis-set for exit reason!\n");
 }
 
 static void set_sal_call_data(struct kvm_vcpu *vcpu)
@@ -614,7 +615,7 @@ static void set_sal_call_result(struct kvm_vcpu *vcpu)
 		vcpu_set_gr(vcpu, 10, p->u.sal_data.ret.r10, 0);
 		vcpu_set_gr(vcpu, 11, p->u.sal_data.ret.r11, 0);
 	} else
-		panic_vm(vcpu);
+		panic_vm(vcpu, "Mis-set for exit reason!\n");
 }
 
 void  kvm_ia64_handle_break(unsigned long ifa, struct kvm_pt_regs *regs,
@@ -680,7 +681,7 @@ static void generate_exirq(struct kvm_vcpu *vcpu)
 	vpsr = VCPU(vcpu, vpsr);
 	isr = vpsr & IA64_PSR_RI;
 	if (!(vpsr & IA64_PSR_IC))
-		panic_vm(vcpu);
+		panic_vm(vcpu, "Trying to inject one IRQ with psr.ic=0\n");
 	reflect_interruption(0, isr, 0, 12, regs); /* EXT IRQ */
 }
 
diff --git a/arch/ia64/kvm/vcpu.c b/arch/ia64/kvm/vcpu.c
index a528d70a820c..ecd526b55323 100644
--- a/arch/ia64/kvm/vcpu.c
+++ b/arch/ia64/kvm/vcpu.c
@@ -1651,7 +1651,8 @@ void vcpu_set_psr(struct kvm_vcpu *vcpu, unsigned long val)
 	 * Otherwise panic
 	 */
 	if (val & (IA64_PSR_PK | IA64_PSR_IS | IA64_PSR_VM))
-		panic_vm(vcpu);
+		panic_vm(vcpu, "Only support guests with vpsr.pk =0 \
+				& vpsr.is=0\n");
 
 	/*
 	 * For those IA64_PSR bits: id/da/dd/ss/ed/ia
@@ -2104,7 +2105,7 @@ void kvm_init_all_rr(struct kvm_vcpu *vcpu)
 
 	if (is_physical_mode(vcpu)) {
 		if (vcpu->arch.mode_flags & GUEST_PHY_EMUL)
-			panic_vm(vcpu);
+			panic_vm(vcpu, "Machine Status conflicts!\n");
 
 		ia64_set_rr((VRN0 << VRN_SHIFT), vcpu->arch.metaphysical_rr0);
 		ia64_dv_serialize_data();
@@ -2153,10 +2154,70 @@ int vmm_entry(void)
 	return 0;
 }
 
-void panic_vm(struct kvm_vcpu *v)
+static void kvm_show_registers(struct kvm_pt_regs *regs)
 {
-	struct exit_ctl_data *p = &v->arch.exit_data;
+	unsigned long ip = regs->cr_iip + ia64_psr(regs)->ri;
 
+	struct kvm_vcpu *vcpu = current_vcpu;
+	if (vcpu != NULL)
+		printk("vcpu 0x%p vcpu %d\n",
+		       vcpu, vcpu->vcpu_id);
+
+	printk("psr : %016lx ifs : %016lx ip  : [<%016lx>]\n",
+	       regs->cr_ipsr, regs->cr_ifs, ip);
+
+	printk("unat: %016lx pfs : %016lx rsc : %016lx\n",
+	       regs->ar_unat, regs->ar_pfs, regs->ar_rsc);
+	printk("rnat: %016lx bspstore: %016lx pr  : %016lx\n",
+	       regs->ar_rnat, regs->ar_bspstore, regs->pr);
+	printk("ldrs: %016lx ccv : %016lx fpsr: %016lx\n",
+	       regs->loadrs, regs->ar_ccv, regs->ar_fpsr);
+	printk("csd : %016lx ssd : %016lx\n", regs->ar_csd, regs->ar_ssd);
+	printk("b0  : %016lx b6  : %016lx b7  : %016lx\n", regs->b0,
+							regs->b6, regs->b7);
+	printk("f6  : %05lx%016lx f7  : %05lx%016lx\n",
+	       regs->f6.u.bits[1], regs->f6.u.bits[0],
+	       regs->f7.u.bits[1], regs->f7.u.bits[0]);
+	printk("f8  : %05lx%016lx f9  : %05lx%016lx\n",
+	       regs->f8.u.bits[1], regs->f8.u.bits[0],
+	       regs->f9.u.bits[1], regs->f9.u.bits[0]);
+	printk("f10 : %05lx%016lx f11 : %05lx%016lx\n",
+	       regs->f10.u.bits[1], regs->f10.u.bits[0],
+	       regs->f11.u.bits[1], regs->f11.u.bits[0]);
+
+	printk("r1  : %016lx r2  : %016lx r3  : %016lx\n", regs->r1,
+							regs->r2, regs->r3);
+	printk("r8  : %016lx r9  : %016lx r10 : %016lx\n", regs->r8,
+							regs->r9, regs->r10);
+	printk("r11 : %016lx r12 : %016lx r13 : %016lx\n", regs->r11,
+							regs->r12, regs->r13);
+	printk("r14 : %016lx r15 : %016lx r16 : %016lx\n", regs->r14,
+							regs->r15, regs->r16);
+	printk("r17 : %016lx r18 : %016lx r19 : %016lx\n", regs->r17,
+							regs->r18, regs->r19);
+	printk("r20 : %016lx r21 : %016lx r22 : %016lx\n", regs->r20,
+							regs->r21, regs->r22);
+	printk("r23 : %016lx r24 : %016lx r25 : %016lx\n", regs->r23,
+							regs->r24, regs->r25);
+	printk("r26 : %016lx r27 : %016lx r28 : %016lx\n", regs->r26,
+							regs->r27, regs->r28);
+	printk("r29 : %016lx r30 : %016lx r31 : %016lx\n", regs->r29,
+							regs->r30, regs->r31);
+
+}
+
+void panic_vm(struct kvm_vcpu *v, const char *fmt, ...)
+{
+	va_list args;
+	char buf[256];
+
+	struct kvm_pt_regs *regs = vcpu_regs(v);
+	struct exit_ctl_data *p = &v->arch.exit_data;
+	va_start(args, fmt);
+	vsnprintf(buf, sizeof(buf), fmt, args);
+	va_end(args);
+	printk(buf);
+	kvm_show_registers(regs);
 	p->exit_reason = EXIT_REASON_VM_PANIC;
 	vmm_transition(v);
 	/*Never to return*/
diff --git a/arch/ia64/kvm/vcpu.h b/arch/ia64/kvm/vcpu.h
index e9b2a4e121c0..0dad8421791c 100644
--- a/arch/ia64/kvm/vcpu.h
+++ b/arch/ia64/kvm/vcpu.h
@@ -737,7 +737,7 @@ void kvm_init_vtlb(struct kvm_vcpu *v);
 void kvm_init_vhpt(struct kvm_vcpu *v);
 void thash_init(struct thash_cb *hcb, u64 sz);
 
-void panic_vm(struct kvm_vcpu *v);
+void panic_vm(struct kvm_vcpu *v, const char *fmt, ...);
 
 extern u64 ia64_call_vsa(u64 proc, u64 arg1, u64 arg2, u64 arg3,
 		u64 arg4, u64 arg5, u64 arg6, u64 arg7);
diff --git a/arch/ia64/kvm/vmm.c b/arch/ia64/kvm/vmm.c
index 957779593c2f..d3dc0b040ce4 100644
--- a/arch/ia64/kvm/vmm.c
+++ b/arch/ia64/kvm/vmm.c
@@ -20,6 +20,7 @@
  */
 
 
+#include<linux/kernel.h>
 #include<linux/module.h>
 #include<asm/fpswa.h>
 

From 9f7d5bb5e2abf5316bb17eb3e7751dbafa09e5cf Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Fri, 21 Nov 2008 17:16:07 +0800
Subject: [PATCH 075/140] KVM: ia64: Add handler for crashed vmm

Since vmm runs in an isolated address space and it is just a copy
of host's kvm-intel module, so once vmm crashes, we just crash all guests
running on it instead of crashing whole kernel.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/process.c | 20 ++++++++++++++++++++
 arch/ia64/kvm/vcpu.h    |  3 +++
 arch/ia64/kvm/vmm.c     |  2 ++
 arch/ia64/kvm/vmm_ivt.S | 31 +++++++++++++++++++------------
 4 files changed, 44 insertions(+), 12 deletions(-)

diff --git a/arch/ia64/kvm/process.c b/arch/ia64/kvm/process.c
index cefc349ce354..552d07724207 100644
--- a/arch/ia64/kvm/process.c
+++ b/arch/ia64/kvm/process.c
@@ -942,8 +942,20 @@ static void vcpu_do_resume(struct kvm_vcpu *vcpu)
 	ia64_set_pta(vcpu->arch.vhpt.pta.val);
 }
 
+static void vmm_sanity_check(struct kvm_vcpu *vcpu)
+{
+	struct exit_ctl_data *p = &vcpu->arch.exit_data;
+
+	if (!vmm_sanity && p->exit_reason != EXIT_REASON_DEBUG) {
+		panic_vm(vcpu, "Failed to do vmm sanity check,"
+			"it maybe caused by crashed vmm!!\n\n");
+	}
+}
+
 static void kvm_do_resume_op(struct kvm_vcpu *vcpu)
 {
+	vmm_sanity_check(vcpu); /*Guarantee vcpu runing on healthy vmm!*/
+
 	if (test_and_clear_bit(KVM_REQ_RESUME, &vcpu->requests)) {
 		vcpu_do_resume(vcpu);
 		return;
@@ -969,3 +981,11 @@ void vmm_transition(struct kvm_vcpu *vcpu)
 						1, 0, 0, 0, 0, 0);
 	kvm_do_resume_op(vcpu);
 }
+
+void vmm_panic_handler(u64 vec)
+{
+	struct kvm_vcpu *vcpu = current_vcpu;
+	vmm_sanity = 0;
+	panic_vm(vcpu, "Unexpected interruption occurs in VMM, vector:0x%lx\n",
+			vec2off[vec]);
+}
diff --git a/arch/ia64/kvm/vcpu.h b/arch/ia64/kvm/vcpu.h
index 0dad8421791c..b2f12a562bdf 100644
--- a/arch/ia64/kvm/vcpu.h
+++ b/arch/ia64/kvm/vcpu.h
@@ -741,5 +741,8 @@ void panic_vm(struct kvm_vcpu *v, const char *fmt, ...);
 
 extern u64 ia64_call_vsa(u64 proc, u64 arg1, u64 arg2, u64 arg3,
 		u64 arg4, u64 arg5, u64 arg6, u64 arg7);
+
+extern long vmm_sanity;
+
 #endif
 #endif	/* __VCPU_H__ */
diff --git a/arch/ia64/kvm/vmm.c b/arch/ia64/kvm/vmm.c
index d3dc0b040ce4..9eee5c04bacc 100644
--- a/arch/ia64/kvm/vmm.c
+++ b/arch/ia64/kvm/vmm.c
@@ -32,6 +32,8 @@ MODULE_LICENSE("GPL");
 extern char kvm_ia64_ivt;
 extern fpswa_interface_t *vmm_fpswa_interface;
 
+long vmm_sanity = 1;
+
 struct kvm_vmm_info vmm_info = {
 	.module	     = THIS_MODULE,
 	.vmm_entry   = vmm_entry,
diff --git a/arch/ia64/kvm/vmm_ivt.S b/arch/ia64/kvm/vmm_ivt.S
index c1d7251a1480..50b464628536 100644
--- a/arch/ia64/kvm/vmm_ivt.S
+++ b/arch/ia64/kvm/vmm_ivt.S
@@ -70,14 +70,12 @@
 # define PSR_DEFAULT_BITS   0
 #endif
 
-
 #define KVM_FAULT(n)    \
     kvm_fault_##n:;          \
     mov r19=n;;          \
-    br.sptk.many kvm_fault_##n;         \
+    br.sptk.many kvm_vmm_panic;         \
     ;;                  \
 
-
 #define KVM_REFLECT(n)    \
     mov r31=pr;           \
     mov r19=n;       /* prepare to save predicates */ \
@@ -85,17 +83,26 @@
     ;;      \
     tbit.z p6,p7=r29,IA64_PSR_VM_BIT;       \
 (p7)br.sptk.many kvm_dispatch_reflection;        \
-    br.sptk.many kvm_panic;      \
+    br.sptk.many kvm_vmm_panic;      \
 
-
-GLOBAL_ENTRY(kvm_panic)
-    br.sptk.many kvm_panic
+GLOBAL_ENTRY(kvm_vmm_panic)
+    KVM_SAVE_MIN_WITH_COVER_R19
+    alloc r14=ar.pfs,0,0,1,0
+    mov out0=r15
+    adds r3=8,r2                // set up second base pointer
     ;;
-END(kvm_panic)
-
-
-
-
+    ssm psr.ic
+    ;;
+    srlz.i                  // guarantee that interruption collection is on
+    ;;
+    //(p15) ssm psr.i               // restore psr.i
+    addl r14=@gprel(ia64_leave_hypervisor),gp
+    ;;
+    KVM_SAVE_REST
+    mov rp=r14
+    ;;
+    br.call.sptk.many b6=vmm_panic_handler;
+END(kvm_vmm_panic)
 
     .section .text.ivt,"ax"
 

From 8fe0736763a07fbea56213ea105a0c2ee098e6fc Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Fri, 21 Nov 2008 21:04:37 +0800
Subject: [PATCH 076/140] KVM: ia64: Clean up vmm_ivt.S using tab to indent
 every line

Using tab for indentation for vmm_ivt.S.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/vmm_ivt.S | 1468 +++++++++++++++++++--------------------
 1 file changed, 728 insertions(+), 740 deletions(-)

diff --git a/arch/ia64/kvm/vmm_ivt.S b/arch/ia64/kvm/vmm_ivt.S
index 50b464628536..3ef1a017a318 100644
--- a/arch/ia64/kvm/vmm_ivt.S
+++ b/arch/ia64/kvm/vmm_ivt.S
@@ -1,5 +1,5 @@
 /*
- * /ia64/kvm_ivt.S
+ * arch/ia64/kvm/vmm_ivt.S
  *
  * Copyright (C) 1998-2001, 2003 Hewlett-Packard Co
  *      Stephane Eranian <eranian@hpl.hp.com>
@@ -71,37 +71,37 @@
 #endif
 
 #define KVM_FAULT(n)    \
-    kvm_fault_##n:;          \
-    mov r19=n;;          \
-    br.sptk.many kvm_vmm_panic;         \
-    ;;                  \
+	kvm_fault_##n:;          \
+	mov r19=n;;          \
+	br.sptk.many kvm_vmm_panic;         \
+	;;                  \
 
 #define KVM_REFLECT(n)    \
-    mov r31=pr;           \
-    mov r19=n;       /* prepare to save predicates */ \
-    mov r29=cr.ipsr;      \
-    ;;      \
-    tbit.z p6,p7=r29,IA64_PSR_VM_BIT;       \
-(p7)br.sptk.many kvm_dispatch_reflection;        \
-    br.sptk.many kvm_vmm_panic;      \
+	mov r31=pr;           \
+	mov r19=n;       /* prepare to save predicates */ \
+	mov r29=cr.ipsr;      \
+	;;      \
+	tbit.z p6,p7=r29,IA64_PSR_VM_BIT;       \
+(p7)	br.sptk.many kvm_dispatch_reflection;        \
+	br.sptk.many kvm_vmm_panic;      \
 
 GLOBAL_ENTRY(kvm_vmm_panic)
-    KVM_SAVE_MIN_WITH_COVER_R19
-    alloc r14=ar.pfs,0,0,1,0
-    mov out0=r15
-    adds r3=8,r2                // set up second base pointer
-    ;;
-    ssm psr.ic
-    ;;
-    srlz.i                  // guarantee that interruption collection is on
-    ;;
-    //(p15) ssm psr.i               // restore psr.i
-    addl r14=@gprel(ia64_leave_hypervisor),gp
-    ;;
-    KVM_SAVE_REST
-    mov rp=r14
-    ;;
-    br.call.sptk.many b6=vmm_panic_handler;
+	KVM_SAVE_MIN_WITH_COVER_R19
+	alloc r14=ar.pfs,0,0,1,0
+	mov out0=r15
+	adds r3=8,r2                // set up second base pointer
+	;;
+	ssm psr.ic
+	;;
+	srlz.i    // guarantee that interruption collection is on
+	;;
+	//(p15) ssm psr.i               // restore psr.i
+	addl r14=@gprel(ia64_leave_hypervisor),gp
+	;;
+	KVM_SAVE_REST
+	mov rp=r14
+	;;
+	br.call.sptk.many b6=vmm_panic_handler;
 END(kvm_vmm_panic)
 
     .section .text.ivt,"ax"
@@ -112,308 +112,307 @@ kvm_ia64_ivt:
 ///////////////////////////////////////////////////////////////
 // 0x0000 Entry 0 (size 64 bundles) VHPT Translation (8,20,47)
 ENTRY(kvm_vhpt_miss)
-    KVM_FAULT(0)
+	KVM_FAULT(0)
 END(kvm_vhpt_miss)
 
-
     .org kvm_ia64_ivt+0x400
 ////////////////////////////////////////////////////////////////
 // 0x0400 Entry 1 (size 64 bundles) ITLB (21)
 ENTRY(kvm_itlb_miss)
-    mov r31 = pr
-    mov r29=cr.ipsr;
-    ;;
-    tbit.z p6,p7=r29,IA64_PSR_VM_BIT;
-    (p6) br.sptk kvm_alt_itlb_miss
-    mov r19 = 1
-    br.sptk kvm_itlb_miss_dispatch
-    KVM_FAULT(1);
+	mov r31 = pr
+	mov r29=cr.ipsr;
+	;;
+	tbit.z p6,p7=r29,IA64_PSR_VM_BIT;
+(p6)	br.sptk kvm_alt_itlb_miss
+	mov r19 = 1
+	br.sptk kvm_itlb_miss_dispatch
+	KVM_FAULT(1);
 END(kvm_itlb_miss)
 
     .org kvm_ia64_ivt+0x0800
 //////////////////////////////////////////////////////////////////
 // 0x0800 Entry 2 (size 64 bundles) DTLB (9,48)
 ENTRY(kvm_dtlb_miss)
-    mov r31 = pr
-    mov r29=cr.ipsr;
-    ;;
-    tbit.z p6,p7=r29,IA64_PSR_VM_BIT;
-(p6)br.sptk kvm_alt_dtlb_miss
-    br.sptk kvm_dtlb_miss_dispatch
+	mov r31 = pr
+	mov r29=cr.ipsr;
+	;;
+	tbit.z p6,p7=r29,IA64_PSR_VM_BIT;
+(p6)	br.sptk kvm_alt_dtlb_miss
+	br.sptk kvm_dtlb_miss_dispatch
 END(kvm_dtlb_miss)
 
      .org kvm_ia64_ivt+0x0c00
 ////////////////////////////////////////////////////////////////////
 // 0x0c00 Entry 3 (size 64 bundles) Alt ITLB (19)
 ENTRY(kvm_alt_itlb_miss)
-    mov r16=cr.ifa    // get address that caused the TLB miss
-    ;;
-    movl r17=PAGE_KERNEL
-    mov r24=cr.ipsr
-    movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
-    ;;
-    and r19=r19,r16     // clear ed, reserved bits, and PTE control bits
-    ;;
-    or r19=r17,r19      // insert PTE control bits into r19
-    ;;
-    movl r20=IA64_GRANULE_SHIFT<<2
-    ;;
-    mov cr.itir=r20
-    ;;
-    itc.i r19		// insert the TLB entry
-    mov pr=r31,-1
-    rfi
+	mov r16=cr.ifa    // get address that caused the TLB miss
+	;;
+	movl r17=PAGE_KERNEL
+	mov r24=cr.ipsr
+	movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
+	;;
+	and r19=r19,r16     // clear ed, reserved bits, and PTE control bits
+	;;
+	or r19=r17,r19      // insert PTE control bits into r19
+	;;
+	movl r20=IA64_GRANULE_SHIFT<<2
+	;;
+	mov cr.itir=r20
+	;;
+	itc.i r19		// insert the TLB entry
+	mov pr=r31,-1
+	rfi
 END(kvm_alt_itlb_miss)
 
     .org kvm_ia64_ivt+0x1000
 /////////////////////////////////////////////////////////////////////
 // 0x1000 Entry 4 (size 64 bundles) Alt DTLB (7,46)
 ENTRY(kvm_alt_dtlb_miss)
-    mov r16=cr.ifa		// get address that caused the TLB miss
-    ;;
-    movl r17=PAGE_KERNEL
-    movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
-    mov r24=cr.ipsr
-    ;;
-    and r19=r19,r16     // clear ed, reserved bits, and PTE control bits
-    ;;
-    or r19=r19,r17	// insert PTE control bits into r19
-    ;;
-    movl r20=IA64_GRANULE_SHIFT<<2
-    ;;
-    mov cr.itir=r20
-    ;;
-    itc.d r19		// insert the TLB entry
-    mov pr=r31,-1
-    rfi
+	mov r16=cr.ifa		// get address that caused the TLB miss
+	;;
+	movl r17=PAGE_KERNEL
+	movl r19=(((1 << IA64_MAX_PHYS_BITS) - 1) & ~0xfff)
+	mov r24=cr.ipsr
+	;;
+	and r19=r19,r16     // clear ed, reserved bits, and PTE control bits
+	;;
+	or r19=r19,r17	// insert PTE control bits into r19
+	;;
+	movl r20=IA64_GRANULE_SHIFT<<2
+	;;
+	mov cr.itir=r20
+	;;
+	itc.d r19		// insert the TLB entry
+	mov pr=r31,-1
+	rfi
 END(kvm_alt_dtlb_miss)
 
     .org kvm_ia64_ivt+0x1400
 //////////////////////////////////////////////////////////////////////
 // 0x1400 Entry 5 (size 64 bundles) Data nested TLB (6,45)
 ENTRY(kvm_nested_dtlb_miss)
-    KVM_FAULT(5)
+	KVM_FAULT(5)
 END(kvm_nested_dtlb_miss)
 
     .org kvm_ia64_ivt+0x1800
 /////////////////////////////////////////////////////////////////////
 // 0x1800 Entry 6 (size 64 bundles) Instruction Key Miss (24)
 ENTRY(kvm_ikey_miss)
-    KVM_REFLECT(6)
+	KVM_REFLECT(6)
 END(kvm_ikey_miss)
 
     .org kvm_ia64_ivt+0x1c00
 /////////////////////////////////////////////////////////////////////
 // 0x1c00 Entry 7 (size 64 bundles) Data Key Miss (12,51)
 ENTRY(kvm_dkey_miss)
-    KVM_REFLECT(7)
+	KVM_REFLECT(7)
 END(kvm_dkey_miss)
 
     .org kvm_ia64_ivt+0x2000
 ////////////////////////////////////////////////////////////////////
 // 0x2000 Entry 8 (size 64 bundles) Dirty-bit (54)
 ENTRY(kvm_dirty_bit)
-    KVM_REFLECT(8)
+	KVM_REFLECT(8)
 END(kvm_dirty_bit)
 
     .org kvm_ia64_ivt+0x2400
 ////////////////////////////////////////////////////////////////////
 // 0x2400 Entry 9 (size 64 bundles) Instruction Access-bit (27)
 ENTRY(kvm_iaccess_bit)
-    KVM_REFLECT(9)
+	KVM_REFLECT(9)
 END(kvm_iaccess_bit)
 
     .org kvm_ia64_ivt+0x2800
 ///////////////////////////////////////////////////////////////////
 // 0x2800 Entry 10 (size 64 bundles) Data Access-bit (15,55)
 ENTRY(kvm_daccess_bit)
-    KVM_REFLECT(10)
+	KVM_REFLECT(10)
 END(kvm_daccess_bit)
 
     .org kvm_ia64_ivt+0x2c00
 /////////////////////////////////////////////////////////////////
 // 0x2c00 Entry 11 (size 64 bundles) Break instruction (33)
 ENTRY(kvm_break_fault)
-    mov r31=pr
-    mov r19=11
-    mov r29=cr.ipsr
-    ;;
-    KVM_SAVE_MIN_WITH_COVER_R19
-    ;;
-    alloc r14=ar.pfs,0,0,4,0 // now it's safe (must be first in insn group!)
-    mov out0=cr.ifa
-    mov out2=cr.isr     // FIXME: pity to make this slow access twice
-    mov out3=cr.iim     // FIXME: pity to make this slow access twice
-    adds r3=8,r2                // set up second base pointer
-    ;;
-    ssm psr.ic
-    ;;
-    srlz.i                  // guarantee that interruption collection is on
-    ;;
-    //(p15)ssm psr.i               // restore psr.i
-    addl r14=@gprel(ia64_leave_hypervisor),gp
-    ;;
-    KVM_SAVE_REST
-    mov rp=r14
-    ;;
-    adds out1=16,sp
-    br.call.sptk.many b6=kvm_ia64_handle_break
-    ;;
+	mov r31=pr
+	mov r19=11
+	mov r29=cr.ipsr
+	;;
+	KVM_SAVE_MIN_WITH_COVER_R19
+	;;
+	alloc r14=ar.pfs,0,0,4,0 //(must be first in insn group!)
+	mov out0=cr.ifa
+	mov out2=cr.isr     // FIXME: pity to make this slow access twice
+	mov out3=cr.iim     // FIXME: pity to make this slow access twice
+	adds r3=8,r2                // set up second base pointer
+	;;
+	ssm psr.ic
+	;;
+	srlz.i         // guarantee that interruption collection is on
+	;;
+	//(p15)ssm psr.i               // restore psr.i
+	addl r14=@gprel(ia64_leave_hypervisor),gp
+	;;
+	KVM_SAVE_REST
+	mov rp=r14
+	;;
+	adds out1=16,sp
+	br.call.sptk.many b6=kvm_ia64_handle_break
+	;;
 END(kvm_break_fault)
 
     .org kvm_ia64_ivt+0x3000
 /////////////////////////////////////////////////////////////////
 // 0x3000 Entry 12 (size 64 bundles) External Interrupt (4)
 ENTRY(kvm_interrupt)
-    mov r31=pr		// prepare to save predicates
-    mov r19=12
-    mov r29=cr.ipsr
-    ;;
-    tbit.z p6,p7=r29,IA64_PSR_VM_BIT
-    tbit.z p0,p15=r29,IA64_PSR_I_BIT
-    ;;
-(p7) br.sptk kvm_dispatch_interrupt
-    ;;
-    mov r27=ar.rsc		/* M */
-    mov r20=r1			/* A */
-    mov r25=ar.unat		/* M */
-    mov r26=ar.pfs		/* I */
-    mov r28=cr.iip		/* M */
-    cover			/* B (or nothing) */
-    ;;
-    mov r1=sp
-    ;;
-    invala			/* M */
-    mov r30=cr.ifs
-    ;;
-    addl r1=-VMM_PT_REGS_SIZE,r1
-    ;;
-    adds r17=2*L1_CACHE_BYTES,r1	/* really: biggest cache-line size */
-    adds r16=PT(CR_IPSR),r1
-    ;;
-    lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES
-    st8 [r16]=r29			/* save cr.ipsr */
-    ;;
-    lfetch.fault.excl.nt1 [r17]
-    mov r29=b0
-    ;;
-    adds r16=PT(R8),r1  	/* initialize first base pointer */
-    adds r17=PT(R9),r1  	/* initialize second base pointer */
-    mov r18=r0      		/* make sure r18 isn't NaT */
-    ;;
+	mov r31=pr		// prepare to save predicates
+	mov r19=12
+	mov r29=cr.ipsr
+	;;
+	tbit.z p6,p7=r29,IA64_PSR_VM_BIT
+	tbit.z p0,p15=r29,IA64_PSR_I_BIT
+	;;
+(p7)	br.sptk kvm_dispatch_interrupt
+	;;
+	mov r27=ar.rsc		/* M */
+	mov r20=r1			/* A */
+	mov r25=ar.unat		/* M */
+	mov r26=ar.pfs		/* I */
+	mov r28=cr.iip		/* M */
+	cover			/* B (or nothing) */
+	;;
+	mov r1=sp
+	;;
+	invala			/* M */
+	mov r30=cr.ifs
+	;;
+	addl r1=-VMM_PT_REGS_SIZE,r1
+	;;
+	adds r17=2*L1_CACHE_BYTES,r1	/* really: biggest cache-line size */
+	adds r16=PT(CR_IPSR),r1
+	;;
+	lfetch.fault.excl.nt1 [r17],L1_CACHE_BYTES
+	st8 [r16]=r29			/* save cr.ipsr */
+	;;
+	lfetch.fault.excl.nt1 [r17]
+	mov r29=b0
+	;;
+	adds r16=PT(R8),r1  	/* initialize first base pointer */
+	adds r17=PT(R9),r1  	/* initialize second base pointer */
+	mov r18=r0      		/* make sure r18 isn't NaT */
+	;;
 .mem.offset 0,0; st8.spill [r16]=r8,16
 .mem.offset 8,0; st8.spill [r17]=r9,16
         ;;
 .mem.offset 0,0; st8.spill [r16]=r10,24
 .mem.offset 8,0; st8.spill [r17]=r11,24
         ;;
-    st8 [r16]=r28,16		/* save cr.iip */
-    st8 [r17]=r30,16		/* save cr.ifs */
-    mov r8=ar.fpsr		/* M */
-    mov r9=ar.csd
-    mov r10=ar.ssd
-    movl r11=FPSR_DEFAULT	/* L-unit */
-    ;;
-    st8 [r16]=r25,16		/* save ar.unat */
-    st8 [r17]=r26,16		/* save ar.pfs */
-    shl r18=r18,16		/* compute ar.rsc to be used for "loadrs" */
-    ;;
-    st8 [r16]=r27,16		/* save ar.rsc */
-    adds r17=16,r17		/* skip over ar_rnat field */
-    ;;
-    st8 [r17]=r31,16		/* save predicates */
-    adds r16=16,r16		/* skip over ar_bspstore field */
-    ;;
-    st8 [r16]=r29,16		/* save b0 */
-    st8 [r17]=r18,16		/* save ar.rsc value for "loadrs" */
-    ;;
+	st8 [r16]=r28,16		/* save cr.iip */
+	st8 [r17]=r30,16		/* save cr.ifs */
+	mov r8=ar.fpsr		/* M */
+	mov r9=ar.csd
+	mov r10=ar.ssd
+	movl r11=FPSR_DEFAULT	/* L-unit */
+	;;
+	st8 [r16]=r25,16		/* save ar.unat */
+	st8 [r17]=r26,16		/* save ar.pfs */
+	shl r18=r18,16		/* compute ar.rsc to be used for "loadrs" */
+	;;
+	st8 [r16]=r27,16		/* save ar.rsc */
+	adds r17=16,r17		/* skip over ar_rnat field */
+	;;
+	st8 [r17]=r31,16		/* save predicates */
+	adds r16=16,r16		/* skip over ar_bspstore field */
+	;;
+	st8 [r16]=r29,16		/* save b0 */
+	st8 [r17]=r18,16		/* save ar.rsc value for "loadrs" */
+	;;
 .mem.offset 0,0; st8.spill [r16]=r20,16    /* save original r1 */
 .mem.offset 8,0; st8.spill [r17]=r12,16
-    adds r12=-16,r1
-    /* switch to kernel memory stack (with 16 bytes of scratch) */
-    ;;
+	adds r12=-16,r1
+	/* switch to kernel memory stack (with 16 bytes of scratch) */
+	;;
 .mem.offset 0,0; st8.spill [r16]=r13,16
 .mem.offset 8,0; st8.spill [r17]=r8,16 /* save ar.fpsr */
-    ;;
+	;;
 .mem.offset 0,0; st8.spill [r16]=r15,16
 .mem.offset 8,0; st8.spill [r17]=r14,16
-    dep r14=-1,r0,60,4
-    ;;
+	dep r14=-1,r0,60,4
+	;;
 .mem.offset 0,0; st8.spill [r16]=r2,16
 .mem.offset 8,0; st8.spill [r17]=r3,16
-    adds r2=VMM_PT_REGS_R16_OFFSET,r1
-    adds r14 = VMM_VCPU_GP_OFFSET,r13
-    ;;
-    mov r8=ar.ccv
-    ld8 r14 = [r14]
-    ;;
-    mov r1=r14       /* establish kernel global pointer */
-    ;;                                          \
-    bsw.1
-    ;;
-    alloc r14=ar.pfs,0,0,1,0	// must be first in an insn group
-    mov out0=r13
-    ;;
-    ssm psr.ic
-    ;;
-    srlz.i
-    ;;
-    //(p15) ssm psr.i
-    adds r3=8,r2		// set up second base pointer for SAVE_REST
-    srlz.i			// ensure everybody knows psr.ic is back on
-    ;;
+	adds r2=VMM_PT_REGS_R16_OFFSET,r1
+	adds r14 = VMM_VCPU_GP_OFFSET,r13
+	;;
+	mov r8=ar.ccv
+	ld8 r14 = [r14]
+	;;
+	mov r1=r14       /* establish kernel global pointer */
+	;;                                          \
+	bsw.1
+	;;
+	alloc r14=ar.pfs,0,0,1,0	// must be first in an insn group
+	mov out0=r13
+	;;
+	ssm psr.ic
+	;;
+	srlz.i
+	;;
+	//(p15) ssm psr.i
+	adds r3=8,r2		// set up second base pointer for SAVE_REST
+	srlz.i			// ensure everybody knows psr.ic is back on
+	;;
 .mem.offset 0,0; st8.spill [r2]=r16,16
 .mem.offset 8,0; st8.spill [r3]=r17,16
-    ;;
+	;;
 .mem.offset 0,0; st8.spill [r2]=r18,16
 .mem.offset 8,0; st8.spill [r3]=r19,16
-    ;;
+	;;
 .mem.offset 0,0; st8.spill [r2]=r20,16
 .mem.offset 8,0; st8.spill [r3]=r21,16
-    mov r18=b6
-    ;;
+	mov r18=b6
+	;;
 .mem.offset 0,0; st8.spill [r2]=r22,16
 .mem.offset 8,0; st8.spill [r3]=r23,16
-    mov r19=b7
-    ;;
+	mov r19=b7
+	;;
 .mem.offset 0,0; st8.spill [r2]=r24,16
 .mem.offset 8,0; st8.spill [r3]=r25,16
-    ;;
+	;;
 .mem.offset 0,0; st8.spill [r2]=r26,16
 .mem.offset 8,0; st8.spill [r3]=r27,16
-    ;;
+	;;
 .mem.offset 0,0; st8.spill [r2]=r28,16
 .mem.offset 8,0; st8.spill [r3]=r29,16
-    ;;
+	;;
 .mem.offset 0,0; st8.spill [r2]=r30,16
 .mem.offset 8,0; st8.spill [r3]=r31,32
-    ;;
-    mov ar.fpsr=r11       /* M-unit */
-    st8 [r2]=r8,8         /* ar.ccv */
-    adds r24=PT(B6)-PT(F7),r3
-    ;;
-    stf.spill [r2]=f6,32
-    stf.spill [r3]=f7,32
-    ;;
-    stf.spill [r2]=f8,32
-    stf.spill [r3]=f9,32
-    ;;
-    stf.spill [r2]=f10
-    stf.spill [r3]=f11
-    adds r25=PT(B7)-PT(F11),r3
-    ;;
-    st8 [r24]=r18,16       /* b6 */
-    st8 [r25]=r19,16       /* b7 */
-    ;;
-    st8 [r24]=r9           /* ar.csd */
-    st8 [r25]=r10          /* ar.ssd */
-    ;;
-    srlz.d		// make sure we see the effect of cr.ivr
-    addl r14=@gprel(ia64_leave_nested),gp
-    ;;
-    mov rp=r14
-    br.call.sptk.many b6=kvm_ia64_handle_irq
-    ;;
+	;;
+	mov ar.fpsr=r11       /* M-unit */
+	st8 [r2]=r8,8         /* ar.ccv */
+	adds r24=PT(B6)-PT(F7),r3
+	;;
+	stf.spill [r2]=f6,32
+	stf.spill [r3]=f7,32
+	;;
+	stf.spill [r2]=f8,32
+	stf.spill [r3]=f9,32
+	;;
+	stf.spill [r2]=f10
+	stf.spill [r3]=f11
+	adds r25=PT(B7)-PT(F11),r3
+	;;
+	st8 [r24]=r18,16       /* b6 */
+	st8 [r25]=r19,16       /* b7 */
+	;;
+	st8 [r24]=r9           /* ar.csd */
+	st8 [r25]=r10          /* ar.ssd */
+	;;
+	srlz.d		// make sure we see the effect of cr.ivr
+	addl r14=@gprel(ia64_leave_nested),gp
+	;;
+	mov rp=r14
+	br.call.sptk.many b6=kvm_ia64_handle_irq
+	;;
 END(kvm_interrupt)
 
     .global kvm_dispatch_vexirq
@@ -421,387 +420,385 @@ END(kvm_interrupt)
 //////////////////////////////////////////////////////////////////////
 // 0x3400 Entry 13 (size 64 bundles) Reserved
 ENTRY(kvm_virtual_exirq)
-    mov r31=pr
-    mov r19=13
-    mov r30 =r0
-    ;;
+	mov r31=pr
+	mov r19=13
+	mov r30 =r0
+	;;
 kvm_dispatch_vexirq:
-    cmp.eq p6,p0 = 1,r30
-    ;;
-(p6)add r29 = VMM_VCPU_SAVED_GP_OFFSET,r21
-    ;;
-(p6)ld8 r1 = [r29]
-    ;;
-    KVM_SAVE_MIN_WITH_COVER_R19
-    alloc r14=ar.pfs,0,0,1,0
-    mov out0=r13
+	cmp.eq p6,p0 = 1,r30
+	;;
+(p6)	add r29 = VMM_VCPU_SAVED_GP_OFFSET,r21
+	;;
+(p6)	ld8 r1 = [r29]
+	;;
+	KVM_SAVE_MIN_WITH_COVER_R19
+	alloc r14=ar.pfs,0,0,1,0
+	mov out0=r13
 
-    ssm psr.ic
-    ;;
-    srlz.i                  // guarantee that interruption collection is on
-    ;;
-    //(p15) ssm psr.i               // restore psr.i
-    adds r3=8,r2                // set up second base pointer
-    ;;
-    KVM_SAVE_REST
-    addl r14=@gprel(ia64_leave_hypervisor),gp
-    ;;
-    mov rp=r14
-    br.call.sptk.many b6=kvm_vexirq
+	ssm psr.ic
+	;;
+	srlz.i // guarantee that interruption collection is on
+	;;
+	//(p15) ssm psr.i               // restore psr.i
+	adds r3=8,r2                // set up second base pointer
+	;;
+	KVM_SAVE_REST
+	addl r14=@gprel(ia64_leave_hypervisor),gp
+	;;
+	mov rp=r14
+	br.call.sptk.many b6=kvm_vexirq
 END(kvm_virtual_exirq)
 
     .org kvm_ia64_ivt+0x3800
 /////////////////////////////////////////////////////////////////////
 // 0x3800 Entry 14 (size 64 bundles) Reserved
-    KVM_FAULT(14)
-    // this code segment is from 2.6.16.13
-
+	KVM_FAULT(14)
+	// this code segment is from 2.6.16.13
 
     .org kvm_ia64_ivt+0x3c00
 ///////////////////////////////////////////////////////////////////////
 // 0x3c00 Entry 15 (size 64 bundles) Reserved
-    KVM_FAULT(15)
-
+	KVM_FAULT(15)
 
     .org kvm_ia64_ivt+0x4000
 ///////////////////////////////////////////////////////////////////////
 // 0x4000 Entry 16 (size 64 bundles) Reserved
-    KVM_FAULT(16)
+	KVM_FAULT(16)
 
     .org kvm_ia64_ivt+0x4400
 //////////////////////////////////////////////////////////////////////
 // 0x4400 Entry 17 (size 64 bundles) Reserved
-    KVM_FAULT(17)
+	KVM_FAULT(17)
 
     .org kvm_ia64_ivt+0x4800
 //////////////////////////////////////////////////////////////////////
 // 0x4800 Entry 18 (size 64 bundles) Reserved
-    KVM_FAULT(18)
+	KVM_FAULT(18)
 
     .org kvm_ia64_ivt+0x4c00
 //////////////////////////////////////////////////////////////////////
 // 0x4c00 Entry 19 (size 64 bundles) Reserved
-    KVM_FAULT(19)
+	KVM_FAULT(19)
 
     .org kvm_ia64_ivt+0x5000
 //////////////////////////////////////////////////////////////////////
 // 0x5000 Entry 20 (size 16 bundles) Page Not Present
 ENTRY(kvm_page_not_present)
-    KVM_REFLECT(20)
+	KVM_REFLECT(20)
 END(kvm_page_not_present)
 
     .org kvm_ia64_ivt+0x5100
 ///////////////////////////////////////////////////////////////////////
 // 0x5100 Entry 21 (size 16 bundles) Key Permission vector
 ENTRY(kvm_key_permission)
-    KVM_REFLECT(21)
+	KVM_REFLECT(21)
 END(kvm_key_permission)
 
     .org kvm_ia64_ivt+0x5200
 //////////////////////////////////////////////////////////////////////
 // 0x5200 Entry 22 (size 16 bundles) Instruction Access Rights (26)
 ENTRY(kvm_iaccess_rights)
-    KVM_REFLECT(22)
+	KVM_REFLECT(22)
 END(kvm_iaccess_rights)
 
     .org kvm_ia64_ivt+0x5300
 //////////////////////////////////////////////////////////////////////
 // 0x5300 Entry 23 (size 16 bundles) Data Access Rights (14,53)
 ENTRY(kvm_daccess_rights)
-    KVM_REFLECT(23)
+	KVM_REFLECT(23)
 END(kvm_daccess_rights)
 
     .org kvm_ia64_ivt+0x5400
 /////////////////////////////////////////////////////////////////////
 // 0x5400 Entry 24 (size 16 bundles) General Exception (5,32,34,36,38,39)
 ENTRY(kvm_general_exception)
-   KVM_REFLECT(24)
-   KVM_FAULT(24)
+	KVM_REFLECT(24)
+	KVM_FAULT(24)
 END(kvm_general_exception)
 
     .org kvm_ia64_ivt+0x5500
 //////////////////////////////////////////////////////////////////////
 // 0x5500 Entry 25 (size 16 bundles) Disabled FP-Register (35)
 ENTRY(kvm_disabled_fp_reg)
-    KVM_REFLECT(25)
+	KVM_REFLECT(25)
 END(kvm_disabled_fp_reg)
 
     .org kvm_ia64_ivt+0x5600
 ////////////////////////////////////////////////////////////////////
 // 0x5600 Entry 26 (size 16 bundles) Nat Consumption (11,23,37,50)
 ENTRY(kvm_nat_consumption)
-    KVM_REFLECT(26)
+	KVM_REFLECT(26)
 END(kvm_nat_consumption)
 
     .org kvm_ia64_ivt+0x5700
 /////////////////////////////////////////////////////////////////////
 // 0x5700 Entry 27 (size 16 bundles) Speculation (40)
 ENTRY(kvm_speculation_vector)
-    KVM_REFLECT(27)
+	KVM_REFLECT(27)
 END(kvm_speculation_vector)
 
     .org kvm_ia64_ivt+0x5800
 /////////////////////////////////////////////////////////////////////
 // 0x5800 Entry 28 (size 16 bundles) Reserved
-    KVM_FAULT(28)
+	KVM_FAULT(28)
 
     .org kvm_ia64_ivt+0x5900
 ///////////////////////////////////////////////////////////////////
 // 0x5900 Entry 29 (size 16 bundles) Debug (16,28,56)
 ENTRY(kvm_debug_vector)
-    KVM_FAULT(29)
+	KVM_FAULT(29)
 END(kvm_debug_vector)
 
     .org kvm_ia64_ivt+0x5a00
 ///////////////////////////////////////////////////////////////
 // 0x5a00 Entry 30 (size 16 bundles) Unaligned Reference (57)
 ENTRY(kvm_unaligned_access)
-    KVM_REFLECT(30)
+	KVM_REFLECT(30)
 END(kvm_unaligned_access)
 
     .org kvm_ia64_ivt+0x5b00
 //////////////////////////////////////////////////////////////////////
 // 0x5b00 Entry 31 (size 16 bundles) Unsupported Data Reference (57)
 ENTRY(kvm_unsupported_data_reference)
-    KVM_REFLECT(31)
+	KVM_REFLECT(31)
 END(kvm_unsupported_data_reference)
 
     .org kvm_ia64_ivt+0x5c00
 ////////////////////////////////////////////////////////////////////
 // 0x5c00 Entry 32 (size 16 bundles) Floating Point FAULT (65)
 ENTRY(kvm_floating_point_fault)
-    KVM_REFLECT(32)
+	KVM_REFLECT(32)
 END(kvm_floating_point_fault)
 
     .org kvm_ia64_ivt+0x5d00
 /////////////////////////////////////////////////////////////////////
 // 0x5d00 Entry 33 (size 16 bundles) Floating Point Trap (66)
 ENTRY(kvm_floating_point_trap)
-    KVM_REFLECT(33)
+	KVM_REFLECT(33)
 END(kvm_floating_point_trap)
 
     .org kvm_ia64_ivt+0x5e00
 //////////////////////////////////////////////////////////////////////
 // 0x5e00 Entry 34 (size 16 bundles) Lower Privilege Transfer Trap (66)
 ENTRY(kvm_lower_privilege_trap)
-    KVM_REFLECT(34)
+	KVM_REFLECT(34)
 END(kvm_lower_privilege_trap)
 
     .org kvm_ia64_ivt+0x5f00
 //////////////////////////////////////////////////////////////////////
 // 0x5f00 Entry 35 (size 16 bundles) Taken Branch Trap (68)
 ENTRY(kvm_taken_branch_trap)
-    KVM_REFLECT(35)
+	KVM_REFLECT(35)
 END(kvm_taken_branch_trap)
 
     .org kvm_ia64_ivt+0x6000
 ////////////////////////////////////////////////////////////////////
 // 0x6000 Entry 36 (size 16 bundles) Single Step Trap (69)
 ENTRY(kvm_single_step_trap)
-    KVM_REFLECT(36)
+	KVM_REFLECT(36)
 END(kvm_single_step_trap)
     .global kvm_virtualization_fault_back
     .org kvm_ia64_ivt+0x6100
 /////////////////////////////////////////////////////////////////////
 // 0x6100 Entry 37 (size 16 bundles) Virtualization Fault
 ENTRY(kvm_virtualization_fault)
-    mov r31=pr
-    adds r16 = VMM_VCPU_SAVED_GP_OFFSET,r21
-    ;;
-    st8 [r16] = r1
-    adds r17 = VMM_VCPU_GP_OFFSET, r21
-    ;;
-    ld8 r1 = [r17]
-    cmp.eq p6,p0=EVENT_MOV_FROM_AR,r24
-    cmp.eq p7,p0=EVENT_MOV_FROM_RR,r24
-    cmp.eq p8,p0=EVENT_MOV_TO_RR,r24
-    cmp.eq p9,p0=EVENT_RSM,r24
-    cmp.eq p10,p0=EVENT_SSM,r24
-    cmp.eq p11,p0=EVENT_MOV_TO_PSR,r24
-    cmp.eq p12,p0=EVENT_THASH,r24
-    (p6) br.dptk.many kvm_asm_mov_from_ar
-    (p7) br.dptk.many kvm_asm_mov_from_rr
-    (p8) br.dptk.many kvm_asm_mov_to_rr
-    (p9) br.dptk.many kvm_asm_rsm
-    (p10) br.dptk.many kvm_asm_ssm
-    (p11) br.dptk.many kvm_asm_mov_to_psr
-    (p12) br.dptk.many kvm_asm_thash
-    ;;
+	mov r31=pr
+	adds r16 = VMM_VCPU_SAVED_GP_OFFSET,r21
+	;;
+	st8 [r16] = r1
+	adds r17 = VMM_VCPU_GP_OFFSET, r21
+	;;
+	ld8 r1 = [r17]
+	cmp.eq p6,p0=EVENT_MOV_FROM_AR,r24
+	cmp.eq p7,p0=EVENT_MOV_FROM_RR,r24
+	cmp.eq p8,p0=EVENT_MOV_TO_RR,r24
+	cmp.eq p9,p0=EVENT_RSM,r24
+	cmp.eq p10,p0=EVENT_SSM,r24
+	cmp.eq p11,p0=EVENT_MOV_TO_PSR,r24
+	cmp.eq p12,p0=EVENT_THASH,r24
+(p6)	br.dptk.many kvm_asm_mov_from_ar
+(p7)	br.dptk.many kvm_asm_mov_from_rr
+(p8)	br.dptk.many kvm_asm_mov_to_rr
+(p9)	br.dptk.many kvm_asm_rsm
+(p10)	br.dptk.many kvm_asm_ssm
+(p11)	br.dptk.many kvm_asm_mov_to_psr
+(p12)	br.dptk.many kvm_asm_thash
+	;;
 kvm_virtualization_fault_back:
-    adds r16 = VMM_VCPU_SAVED_GP_OFFSET,r21
-    ;;
-    ld8 r1 = [r16]
-    ;;
-    mov r19=37
-    adds r16 = VMM_VCPU_CAUSE_OFFSET,r21
-    adds r17 = VMM_VCPU_OPCODE_OFFSET,r21
-    ;;
-    st8 [r16] = r24
-    st8 [r17] = r25
-    ;;
-    cmp.ne p6,p0=EVENT_RFI, r24
-    (p6) br.sptk kvm_dispatch_virtualization_fault
-    ;;
-    adds r18=VMM_VPD_BASE_OFFSET,r21
-    ;;
-    ld8 r18=[r18]
-    ;;
-    adds r18=VMM_VPD_VIFS_OFFSET,r18
-    ;;
-    ld8 r18=[r18]
-    ;;
-    tbit.z p6,p0=r18,63
-    (p6) br.sptk kvm_dispatch_virtualization_fault
-    ;;
-    //if vifs.v=1 desert current register frame
-    alloc r18=ar.pfs,0,0,0,0
-    br.sptk kvm_dispatch_virtualization_fault
+	adds r16 = VMM_VCPU_SAVED_GP_OFFSET,r21
+	;;
+	ld8 r1 = [r16]
+	;;
+	mov r19=37
+	adds r16 = VMM_VCPU_CAUSE_OFFSET,r21
+	adds r17 = VMM_VCPU_OPCODE_OFFSET,r21
+	;;
+	st8 [r16] = r24
+	st8 [r17] = r25
+	;;
+	cmp.ne p6,p0=EVENT_RFI, r24
+(p6)	br.sptk kvm_dispatch_virtualization_fault
+	;;
+	adds r18=VMM_VPD_BASE_OFFSET,r21
+	;;
+	ld8 r18=[r18]
+	;;
+	adds r18=VMM_VPD_VIFS_OFFSET,r18
+	;;
+	ld8 r18=[r18]
+	;;
+	tbit.z p6,p0=r18,63
+(p6)	br.sptk kvm_dispatch_virtualization_fault
+	;;
+//if vifs.v=1 desert current register frame
+	alloc r18=ar.pfs,0,0,0,0
+	br.sptk kvm_dispatch_virtualization_fault
 END(kvm_virtualization_fault)
 
     .org kvm_ia64_ivt+0x6200
 //////////////////////////////////////////////////////////////
 // 0x6200 Entry 38 (size 16 bundles) Reserved
-    KVM_FAULT(38)
+	KVM_FAULT(38)
 
     .org kvm_ia64_ivt+0x6300
 /////////////////////////////////////////////////////////////////
 // 0x6300 Entry 39 (size 16 bundles) Reserved
-    KVM_FAULT(39)
+	KVM_FAULT(39)
 
     .org kvm_ia64_ivt+0x6400
 /////////////////////////////////////////////////////////////////
 // 0x6400 Entry 40 (size 16 bundles) Reserved
-    KVM_FAULT(40)
+	KVM_FAULT(40)
 
     .org kvm_ia64_ivt+0x6500
 //////////////////////////////////////////////////////////////////
 // 0x6500 Entry 41 (size 16 bundles) Reserved
-    KVM_FAULT(41)
+	KVM_FAULT(41)
 
     .org kvm_ia64_ivt+0x6600
 //////////////////////////////////////////////////////////////////
 // 0x6600 Entry 42 (size 16 bundles) Reserved
-    KVM_FAULT(42)
+	KVM_FAULT(42)
 
     .org kvm_ia64_ivt+0x6700
 //////////////////////////////////////////////////////////////////
 // 0x6700 Entry 43 (size 16 bundles) Reserved
-    KVM_FAULT(43)
+	KVM_FAULT(43)
 
     .org kvm_ia64_ivt+0x6800
 //////////////////////////////////////////////////////////////////
 // 0x6800 Entry 44 (size 16 bundles) Reserved
-    KVM_FAULT(44)
+	KVM_FAULT(44)
 
     .org kvm_ia64_ivt+0x6900
 ///////////////////////////////////////////////////////////////////
 // 0x6900 Entry 45 (size 16 bundles) IA-32 Exeception
 //(17,18,29,41,42,43,44,58,60,61,62,72,73,75,76,77)
 ENTRY(kvm_ia32_exception)
-    KVM_FAULT(45)
+	KVM_FAULT(45)
 END(kvm_ia32_exception)
 
     .org kvm_ia64_ivt+0x6a00
 ////////////////////////////////////////////////////////////////////
 // 0x6a00 Entry 46 (size 16 bundles) IA-32 Intercept  (30,31,59,70,71)
 ENTRY(kvm_ia32_intercept)
-    KVM_FAULT(47)
+	KVM_FAULT(47)
 END(kvm_ia32_intercept)
 
     .org kvm_ia64_ivt+0x6c00
 /////////////////////////////////////////////////////////////////////
 // 0x6c00 Entry 48 (size 16 bundles) Reserved
-    KVM_FAULT(48)
+	KVM_FAULT(48)
 
     .org kvm_ia64_ivt+0x6d00
 //////////////////////////////////////////////////////////////////////
 // 0x6d00 Entry 49 (size 16 bundles) Reserved
-    KVM_FAULT(49)
+	KVM_FAULT(49)
 
     .org kvm_ia64_ivt+0x6e00
 //////////////////////////////////////////////////////////////////////
 // 0x6e00 Entry 50 (size 16 bundles) Reserved
-    KVM_FAULT(50)
+	KVM_FAULT(50)
 
     .org kvm_ia64_ivt+0x6f00
 /////////////////////////////////////////////////////////////////////
 // 0x6f00 Entry 51 (size 16 bundles) Reserved
-    KVM_FAULT(52)
+	KVM_FAULT(52)
 
     .org kvm_ia64_ivt+0x7100
 ////////////////////////////////////////////////////////////////////
 // 0x7100 Entry 53 (size 16 bundles) Reserved
-    KVM_FAULT(53)
+	KVM_FAULT(53)
 
     .org kvm_ia64_ivt+0x7200
 /////////////////////////////////////////////////////////////////////
 // 0x7200 Entry 54 (size 16 bundles) Reserved
-    KVM_FAULT(54)
+	KVM_FAULT(54)
 
     .org kvm_ia64_ivt+0x7300
 ////////////////////////////////////////////////////////////////////
 // 0x7300 Entry 55 (size 16 bundles) Reserved
-    KVM_FAULT(55)
+	KVM_FAULT(55)
 
     .org kvm_ia64_ivt+0x7400
 ////////////////////////////////////////////////////////////////////
 // 0x7400 Entry 56 (size 16 bundles) Reserved
-    KVM_FAULT(56)
+	KVM_FAULT(56)
 
     .org kvm_ia64_ivt+0x7500
 /////////////////////////////////////////////////////////////////////
 // 0x7500 Entry 57 (size 16 bundles) Reserved
-    KVM_FAULT(57)
+	KVM_FAULT(57)
 
     .org kvm_ia64_ivt+0x7600
 /////////////////////////////////////////////////////////////////////
 // 0x7600 Entry 58 (size 16 bundles) Reserved
-    KVM_FAULT(58)
+	KVM_FAULT(58)
 
     .org kvm_ia64_ivt+0x7700
 ////////////////////////////////////////////////////////////////////
 // 0x7700 Entry 59 (size 16 bundles) Reserved
-    KVM_FAULT(59)
+	KVM_FAULT(59)
 
     .org kvm_ia64_ivt+0x7800
 ////////////////////////////////////////////////////////////////////
 // 0x7800 Entry 60 (size 16 bundles) Reserved
-    KVM_FAULT(60)
+	KVM_FAULT(60)
 
     .org kvm_ia64_ivt+0x7900
 /////////////////////////////////////////////////////////////////////
 // 0x7900 Entry 61 (size 16 bundles) Reserved
-    KVM_FAULT(61)
+	KVM_FAULT(61)
 
     .org kvm_ia64_ivt+0x7a00
 /////////////////////////////////////////////////////////////////////
 // 0x7a00 Entry 62 (size 16 bundles) Reserved
-    KVM_FAULT(62)
+	KVM_FAULT(62)
 
     .org kvm_ia64_ivt+0x7b00
 /////////////////////////////////////////////////////////////////////
 // 0x7b00 Entry 63 (size 16 bundles) Reserved
-    KVM_FAULT(63)
+	KVM_FAULT(63)
 
     .org kvm_ia64_ivt+0x7c00
 ////////////////////////////////////////////////////////////////////
 // 0x7c00 Entry 64 (size 16 bundles) Reserved
-    KVM_FAULT(64)
+	KVM_FAULT(64)
 
     .org kvm_ia64_ivt+0x7d00
 /////////////////////////////////////////////////////////////////////
 // 0x7d00 Entry 65 (size 16 bundles) Reserved
-    KVM_FAULT(65)
+	KVM_FAULT(65)
 
     .org kvm_ia64_ivt+0x7e00
 /////////////////////////////////////////////////////////////////////
 // 0x7e00 Entry 66 (size 16 bundles) Reserved
-    KVM_FAULT(66)
+	KVM_FAULT(66)
 
     .org kvm_ia64_ivt+0x7f00
 ////////////////////////////////////////////////////////////////////
 // 0x7f00 Entry 67 (size 16 bundles) Reserved
-    KVM_FAULT(67)
+	KVM_FAULT(67)
 
     .org kvm_ia64_ivt+0x8000
 // There is no particular reason for this code to be here, other than that
@@ -811,132 +808,128 @@ END(kvm_ia32_intercept)
 
 
 ENTRY(kvm_dtlb_miss_dispatch)
-    mov r19 = 2
-    KVM_SAVE_MIN_WITH_COVER_R19
-    alloc r14=ar.pfs,0,0,3,0
-    mov out0=cr.ifa
-    mov out1=r15
-    adds r3=8,r2                // set up second base pointer
-    ;;
-    ssm psr.ic
-    ;;
-    srlz.i                  // guarantee that interruption collection is on
-    ;;
-    //(p15) ssm psr.i               // restore psr.i
-    addl r14=@gprel(ia64_leave_hypervisor_prepare),gp
-    ;;
-    KVM_SAVE_REST
-    KVM_SAVE_EXTRA
-    mov rp=r14
-    ;;
-    adds out2=16,r12
-    br.call.sptk.many b6=kvm_page_fault
+	mov r19 = 2
+	KVM_SAVE_MIN_WITH_COVER_R19
+	alloc r14=ar.pfs,0,0,3,0
+	mov out0=cr.ifa
+	mov out1=r15
+	adds r3=8,r2                // set up second base pointer
+	;;
+	ssm psr.ic
+	;;
+	srlz.i     // guarantee that interruption collection is on
+	;;
+	//(p15) ssm psr.i               // restore psr.i
+	addl r14=@gprel(ia64_leave_hypervisor_prepare),gp
+	;;
+	KVM_SAVE_REST
+	KVM_SAVE_EXTRA
+	mov rp=r14
+	;;
+	adds out2=16,r12
+	br.call.sptk.many b6=kvm_page_fault
 END(kvm_dtlb_miss_dispatch)
 
 ENTRY(kvm_itlb_miss_dispatch)
 
-    KVM_SAVE_MIN_WITH_COVER_R19
-    alloc r14=ar.pfs,0,0,3,0
-    mov out0=cr.ifa
-    mov out1=r15
-    adds r3=8,r2                // set up second base pointer
-    ;;
-    ssm psr.ic
-    ;;
-    srlz.i                  // guarantee that interruption collection is on
-    ;;
-    //(p15) ssm psr.i               // restore psr.i
-    addl r14=@gprel(ia64_leave_hypervisor),gp
-    ;;
-    KVM_SAVE_REST
-    mov rp=r14
-    ;;
-    adds out2=16,r12
-    br.call.sptk.many b6=kvm_page_fault
+	KVM_SAVE_MIN_WITH_COVER_R19
+	alloc r14=ar.pfs,0,0,3,0
+	mov out0=cr.ifa
+	mov out1=r15
+	adds r3=8,r2                // set up second base pointer
+	;;
+	ssm psr.ic
+	;;
+	srlz.i   // guarantee that interruption collection is on
+	;;
+	//(p15) ssm psr.i               // restore psr.i
+	addl r14=@gprel(ia64_leave_hypervisor),gp
+	;;
+	KVM_SAVE_REST
+	mov rp=r14
+	;;
+	adds out2=16,r12
+	br.call.sptk.many b6=kvm_page_fault
 END(kvm_itlb_miss_dispatch)
 
 ENTRY(kvm_dispatch_reflection)
-    /*
-     * Input:
-     *  psr.ic: off
-     *  r19:    intr type (offset into ivt, see ia64_int.h)
-     *  r31:    contains saved predicates (pr)
-     */
-    KVM_SAVE_MIN_WITH_COVER_R19
-    alloc r14=ar.pfs,0,0,5,0
-    mov out0=cr.ifa
-    mov out1=cr.isr
-    mov out2=cr.iim
-    mov out3=r15
-    adds r3=8,r2                // set up second base pointer
-    ;;
-    ssm psr.ic
-    ;;
-    srlz.i                  // guarantee that interruption collection is on
-    ;;
-    //(p15) ssm psr.i               // restore psr.i
-    addl r14=@gprel(ia64_leave_hypervisor),gp
-    ;;
-    KVM_SAVE_REST
-    mov rp=r14
-    ;;
-    adds out4=16,r12
-    br.call.sptk.many b6=reflect_interruption
+/*
+ * Input:
+ *  psr.ic: off
+ *  r19:    intr type (offset into ivt, see ia64_int.h)
+ *  r31:    contains saved predicates (pr)
+ */
+	KVM_SAVE_MIN_WITH_COVER_R19
+	alloc r14=ar.pfs,0,0,5,0
+	mov out0=cr.ifa
+	mov out1=cr.isr
+	mov out2=cr.iim
+	mov out3=r15
+	adds r3=8,r2                // set up second base pointer
+	;;
+	ssm psr.ic
+	;;
+	srlz.i   // guarantee that interruption collection is on
+	;;
+	//(p15) ssm psr.i               // restore psr.i
+	addl r14=@gprel(ia64_leave_hypervisor),gp
+	;;
+	KVM_SAVE_REST
+	mov rp=r14
+	;;
+	adds out4=16,r12
+	br.call.sptk.many b6=reflect_interruption
 END(kvm_dispatch_reflection)
 
 ENTRY(kvm_dispatch_virtualization_fault)
-    adds r16 = VMM_VCPU_CAUSE_OFFSET,r21
-    adds r17 = VMM_VCPU_OPCODE_OFFSET,r21
-    ;;
-    st8 [r16] = r24
-    st8 [r17] = r25
-    ;;
-    KVM_SAVE_MIN_WITH_COVER_R19
-    ;;
-    alloc r14=ar.pfs,0,0,2,0 // now it's safe (must be first in insn group!)
-    mov out0=r13        //vcpu
-    adds r3=8,r2                // set up second base pointer
-    ;;
-    ssm psr.ic
-    ;;
-    srlz.i                  // guarantee that interruption collection is on
-    ;;
-    //(p15) ssm psr.i               // restore psr.i
-    addl r14=@gprel(ia64_leave_hypervisor_prepare),gp
-    ;;
-    KVM_SAVE_REST
-    KVM_SAVE_EXTRA
-    mov rp=r14
-    ;;
-    adds out1=16,sp         //regs
-    br.call.sptk.many b6=kvm_emulate
+	adds r16 = VMM_VCPU_CAUSE_OFFSET,r21
+	adds r17 = VMM_VCPU_OPCODE_OFFSET,r21
+	;;
+	st8 [r16] = r24
+	st8 [r17] = r25
+	;;
+	KVM_SAVE_MIN_WITH_COVER_R19
+	;;
+	alloc r14=ar.pfs,0,0,2,0 // (must be first in insn group!)
+	mov out0=r13        //vcpu
+	adds r3=8,r2                // set up second base pointer
+	;;
+	ssm psr.ic
+	;;
+	srlz.i    // guarantee that interruption collection is on
+	;;
+	//(p15) ssm psr.i               // restore psr.i
+	addl r14=@gprel(ia64_leave_hypervisor_prepare),gp
+	;;
+	KVM_SAVE_REST
+	KVM_SAVE_EXTRA
+	mov rp=r14
+	;;
+	adds out1=16,sp         //regs
+	br.call.sptk.many b6=kvm_emulate
 END(kvm_dispatch_virtualization_fault)
 
 
 ENTRY(kvm_dispatch_interrupt)
-    KVM_SAVE_MIN_WITH_COVER_R19	// uses r31; defines r2 and r3
-    ;;
-    alloc r14=ar.pfs,0,0,1,0 // must be first in an insn group
-    //mov out0=cr.ivr		// pass cr.ivr as first arg
-    adds r3=8,r2		// set up second base pointer for SAVE_REST
-    ;;
-    ssm psr.ic
-    ;;
-    srlz.i
-    ;;
-    //(p15) ssm psr.i
-    addl r14=@gprel(ia64_leave_hypervisor),gp
-    ;;
-    KVM_SAVE_REST
-    mov rp=r14
-    ;;
-    mov out0=r13		// pass pointer to pt_regs as second arg
-    br.call.sptk.many b6=kvm_ia64_handle_irq
+	KVM_SAVE_MIN_WITH_COVER_R19	// uses r31; defines r2 and r3
+	;;
+	alloc r14=ar.pfs,0,0,1,0 // must be first in an insn group
+	adds r3=8,r2		// set up second base pointer for SAVE_REST
+	;;
+	ssm psr.ic
+	;;
+	srlz.i
+	;;
+	//(p15) ssm psr.i
+	addl r14=@gprel(ia64_leave_hypervisor),gp
+	;;
+	KVM_SAVE_REST
+	mov rp=r14
+	;;
+	mov out0=r13		// pass pointer to pt_regs as second arg
+	br.call.sptk.many b6=kvm_ia64_handle_irq
 END(kvm_dispatch_interrupt)
 
-
-
-
 GLOBAL_ENTRY(ia64_leave_nested)
 	rsm psr.i
 	;;
@@ -1015,7 +1008,7 @@ GLOBAL_ENTRY(ia64_leave_nested)
 	;;
 	ldf.fill f11=[r2]
 //	mov r18=r13
-//    mov r21=r13
+//	mov r21=r13
 	adds r16=PT(CR_IPSR)+16,r12
 	adds r17=PT(CR_IIP)+16,r12
 	;;
@@ -1065,138 +1058,135 @@ GLOBAL_ENTRY(ia64_leave_nested)
 	rfi
 END(ia64_leave_nested)
 
-
-
 GLOBAL_ENTRY(ia64_leave_hypervisor_prepare)
-    /*
-     * work.need_resched etc. mustn't get changed
-     *by this CPU before it returns to
-    ;;
-     * user- or fsys-mode, hence we disable interrupts early on:
-     */
-    adds r2 = PT(R4)+16,r12
-    adds r3 = PT(R5)+16,r12
-    adds r8 = PT(EML_UNAT)+16,r12
-    ;;
-    ld8 r8 = [r8]
-    ;;
-    mov ar.unat=r8
-    ;;
-    ld8.fill r4=[r2],16    //load r4
-    ld8.fill r5=[r3],16    //load r5
-    ;;
-    ld8.fill r6=[r2]    //load r6
-    ld8.fill r7=[r3]    //load r7
-    ;;
+/*
+ * work.need_resched etc. mustn't get changed
+ *by this CPU before it returns to
+ * user- or fsys-mode, hence we disable interrupts early on:
+ */
+	adds r2 = PT(R4)+16,r12
+	adds r3 = PT(R5)+16,r12
+	adds r8 = PT(EML_UNAT)+16,r12
+	;;
+	ld8 r8 = [r8]
+	;;
+	mov ar.unat=r8
+	;;
+	ld8.fill r4=[r2],16    //load r4
+	ld8.fill r5=[r3],16    //load r5
+	;;
+	ld8.fill r6=[r2]    //load r6
+	ld8.fill r7=[r3]    //load r7
+	;;
 END(ia64_leave_hypervisor_prepare)
 //fall through
 GLOBAL_ENTRY(ia64_leave_hypervisor)
-    rsm psr.i
-    ;;
-    br.call.sptk.many b0=leave_hypervisor_tail
-    ;;
-    adds r20=PT(PR)+16,r12
-    adds r8=PT(EML_UNAT)+16,r12
-    ;;
-    ld8 r8=[r8]
-    ;;
-    mov ar.unat=r8
-    ;;
-    lfetch [r20],PT(CR_IPSR)-PT(PR)
-    adds r2 = PT(B6)+16,r12
-    adds r3 = PT(B7)+16,r12
-    ;;
-    lfetch [r20]
-    ;;
-    ld8 r24=[r2],16        /* B6 */
-    ld8 r25=[r3],16        /* B7 */
-    ;;
-    ld8 r26=[r2],16        /* ar_csd */
-    ld8 r27=[r3],16        /* ar_ssd */
-    mov b6 = r24
-    ;;
-    ld8.fill r8=[r2],16
-    ld8.fill r9=[r3],16
-    mov b7 = r25
-    ;;
-    mov ar.csd = r26
-    mov ar.ssd = r27
-    ;;
-    ld8.fill r10=[r2],PT(R15)-PT(R10)
-    ld8.fill r11=[r3],PT(R14)-PT(R11)
-    ;;
-    ld8.fill r15=[r2],PT(R16)-PT(R15)
-    ld8.fill r14=[r3],PT(R17)-PT(R14)
-    ;;
-    ld8.fill r16=[r2],16
-    ld8.fill r17=[r3],16
-    ;;
-    ld8.fill r18=[r2],16
-    ld8.fill r19=[r3],16
-    ;;
-    ld8.fill r20=[r2],16
-    ld8.fill r21=[r3],16
-    ;;
-    ld8.fill r22=[r2],16
-    ld8.fill r23=[r3],16
-    ;;
-    ld8.fill r24=[r2],16
-    ld8.fill r25=[r3],16
-    ;;
-    ld8.fill r26=[r2],16
-    ld8.fill r27=[r3],16
-    ;;
-    ld8.fill r28=[r2],16
-    ld8.fill r29=[r3],16
-    ;;
-    ld8.fill r30=[r2],PT(F6)-PT(R30)
-    ld8.fill r31=[r3],PT(F7)-PT(R31)
-    ;;
-    rsm psr.i | psr.ic
-    // initiate turning off of interrupt and interruption collection
-    invala          // invalidate ALAT
-    ;;
-    srlz.i          // ensure interruption collection is off
-    ;;
-    bsw.0
-    ;;
-    adds r16 = PT(CR_IPSR)+16,r12
-    adds r17 = PT(CR_IIP)+16,r12
-    mov r21=r13		// get current
-    ;;
-    ld8 r31=[r16],16    // load cr.ipsr
-    ld8 r30=[r17],16    // load cr.iip
-    ;;
-    ld8 r29=[r16],16    // load cr.ifs
-    ld8 r28=[r17],16    // load ar.unat
-    ;;
-    ld8 r27=[r16],16    // load ar.pfs
-    ld8 r26=[r17],16    // load ar.rsc
-    ;;
-    ld8 r25=[r16],16    // load ar.rnat
-    ld8 r24=[r17],16    // load ar.bspstore
-    ;;
-    ld8 r23=[r16],16    // load predicates
-    ld8 r22=[r17],16    // load b0
-    ;;
-    ld8 r20=[r16],16    // load ar.rsc value for "loadrs"
-    ld8.fill r1=[r17],16    //load r1
-    ;;
-    ld8.fill r12=[r16],16    //load r12
-    ld8.fill r13=[r17],PT(R2)-PT(R13)    //load r13
-    ;;
-    ld8 r19=[r16],PT(R3)-PT(AR_FPSR)    //load ar_fpsr
-    ld8.fill r2=[r17],PT(AR_CCV)-PT(R2)    //load r2
-    ;;
-    ld8.fill r3=[r16]	//load r3
-    ld8 r18=[r17]	//load ar_ccv
-    ;;
-    mov ar.fpsr=r19
-    mov ar.ccv=r18
-    shr.u r18=r20,16
-    ;;
+	rsm psr.i
+	;;
+	br.call.sptk.many b0=leave_hypervisor_tail
+	;;
+	adds r20=PT(PR)+16,r12
+	adds r8=PT(EML_UNAT)+16,r12
+	;;
+	ld8 r8=[r8]
+	;;
+	mov ar.unat=r8
+	;;
+	lfetch [r20],PT(CR_IPSR)-PT(PR)
+	adds r2 = PT(B6)+16,r12
+	adds r3 = PT(B7)+16,r12
+	;;
+	lfetch [r20]
+	;;
+	ld8 r24=[r2],16        /* B6 */
+	ld8 r25=[r3],16        /* B7 */
+	;;
+	ld8 r26=[r2],16        /* ar_csd */
+	ld8 r27=[r3],16        /* ar_ssd */
+	mov b6 = r24
+	;;
+	ld8.fill r8=[r2],16
+	ld8.fill r9=[r3],16
+	mov b7 = r25
+	;;
+	mov ar.csd = r26
+	mov ar.ssd = r27
+	;;
+	ld8.fill r10=[r2],PT(R15)-PT(R10)
+	ld8.fill r11=[r3],PT(R14)-PT(R11)
+	;;
+	ld8.fill r15=[r2],PT(R16)-PT(R15)
+	ld8.fill r14=[r3],PT(R17)-PT(R14)
+	;;
+	ld8.fill r16=[r2],16
+	ld8.fill r17=[r3],16
+	;;
+	ld8.fill r18=[r2],16
+	ld8.fill r19=[r3],16
+	;;
+	ld8.fill r20=[r2],16
+	ld8.fill r21=[r3],16
+	;;
+	ld8.fill r22=[r2],16
+	ld8.fill r23=[r3],16
+	;;
+	ld8.fill r24=[r2],16
+	ld8.fill r25=[r3],16
+	;;
+	ld8.fill r26=[r2],16
+	ld8.fill r27=[r3],16
+	;;
+	ld8.fill r28=[r2],16
+	ld8.fill r29=[r3],16
+	;;
+	ld8.fill r30=[r2],PT(F6)-PT(R30)
+	ld8.fill r31=[r3],PT(F7)-PT(R31)
+	;;
+	rsm psr.i | psr.ic
+	// initiate turning off of interrupt and interruption collection
+	invala          // invalidate ALAT
+	;;
+	srlz.i          // ensure interruption collection is off
+	;;
+	bsw.0
+	;;
+	adds r16 = PT(CR_IPSR)+16,r12
+	adds r17 = PT(CR_IIP)+16,r12
+	mov r21=r13		// get current
+	;;
+	ld8 r31=[r16],16    // load cr.ipsr
+	ld8 r30=[r17],16    // load cr.iip
+	;;
+	ld8 r29=[r16],16    // load cr.ifs
+	ld8 r28=[r17],16    // load ar.unat
+	;;
+	ld8 r27=[r16],16    // load ar.pfs
+	ld8 r26=[r17],16    // load ar.rsc
+	;;
+	ld8 r25=[r16],16    // load ar.rnat
+	ld8 r24=[r17],16    // load ar.bspstore
+	;;
+	ld8 r23=[r16],16    // load predicates
+	ld8 r22=[r17],16    // load b0
+	;;
+	ld8 r20=[r16],16    // load ar.rsc value for "loadrs"
+	ld8.fill r1=[r17],16    //load r1
+	;;
+	ld8.fill r12=[r16],16    //load r12
+	ld8.fill r13=[r17],PT(R2)-PT(R13)    //load r13
+	;;
+	ld8 r19=[r16],PT(R3)-PT(AR_FPSR)    //load ar_fpsr
+	ld8.fill r2=[r17],PT(AR_CCV)-PT(R2)    //load r2
+	;;
+	ld8.fill r3=[r16]	//load r3
+	ld8 r18=[r17]	//load ar_ccv
+	;;
+	mov ar.fpsr=r19
+	mov ar.ccv=r18
+	shr.u r18=r20,16
+	;;
 kvm_rbs_switch:
-    mov r19=96
+	mov r19=96
 
 kvm_dont_preserve_current_frame:
 /*
@@ -1208,76 +1198,76 @@ kvm_dont_preserve_current_frame:
 #   define pReturn	p7
 #   define Nregs	14
 
-    alloc loc0=ar.pfs,2,Nregs-2,2,0
-    shr.u loc1=r18,9		// RNaTslots <= floor(dirtySize / (64*8))
-    sub r19=r19,r18		// r19 = (physStackedSize + 8) - dirtySize
-    ;;
-    mov ar.rsc=r20		// load ar.rsc to be used for "loadrs"
-    shladd in0=loc1,3,r19
-    mov in1=0
-    ;;
-    TEXT_ALIGN(32)
+	alloc loc0=ar.pfs,2,Nregs-2,2,0
+	shr.u loc1=r18,9	// RNaTslots <= floor(dirtySize / (64*8))
+	sub r19=r19,r18		// r19 = (physStackedSize + 8) - dirtySize
+	;;
+	mov ar.rsc=r20		// load ar.rsc to be used for "loadrs"
+	shladd in0=loc1,3,r19
+	mov in1=0
+	;;
+	TEXT_ALIGN(32)
 kvm_rse_clear_invalid:
-    alloc loc0=ar.pfs,2,Nregs-2,2,0
-    cmp.lt pRecurse,p0=Nregs*8,in0
-    // if more than Nregs regs left to clear, (re)curse
-    add out0=-Nregs*8,in0
-    add out1=1,in1		// increment recursion count
-    mov loc1=0
-    mov loc2=0
-    ;;
-    mov loc3=0
-    mov loc4=0
-    mov loc5=0
-    mov loc6=0
-    mov loc7=0
+	alloc loc0=ar.pfs,2,Nregs-2,2,0
+	cmp.lt pRecurse,p0=Nregs*8,in0
+	// if more than Nregs regs left to clear, (re)curse
+	add out0=-Nregs*8,in0
+	add out1=1,in1		// increment recursion count
+	mov loc1=0
+	mov loc2=0
+	;;
+	mov loc3=0
+	mov loc4=0
+	mov loc5=0
+	mov loc6=0
+	mov loc7=0
 (pRecurse) br.call.dptk.few b0=kvm_rse_clear_invalid
-    ;;
-    mov loc8=0
-    mov loc9=0
-    cmp.ne pReturn,p0=r0,in1
-    // if recursion count != 0, we need to do a br.ret
-    mov loc10=0
-    mov loc11=0
+	;;
+	mov loc8=0
+	mov loc9=0
+	cmp.ne pReturn,p0=r0,in1
+	// if recursion count != 0, we need to do a br.ret
+	mov loc10=0
+	mov loc11=0
 (pReturn) br.ret.dptk.many b0
 
 #	undef pRecurse
 #	undef pReturn
 
 // loadrs has already been shifted
-    alloc r16=ar.pfs,0,0,0,0    // drop current register frame
-    ;;
-    loadrs
-    ;;
-    mov ar.bspstore=r24
-    ;;
-    mov ar.unat=r28
-    mov ar.rnat=r25
-    mov ar.rsc=r26
-    ;;
-    mov cr.ipsr=r31
-    mov cr.iip=r30
-    mov cr.ifs=r29
-    mov ar.pfs=r27
-    adds r18=VMM_VPD_BASE_OFFSET,r21
-    ;;
-    ld8 r18=[r18]   //vpd
-    adds r17=VMM_VCPU_ISR_OFFSET,r21
-    ;;
-    ld8 r17=[r17]
-    adds r19=VMM_VPD_VPSR_OFFSET,r18
-    ;;
-    ld8 r19=[r19]        //vpsr
-    mov r25=r18
-    adds r16= VMM_VCPU_GP_OFFSET,r21
-    ;;
-    ld8 r16= [r16] // Put gp in r24
-    movl r24=@gprel(ia64_vmm_entry)  // calculate return address
-    ;;
-    add  r24=r24,r16
-    ;;
-    br.sptk.many  kvm_vps_sync_write       // call the service
-    ;;
+	alloc r16=ar.pfs,0,0,0,0    // drop current register frame
+	;;
+	loadrs
+	;;
+	mov ar.bspstore=r24
+	;;
+	mov ar.unat=r28
+	mov ar.rnat=r25
+	mov ar.rsc=r26
+	;;
+	mov cr.ipsr=r31
+	mov cr.iip=r30
+	mov cr.ifs=r29
+	mov ar.pfs=r27
+	adds r18=VMM_VPD_BASE_OFFSET,r21
+	;;
+	ld8 r18=[r18]   //vpd
+	adds r17=VMM_VCPU_ISR_OFFSET,r21
+	;;
+	ld8 r17=[r17]
+	adds r19=VMM_VPD_VPSR_OFFSET,r18
+	;;
+	ld8 r19=[r19]        //vpsr
+	mov r25=r18
+	adds r16= VMM_VCPU_GP_OFFSET,r21
+	;;
+	ld8 r16= [r16] // Put gp in r24
+	movl r24=@gprel(ia64_vmm_entry)  // calculate return address
+	;;
+	add  r24=r24,r16
+	;;
+	br.sptk.many  kvm_vps_sync_write       // call the service
+	;;
 END(ia64_leave_hypervisor)
 // fall through
 GLOBAL_ENTRY(ia64_vmm_entry)
@@ -1290,16 +1280,14 @@ GLOBAL_ENTRY(ia64_vmm_entry)
  *  r22:b0
  *  r23:predicate
  */
-    mov r24=r22
-    mov r25=r18
-    tbit.nz p1,p2 = r19,IA64_PSR_IC_BIT        // p1=vpsr.ic
-    (p1) br.cond.sptk.few kvm_vps_resume_normal
-    (p2) br.cond.sptk.many kvm_vps_resume_handler
-    ;;
+	mov r24=r22
+	mov r25=r18
+	tbit.nz p1,p2 = r19,IA64_PSR_IC_BIT        // p1=vpsr.ic
+(p1) 	br.cond.sptk.few kvm_vps_resume_normal
+(p2)	br.cond.sptk.many kvm_vps_resume_handler
+	;;
 END(ia64_vmm_entry)
 
-
-
 /*
  * extern u64 ia64_call_vsa(u64 proc, u64 arg1, u64 arg2,
  *                  u64 arg3, u64 arg4, u64 arg5,
@@ -1317,88 +1305,88 @@ psrsave =   loc2
 entry   =   loc3
 hostret =   r24
 
-    alloc   pfssave=ar.pfs,4,4,0,0
-    mov rpsave=rp
-    adds entry=VMM_VCPU_VSA_BASE_OFFSET, r13
-    ;;
-    ld8 entry=[entry]
-1:  mov hostret=ip
-    mov r25=in1         // copy arguments
-    mov r26=in2
-    mov r27=in3
-    mov psrsave=psr
-    ;;
-    tbit.nz p6,p0=psrsave,14    // IA64_PSR_I
-    tbit.nz p7,p0=psrsave,13    // IA64_PSR_IC
-    ;;
-    add hostret=2f-1b,hostret   // calculate return address
-    add entry=entry,in0
-    ;;
-    rsm psr.i | psr.ic
-    ;;
-    srlz.i
-    mov b6=entry
-    br.cond.sptk b6         // call the service
+	alloc   pfssave=ar.pfs,4,4,0,0
+	mov rpsave=rp
+	adds entry=VMM_VCPU_VSA_BASE_OFFSET, r13
+	;;
+	ld8 entry=[entry]
+1:	mov hostret=ip
+	mov r25=in1         // copy arguments
+	mov r26=in2
+	mov r27=in3
+	mov psrsave=psr
+	;;
+	tbit.nz p6,p0=psrsave,14    // IA64_PSR_I
+	tbit.nz p7,p0=psrsave,13    // IA64_PSR_IC
+	;;
+	add hostret=2f-1b,hostret   // calculate return address
+	add entry=entry,in0
+	;;
+	rsm psr.i | psr.ic
+	;;
+	srlz.i
+	mov b6=entry
+	br.cond.sptk b6         // call the service
 2:
-    // Architectural sequence for enabling interrupts if necessary
+// Architectural sequence for enabling interrupts if necessary
 (p7)    ssm psr.ic
-    ;;
+	;;
 (p7)    srlz.i
-    ;;
+	;;
 //(p6)    ssm psr.i
-    ;;
-    mov rp=rpsave
-    mov ar.pfs=pfssave
-    mov r8=r31
-    ;;
-    srlz.d
-    br.ret.sptk rp
+	;;
+	mov rp=rpsave
+	mov ar.pfs=pfssave
+	mov r8=r31
+	;;
+	srlz.d
+	br.ret.sptk rp
 
 END(ia64_call_vsa)
 
 #define  INIT_BSPSTORE  ((4<<30)-(12<<20)-0x100)
 
 GLOBAL_ENTRY(vmm_reset_entry)
-    //set up ipsr, iip, vpd.vpsr, dcr
-    // For IPSR: it/dt/rt=1, i/ic=1, si=1, vm/bn=1
-    // For DCR: all bits 0
-    bsw.0
-    ;;
-    mov r21 =r13
-    adds r14=-VMM_PT_REGS_SIZE, r12
-    ;;
-    movl r6=0x501008826000      // IPSR dt/rt/it:1;i/ic:1, si:1, vm/bn:1
-    movl r10=0x8000000000000000
-    adds r16=PT(CR_IIP), r14
-    adds r20=PT(R1), r14
-    ;;
-    rsm psr.ic | psr.i
-    ;;
-    srlz.i
-    ;;
-    mov ar.rsc = 0
-    ;;
-    flushrs
-    ;;
-    mov ar.bspstore = 0
-    // clear BSPSTORE
-    ;;
-    mov cr.ipsr=r6
-    mov cr.ifs=r10
-    ld8 r4 = [r16] // Set init iip for first run.
-    ld8 r1 = [r20]
-    ;;
-    mov cr.iip=r4
-    adds r16=VMM_VPD_BASE_OFFSET,r13
-    ;;
-    ld8 r18=[r16]
-    ;;
-    adds r19=VMM_VPD_VPSR_OFFSET,r18
-    ;;
-    ld8 r19=[r19]
-    mov r17=r0
-    mov r22=r0
-    mov r23=r0
-    br.cond.sptk ia64_vmm_entry
-    br.ret.sptk  b0
+	//set up ipsr, iip, vpd.vpsr, dcr
+	// For IPSR: it/dt/rt=1, i/ic=1, si=1, vm/bn=1
+	// For DCR: all bits 0
+	bsw.0
+	;;
+	mov r21 =r13
+	adds r14=-VMM_PT_REGS_SIZE, r12
+	;;
+	movl r6=0x501008826000      // IPSR dt/rt/it:1;i/ic:1, si:1, vm/bn:1
+	movl r10=0x8000000000000000
+	adds r16=PT(CR_IIP), r14
+	adds r20=PT(R1), r14
+	;;
+	rsm psr.ic | psr.i
+	;;
+	srlz.i
+	;;
+	mov ar.rsc = 0
+	;;
+	flushrs
+	;;
+	mov ar.bspstore = 0
+	// clear BSPSTORE
+	;;
+	mov cr.ipsr=r6
+	mov cr.ifs=r10
+	ld8 r4 = [r16] // Set init iip for first run.
+	ld8 r1 = [r20]
+	;;
+	mov cr.iip=r4
+	adds r16=VMM_VPD_BASE_OFFSET,r13
+	;;
+	ld8 r18=[r16]
+	;;
+	adds r19=VMM_VPD_VPSR_OFFSET,r18
+	;;
+	ld8 r19=[r19]
+	mov r17=r0
+	mov r22=r0
+	mov r23=r0
+	br.cond.sptk ia64_vmm_entry
+	br.ret.sptk  b0
 END(vmm_reset_entry)

From df203ec9a77a7236cb90456664d714423b98a977 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 23 Nov 2008 18:08:57 +0200
Subject: [PATCH 077/140] KVM: VMX: Conditionally request interrupt window
 after injecting irq

If we're injecting an interrupt, and another one is pending, request
an interrupt window notification so we don't have excess latency on the
second interrupt.

This shouldn't happen in practice since an EOI will be issued, giving a second
chance to request an interrupt window, but...

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f5958a7823f4..7ea485543cf8 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3304,6 +3304,8 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 	if (vcpu->arch.interrupt.pending) {
 		vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
 		kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr);
+		if (kvm_cpu_has_interrupt(vcpu))
+			enable_irq_window(vcpu);
 	}
 }
 

From 423cd25a5ade17b8a5cc85e6f0a0f37028d2c4a2 Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@redhat.com>
Date: Mon, 24 Nov 2008 15:45:23 -0200
Subject: [PATCH 078/140] x86: KVM guest: sign kvmclock as paravirt

Currently, we only set the KVM paravirt signature in case
of CONFIG_KVM_GUEST. However, it is possible to have it turned
off, while CONFIG_KVM_CLOCK is turned on. This is also a paravirt
case, and should be shown accordingly.

Signed-off-by: Glauber Costa <glommer@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kernel/kvmclock.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index e169ae9b6a62..b38e801014e3 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -194,5 +194,7 @@ void __init kvmclock_init(void)
 #endif
 		kvm_get_preset_lpj();
 		clocksource_register(&kvm_clock);
+		pv_info.paravirt_enabled = 1;
+		pv_info.name = "KVM";
 	}
 }

From 342ffb93006e537fb8cb215b923ce69943a1e820 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Mon, 24 Nov 2008 14:32:49 +0800
Subject: [PATCH 079/140] KVM: Move ack notifier register and IRQ sourcd ID
 request

Distinguish common part for device assignment and INTx part, perparing for
refactor later.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4727c08da2e9..8966fd13e848 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -192,16 +192,31 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
 		return -EINVAL;
 	}
 
-	if (match->irq_requested) {
+	if (!match->irq_requested) {
+		INIT_WORK(&match->interrupt_work,
+				kvm_assigned_dev_interrupt_work_handler);
+		if (irqchip_in_kernel(kvm)) {
+			/* Register ack nofitier */
+			match->ack_notifier.gsi = -1;
+			match->ack_notifier.irq_acked =
+					kvm_assigned_dev_ack_irq;
+			kvm_register_irq_ack_notifier(kvm,
+					&match->ack_notifier);
+
+			/* Request IRQ source ID */
+			r = kvm_request_irq_source_id(kvm);
+			if (r < 0)
+				goto out_release;
+			else
+				match->irq_source_id = r;
+		}
+	} else {
 		match->guest_irq = assigned_irq->guest_irq;
 		match->ack_notifier.gsi = assigned_irq->guest_irq;
 		mutex_unlock(&kvm->lock);
 		return 0;
 	}
 
-	INIT_WORK(&match->interrupt_work,
-		  kvm_assigned_dev_interrupt_work_handler);
-
 	if (irqchip_in_kernel(kvm)) {
 		if (!capable(CAP_SYS_RAWIO)) {
 			r = -EPERM;
@@ -214,13 +229,6 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
 			match->host_irq = match->dev->irq;
 		match->guest_irq = assigned_irq->guest_irq;
 		match->ack_notifier.gsi = assigned_irq->guest_irq;
-		match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
-		kvm_register_irq_ack_notifier(kvm, &match->ack_notifier);
-		r = kvm_request_irq_source_id(kvm);
-		if (r < 0)
-			goto out_release;
-		else
-			match->irq_source_id = r;
 
 		/* Even though this is PCI, we don't want to use shared
 		 * interrupts. Sharing host devices with guest-assigned devices

From 00e3ed39e2e25ffb3417ce1bec8f4b78ed4b85e7 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Mon, 24 Nov 2008 14:32:50 +0800
Subject: [PATCH 080/140] KVM: Separate update irq to a single function

Separate INTx enabling part to a independence function, so that we can add MSI
enabling part easily.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 68 +++++++++++++++++++++++++--------------------
 1 file changed, 38 insertions(+), 30 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8966fd13e848..ef2f03cf42c0 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -176,6 +176,41 @@ void kvm_free_all_assigned_devices(struct kvm *kvm)
 	}
 }
 
+static int assigned_device_update_intx(struct kvm *kvm,
+			struct kvm_assigned_dev_kernel *adev,
+			struct kvm_assigned_irq *airq)
+{
+	if (adev->irq_requested) {
+		adev->guest_irq = airq->guest_irq;
+		adev->ack_notifier.gsi = airq->guest_irq;
+		return 0;
+	}
+
+	if (irqchip_in_kernel(kvm)) {
+		if (!capable(CAP_SYS_RAWIO))
+			return -EPERM;
+
+		if (airq->host_irq)
+			adev->host_irq = airq->host_irq;
+		else
+			adev->host_irq = adev->dev->irq;
+		adev->guest_irq = airq->guest_irq;
+		adev->ack_notifier.gsi = airq->guest_irq;
+
+		/* Even though this is PCI, we don't want to use shared
+		 * interrupts. Sharing host devices with guest-assigned devices
+		 * on the same interrupt line is not a happy situation: there
+		 * are going to be long delays in accepting, acking, etc.
+		 */
+		if (request_irq(adev->host_irq, kvm_assigned_dev_intr,
+				0, "kvm_assigned_intx_device", (void *)adev))
+			return -EIO;
+	}
+
+	adev->irq_requested = true;
+	return 0;
+}
+
 static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
 				   struct kvm_assigned_irq
 				   *assigned_irq)
@@ -210,39 +245,12 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
 			else
 				match->irq_source_id = r;
 		}
-	} else {
-		match->guest_irq = assigned_irq->guest_irq;
-		match->ack_notifier.gsi = assigned_irq->guest_irq;
-		mutex_unlock(&kvm->lock);
-		return 0;
 	}
 
-	if (irqchip_in_kernel(kvm)) {
-		if (!capable(CAP_SYS_RAWIO)) {
-			r = -EPERM;
-			goto out_release;
-		}
+	r = assigned_device_update_intx(kvm, match, assigned_irq);
+	if (r)
+		goto out_release;
 
-		if (assigned_irq->host_irq)
-			match->host_irq = assigned_irq->host_irq;
-		else
-			match->host_irq = match->dev->irq;
-		match->guest_irq = assigned_irq->guest_irq;
-		match->ack_notifier.gsi = assigned_irq->guest_irq;
-
-		/* Even though this is PCI, we don't want to use shared
-		 * interrupts. Sharing host devices with guest-assigned devices
-		 * on the same interrupt line is not a happy situation: there
-		 * are going to be long delays in accepting, acking, etc.
-		 */
-		if (request_irq(match->host_irq, kvm_assigned_dev_intr, 0,
-				"kvm_assigned_device", (void *)match)) {
-			r = -EIO;
-			goto out_release;
-		}
-	}
-
-	match->irq_requested = true;
 	mutex_unlock(&kvm->lock);
 	return r;
 out_release:

From 4f906c19ae29397409bedabf7bbe5cb42ad90332 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Mon, 24 Nov 2008 14:32:51 +0800
Subject: [PATCH 081/140] KVM: Replace irq_requested with more generic
 irq_requested_type

Separate guest irq type and host irq type, for we can support guest using INTx
with host using MSI (but not opposite combination).

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 include/linux/kvm_host.h | 4 +++-
 virt/kvm/kvm_main.c      | 9 +++++----
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3a0fb77d1f6a..c3d4b96a08fa 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -307,7 +307,9 @@ struct kvm_assigned_dev_kernel {
 	int host_devfn;
 	int host_irq;
 	int guest_irq;
-	int irq_requested;
+#define KVM_ASSIGNED_DEV_GUEST_INTX	(1 << 0)
+#define KVM_ASSIGNED_DEV_HOST_INTX	(1 << 8)
+	unsigned long irq_requested_type;
 	int irq_source_id;
 	struct pci_dev *dev;
 	struct kvm *kvm;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ef2f03cf42c0..638de47e167d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -140,7 +140,7 @@ static void kvm_free_assigned_device(struct kvm *kvm,
 				     struct kvm_assigned_dev_kernel
 				     *assigned_dev)
 {
-	if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested)
+	if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested_type)
 		free_irq(assigned_dev->host_irq, (void *)assigned_dev);
 
 	kvm_unregister_irq_ack_notifier(&assigned_dev->ack_notifier);
@@ -180,7 +180,7 @@ static int assigned_device_update_intx(struct kvm *kvm,
 			struct kvm_assigned_dev_kernel *adev,
 			struct kvm_assigned_irq *airq)
 {
-	if (adev->irq_requested) {
+	if (adev->irq_requested_type & KVM_ASSIGNED_DEV_GUEST_INTX) {
 		adev->guest_irq = airq->guest_irq;
 		adev->ack_notifier.gsi = airq->guest_irq;
 		return 0;
@@ -207,7 +207,8 @@ static int assigned_device_update_intx(struct kvm *kvm,
 			return -EIO;
 	}
 
-	adev->irq_requested = true;
+	adev->irq_requested_type = KVM_ASSIGNED_DEV_GUEST_INTX |
+				   KVM_ASSIGNED_DEV_HOST_INTX;
 	return 0;
 }
 
@@ -227,7 +228,7 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
 		return -EINVAL;
 	}
 
-	if (!match->irq_requested) {
+	if (!match->irq_requested_type) {
 		INIT_WORK(&match->interrupt_work,
 				kvm_assigned_dev_interrupt_work_handler);
 		if (irqchip_in_kernel(kvm)) {

From fbac7818d8fba7e1df9f4b209777f3b67b953dd3 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Mon, 24 Nov 2008 14:32:52 +0800
Subject: [PATCH 082/140] KVM: Clean up assigned_device_update_irq

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 638de47e167d..2089f8b68a16 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -180,11 +180,11 @@ static int assigned_device_update_intx(struct kvm *kvm,
 			struct kvm_assigned_dev_kernel *adev,
 			struct kvm_assigned_irq *airq)
 {
-	if (adev->irq_requested_type & KVM_ASSIGNED_DEV_GUEST_INTX) {
-		adev->guest_irq = airq->guest_irq;
-		adev->ack_notifier.gsi = airq->guest_irq;
+	adev->guest_irq = airq->guest_irq;
+	adev->ack_notifier.gsi = airq->guest_irq;
+
+	if (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_INTX)
 		return 0;
-	}
 
 	if (irqchip_in_kernel(kvm)) {
 		if (!capable(CAP_SYS_RAWIO))
@@ -194,8 +194,6 @@ static int assigned_device_update_intx(struct kvm *kvm,
 			adev->host_irq = airq->host_irq;
 		else
 			adev->host_irq = adev->dev->irq;
-		adev->guest_irq = airq->guest_irq;
-		adev->ack_notifier.gsi = airq->guest_irq;
 
 		/* Even though this is PCI, we don't want to use shared
 		 * interrupts. Sharing host devices with guest-assigned devices

From 0937c48d075ddd59ae2c12a6fa8308b9c7a63753 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Mon, 24 Nov 2008 14:32:53 +0800
Subject: [PATCH 083/140] KVM: Add fields for MSI device assignment

Prepared for kvm_arch_assigned_device_msi_dispatch().

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 include/linux/kvm.h      | 7 +++++++
 include/linux/kvm_host.h | 4 ++++
 2 files changed, 11 insertions(+)

diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 44fd7fa0af2b..bb283c388a24 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -507,10 +507,17 @@ struct kvm_assigned_irq {
 	__u32 guest_irq;
 	__u32 flags;
 	union {
+		struct {
+			__u32 addr_lo;
+			__u32 addr_hi;
+			__u32 data;
+		} guest_msi;
 		__u32 reserved[12];
 	};
 };
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 
+#define KVM_DEV_IRQ_ASSIGN_ENABLE_MSI	(1 << 0)
+
 #endif
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c3d4b96a08fa..8091a4d90ddf 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -16,6 +16,7 @@
 #include <linux/mm.h>
 #include <linux/preempt.h>
 #include <linux/marker.h>
+#include <linux/msi.h>
 #include <asm/signal.h>
 
 #include <linux/kvm.h>
@@ -307,8 +308,11 @@ struct kvm_assigned_dev_kernel {
 	int host_devfn;
 	int host_irq;
 	int guest_irq;
+	struct msi_msg guest_msi;
 #define KVM_ASSIGNED_DEV_GUEST_INTX	(1 << 0)
+#define KVM_ASSIGNED_DEV_GUEST_MSI	(1 << 1)
 #define KVM_ASSIGNED_DEV_HOST_INTX	(1 << 8)
+#define KVM_ASSIGNED_DEV_HOST_MSI	(1 << 9)
 	unsigned long irq_requested_type;
 	int irq_source_id;
 	struct pci_dev *dev;

From 68b76f51675809c8ce200a86276c3c7266f17a64 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Mon, 24 Nov 2008 14:32:54 +0800
Subject: [PATCH 084/140] KVM: Export ioapic_get_delivery_bitmask

It would be used for MSI in device assignment, for MSI dispatch.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/ioapic.c | 7 ++++---
 virt/kvm/ioapic.h | 2 ++
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index c8f939c55075..23b81cf242af 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -153,8 +153,8 @@ static void ioapic_inj_nmi(struct kvm_vcpu *vcpu)
 	kvm_vcpu_kick(vcpu);
 }
 
-static u32 ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
-				       u8 dest_mode)
+u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
+				    u8 dest_mode)
 {
 	u32 mask = 0;
 	int i;
@@ -208,7 +208,8 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
 		     "vector=%x trig_mode=%x\n",
 		     dest, dest_mode, delivery_mode, vector, trig_mode);
 
-	deliver_bitmask = ioapic_get_delivery_bitmask(ioapic, dest, dest_mode);
+	deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic, dest,
+							  dest_mode);
 	if (!deliver_bitmask) {
 		ioapic_debug("no target on destination\n");
 		return 0;
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index cd7ae7691c9d..49c9581d2586 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -85,5 +85,7 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
 int kvm_ioapic_init(struct kvm *kvm);
 void kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
 void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
+u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
+				u8 dest_mode);
 
 #endif

From f64769eb05565c74d7fce6fa75d65924f9cdaf79 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Mon, 24 Nov 2008 14:32:55 +0800
Subject: [PATCH 085/140] KVM: Add assigned_device_msi_dispatch()

The function is used to dispatch MSI to lapic according to MSI message
address and message data.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 55 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2089f8b68a16..228c1d18a614 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -47,6 +47,10 @@
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
+#ifdef CONFIG_X86
+#include <asm/msidef.h>
+#endif
+
 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
 #include "coalesced_mmio.h"
 #endif
@@ -78,6 +82,57 @@ static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
 bool kvm_rebooting;
 
 #ifdef KVM_CAP_DEVICE_ASSIGNMENT
+
+#ifdef CONFIG_X86
+static void assigned_device_msi_dispatch(struct kvm_assigned_dev_kernel *dev)
+{
+	int vcpu_id;
+	struct kvm_vcpu *vcpu;
+	struct kvm_ioapic *ioapic = ioapic_irqchip(dev->kvm);
+	int dest_id = (dev->guest_msi.address_lo & MSI_ADDR_DEST_ID_MASK)
+			>> MSI_ADDR_DEST_ID_SHIFT;
+	int vector = (dev->guest_msi.data & MSI_DATA_VECTOR_MASK)
+			>> MSI_DATA_VECTOR_SHIFT;
+	int dest_mode = test_bit(MSI_ADDR_DEST_MODE_SHIFT,
+				(unsigned long *)&dev->guest_msi.address_lo);
+	int trig_mode = test_bit(MSI_DATA_TRIGGER_SHIFT,
+				(unsigned long *)&dev->guest_msi.data);
+	int delivery_mode = test_bit(MSI_DATA_DELIVERY_MODE_SHIFT,
+				(unsigned long *)&dev->guest_msi.data);
+	u32 deliver_bitmask;
+
+	BUG_ON(!ioapic);
+
+	deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic,
+				dest_id, dest_mode);
+	/* IOAPIC delivery mode value is the same as MSI here */
+	switch (delivery_mode) {
+	case IOAPIC_LOWEST_PRIORITY:
+		vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector,
+				deliver_bitmask);
+		if (vcpu != NULL)
+			kvm_apic_set_irq(vcpu, vector, trig_mode);
+		else
+			printk(KERN_INFO "kvm: null lowest priority vcpu!\n");
+		break;
+	case IOAPIC_FIXED:
+		for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
+			if (!(deliver_bitmask & (1 << vcpu_id)))
+				continue;
+			deliver_bitmask &= ~(1 << vcpu_id);
+			vcpu = ioapic->kvm->vcpus[vcpu_id];
+			if (vcpu)
+				kvm_apic_set_irq(vcpu, vector, trig_mode);
+		}
+		break;
+	default:
+		printk(KERN_INFO "kvm: unsupported MSI delivery mode\n");
+	}
+}
+#else
+static void assigned_device_msi_dispatch(struct kvm_assigned_dev_kernel *dev) {}
+#endif
+
 static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
 						      int assigned_dev_id)
 {

From 6b9cc7fd469869bed38831c5adac3f59dc25eaf5 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Mon, 24 Nov 2008 14:32:56 +0800
Subject: [PATCH 086/140] KVM: Enable MSI for device assignment

We enable guest MSI and host MSI support in this patch. The userspace want to
enable MSI should set KVM_DEV_IRQ_ASSIGN_ENABLE_MSI in the assigned_irq's flag.
Function would return -ENOTTY if can't enable MSI, userspace shouldn't set MSI
Enable bit when KVM_ASSIGN_IRQ return -ENOTTY with
KVM_DEV_IRQ_ASSIGN_ENABLE_MSI.

Userspace can tell the support of MSI device from #ifdef KVM_CAP_DEVICE_MSI.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 include/linux/kvm.h |  3 ++
 virt/kvm/kvm_main.c | 81 +++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 78 insertions(+), 6 deletions(-)

diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index bb283c388a24..0997e6f5490c 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -392,6 +392,9 @@ struct kvm_trace_rec {
 #endif
 #define KVM_CAP_IOMMU 18
 #define KVM_CAP_NMI 19
+#if defined(CONFIG_X86)
+#define KVM_CAP_DEVICE_MSI 20
+#endif
 
 /*
  * ioctls for VM fds
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 228c1d18a614..bf36ae9ae7df 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -159,9 +159,15 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
 	 * finer-grained lock, update this
 	 */
 	mutex_lock(&assigned_dev->kvm->lock);
-	kvm_set_irq(assigned_dev->kvm,
-		    assigned_dev->irq_source_id,
-		    assigned_dev->guest_irq, 1);
+	if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_GUEST_INTX)
+		kvm_set_irq(assigned_dev->kvm,
+			    assigned_dev->irq_source_id,
+			    assigned_dev->guest_irq, 1);
+	else if (assigned_dev->irq_requested_type &
+				KVM_ASSIGNED_DEV_GUEST_MSI) {
+		assigned_device_msi_dispatch(assigned_dev);
+		enable_irq(assigned_dev->host_irq);
+	}
 	mutex_unlock(&assigned_dev->kvm->lock);
 	kvm_put_kvm(assigned_dev->kvm);
 }
@@ -197,6 +203,8 @@ static void kvm_free_assigned_device(struct kvm *kvm,
 {
 	if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested_type)
 		free_irq(assigned_dev->host_irq, (void *)assigned_dev);
+	if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI)
+		pci_disable_msi(assigned_dev->dev);
 
 	kvm_unregister_irq_ack_notifier(&assigned_dev->ack_notifier);
 	kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
@@ -242,6 +250,11 @@ static int assigned_device_update_intx(struct kvm *kvm,
 		return 0;
 
 	if (irqchip_in_kernel(kvm)) {
+		if (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI) {
+			free_irq(adev->host_irq, (void *)kvm);
+			pci_disable_msi(adev->dev);
+		}
+
 		if (!capable(CAP_SYS_RAWIO))
 			return -EPERM;
 
@@ -265,6 +278,41 @@ static int assigned_device_update_intx(struct kvm *kvm,
 	return 0;
 }
 
+#ifdef CONFIG_X86
+static int assigned_device_update_msi(struct kvm *kvm,
+			struct kvm_assigned_dev_kernel *adev,
+			struct kvm_assigned_irq *airq)
+{
+	int r;
+
+	/* x86 don't care upper address of guest msi message addr */
+	adev->guest_msi.address_lo = airq->guest_msi.addr_lo;
+	adev->guest_msi.data = airq->guest_msi.data;
+	adev->ack_notifier.gsi = -1;
+
+	if (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI)
+		return 0;
+
+	if (irqchip_in_kernel(kvm)) {
+		if (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_INTX)
+			free_irq(adev->host_irq, (void *)adev);
+
+		r = pci_enable_msi(adev->dev);
+		if (r)
+			return r;
+
+		adev->host_irq = adev->dev->irq;
+		if (request_irq(adev->host_irq, kvm_assigned_dev_intr, 0,
+				"kvm_assigned_msi_device", (void *)adev))
+			return -EIO;
+	}
+
+	adev->irq_requested_type = KVM_ASSIGNED_DEV_GUEST_MSI |
+				   KVM_ASSIGNED_DEV_HOST_MSI;
+	return 0;
+}
+#endif
+
 static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
 				   struct kvm_assigned_irq
 				   *assigned_irq)
@@ -301,9 +349,30 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
 		}
 	}
 
-	r = assigned_device_update_intx(kvm, match, assigned_irq);
-	if (r)
-		goto out_release;
+	if (assigned_irq->flags & KVM_DEV_IRQ_ASSIGN_ENABLE_MSI) {
+#ifdef CONFIG_X86
+		r = assigned_device_update_msi(kvm, match, assigned_irq);
+		if (r) {
+			printk(KERN_WARNING "kvm: failed to enable "
+					"MSI device!\n");
+			goto out_release;
+		}
+#else
+		r = -ENOTTY;
+#endif
+	} else if (assigned_irq->host_irq == 0 && match->dev->irq == 0) {
+		/* Host device IRQ 0 means don't support INTx */
+		printk(KERN_WARNING "kvm: wait device to enable MSI!\n");
+		r = 0;
+	} else {
+		/* Non-sharing INTx mode */
+		r = assigned_device_update_intx(kvm, match, assigned_irq);
+		if (r) {
+			printk(KERN_WARNING "kvm: failed to enable "
+					"INTx device!\n");
+			goto out_release;
+		}
+	}
 
 	mutex_unlock(&kvm->lock);
 	return r;

From 5319c662522db8995ff9276ba9d80549c64b294a Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Mon, 24 Nov 2008 14:32:57 +0800
Subject: [PATCH 087/140] KVM: MSI to INTx translate

Now we use MSI as default one, and translate MSI to INTx when guest need
INTx rather than MSI. For legacy device, we provide support for non-sharing
host IRQ.

Provide a parameter msi2intx for this method. The value is true by default in
x86 architecture.

We can't guarantee this mode can work on every device, but for most of us
tested, it works. If your device encounter some trouble with this mode, you can
try set msi2intx modules parameter to 0. If the device is OK with msi2intx=0,
then please report it to KVM mailing list or me. We may prepare a blacklist for
the device that can't work in this mode.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 68 +++++++++++++++++++++++++++++++++++----------
 1 file changed, 53 insertions(+), 15 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index bf36ae9ae7df..54d25e6f6d2d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -64,6 +64,9 @@
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
+static int msi2intx = 1;
+module_param(msi2intx, bool, 0);
+
 DEFINE_SPINLOCK(kvm_lock);
 LIST_HEAD(vm_list);
 
@@ -250,7 +253,8 @@ static int assigned_device_update_intx(struct kvm *kvm,
 		return 0;
 
 	if (irqchip_in_kernel(kvm)) {
-		if (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI) {
+		if (!msi2intx &&
+		    adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI) {
 			free_irq(adev->host_irq, (void *)kvm);
 			pci_disable_msi(adev->dev);
 		}
@@ -285,21 +289,33 @@ static int assigned_device_update_msi(struct kvm *kvm,
 {
 	int r;
 
-	/* x86 don't care upper address of guest msi message addr */
-	adev->guest_msi.address_lo = airq->guest_msi.addr_lo;
-	adev->guest_msi.data = airq->guest_msi.data;
-	adev->ack_notifier.gsi = -1;
+	if (airq->flags & KVM_DEV_IRQ_ASSIGN_ENABLE_MSI) {
+		/* x86 don't care upper address of guest msi message addr */
+		adev->irq_requested_type |= KVM_ASSIGNED_DEV_GUEST_MSI;
+		adev->irq_requested_type &= ~KVM_ASSIGNED_DEV_GUEST_INTX;
+		adev->guest_msi.address_lo = airq->guest_msi.addr_lo;
+		adev->guest_msi.data = airq->guest_msi.data;
+		adev->ack_notifier.gsi = -1;
+	} else if (msi2intx) {
+		adev->irq_requested_type |= KVM_ASSIGNED_DEV_GUEST_INTX;
+		adev->irq_requested_type &= ~KVM_ASSIGNED_DEV_GUEST_MSI;
+		adev->guest_irq = airq->guest_irq;
+		adev->ack_notifier.gsi = airq->guest_irq;
+	}
 
 	if (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI)
 		return 0;
 
 	if (irqchip_in_kernel(kvm)) {
-		if (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_INTX)
-			free_irq(adev->host_irq, (void *)adev);
+		if (!msi2intx) {
+			if (adev->irq_requested_type &
+					KVM_ASSIGNED_DEV_HOST_INTX)
+				free_irq(adev->host_irq, (void *)adev);
 
-		r = pci_enable_msi(adev->dev);
-		if (r)
-			return r;
+			r = pci_enable_msi(adev->dev);
+			if (r)
+				return r;
+		}
 
 		adev->host_irq = adev->dev->irq;
 		if (request_irq(adev->host_irq, kvm_assigned_dev_intr, 0,
@@ -307,8 +323,10 @@ static int assigned_device_update_msi(struct kvm *kvm,
 			return -EIO;
 	}
 
-	adev->irq_requested_type = KVM_ASSIGNED_DEV_GUEST_MSI |
-				   KVM_ASSIGNED_DEV_HOST_MSI;
+	if (!msi2intx)
+		adev->irq_requested_type = KVM_ASSIGNED_DEV_GUEST_MSI;
+
+	adev->irq_requested_type |= KVM_ASSIGNED_DEV_HOST_MSI;
 	return 0;
 }
 #endif
@@ -346,10 +364,19 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
 				goto out_release;
 			else
 				match->irq_source_id = r;
+
+#ifdef CONFIG_X86
+			/* Determine host device irq type, we can know the
+			 * result from dev->msi_enabled */
+			if (msi2intx)
+				pci_enable_msi(match->dev);
+#endif
 		}
 	}
 
-	if (assigned_irq->flags & KVM_DEV_IRQ_ASSIGN_ENABLE_MSI) {
+	if ((!msi2intx &&
+	     (assigned_irq->flags & KVM_DEV_IRQ_ASSIGN_ENABLE_MSI)) ||
+	    (msi2intx && match->dev->msi_enabled)) {
 #ifdef CONFIG_X86
 		r = assigned_device_update_msi(kvm, match, assigned_irq);
 		if (r) {
@@ -362,8 +389,16 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
 #endif
 	} else if (assigned_irq->host_irq == 0 && match->dev->irq == 0) {
 		/* Host device IRQ 0 means don't support INTx */
-		printk(KERN_WARNING "kvm: wait device to enable MSI!\n");
-		r = 0;
+		if (!msi2intx) {
+			printk(KERN_WARNING
+			       "kvm: wait device to enable MSI!\n");
+			r = 0;
+		} else {
+			printk(KERN_WARNING
+			       "kvm: failed to enable MSI device!\n");
+			r = -ENOTTY;
+			goto out_release;
+		}
 	} else {
 		/* Non-sharing INTx mode */
 		r = assigned_device_update_intx(kvm, match, assigned_irq);
@@ -2209,6 +2244,9 @@ int kvm_init(void *opaque, unsigned int vcpu_size,
 
 	kvm_preempt_ops.sched_in = kvm_sched_in;
 	kvm_preempt_ops.sched_out = kvm_sched_out;
+#ifndef CONFIG_X86
+	msi2intx = 0;
+#endif
 
 	return 0;
 

From ecc5589f19a52e7e6501fe449047b19087ae11bb Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 25 Nov 2008 15:58:07 +0100
Subject: [PATCH 088/140] KVM: MMU: optimize set_spte for page sync

The write protect verification in set_spte is unnecessary for page sync.

Its guaranteed that, if the unsync spte was writable, the target page
does not have a write protected shadow (if it had, the spte would have
been write protected under mmu_lock by rmap_write_protect before).

Same reasoning applies to mark_page_dirty: the gfn has been marked as
dirty via the pagefault path.

The cost of hash table and memslot lookups are quite significant if the
workload is pagetable write intensive resulting in increased mmu_lock
contention.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index fa3486d64078..dd20b199a7c0 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1593,6 +1593,15 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 
 		spte |= PT_WRITABLE_MASK;
 
+		/*
+		 * Optimization: for pte sync, if spte was writable the hash
+		 * lookup is unnecessary (and expensive). Write protection
+		 * is responsibility of mmu_get_page / kvm_sync_page.
+		 * Same reasoning can be applied to dirty page accounting.
+		 */
+		if (!can_unsync && is_writeble_pte(*shadow_pte))
+			goto set_pte;
+
 		if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
 			pgprintk("%s: found shadow page for %lx, marking ro\n",
 				 __func__, gfn);

From dda96d8f1b3de692cce09969ce28fe22e58e5acf Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 26 Nov 2008 15:14:10 +0200
Subject: [PATCH 089/140] KVM: x86 emulator: reduce duplication in one operand
 emulation thunks

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86_emulate.c | 66 +++++++++++++-------------------------
 1 file changed, 23 insertions(+), 43 deletions(-)

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 8f60ace13874..5f87d3e59195 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -359,6 +359,12 @@ static u16 group2_table[] = {
 	"andl %"_msk",%"_LO32 _tmp"; "		\
 	"orl  %"_LO32 _tmp",%"_sav"; "
 
+#ifdef CONFIG_X86_64
+#define ON64(x) x
+#else
+#define ON64(x)
+#endif
+
 /* Raw emulation: instruction has two explicit operands. */
 #define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
 	do { 								    \
@@ -425,42 +431,27 @@ static u16 group2_table[] = {
 	__emulate_2op_nobyte(_op, _src, _dst, _eflags,			\
 			     "w", "r", _LO32, "r", "", "r")
 
-/* Instruction has only one explicit operand (no source operand). */
-#define emulate_1op(_op, _dst, _eflags)                                    \
+#define __emulate_1op(_op, _dst, _eflags, _suffix)			\
 	do {								\
 		unsigned long _tmp;					\
 									\
+		__asm__ __volatile__ (					\
+			_PRE_EFLAGS("0", "3", "2")			\
+			_op _suffix " %1; "				\
+			_POST_EFLAGS("0", "3", "2")			\
+			: "=m" (_eflags), "+m" ((_dst).val),		\
+			  "=&r" (_tmp)					\
+			: "i" (EFLAGS_MASK));				\
+	} while (0)
+
+/* Instruction has only one explicit operand (no source operand). */
+#define emulate_1op(_op, _dst, _eflags)                                    \
+	do {								\
 		switch ((_dst).bytes) {				        \
-		case 1:							\
-			__asm__ __volatile__ (				\
-				_PRE_EFLAGS("0", "3", "2")		\
-				_op"b %1; "				\
-				_POST_EFLAGS("0", "3", "2")		\
-				: "=m" (_eflags), "=m" ((_dst).val),	\
-				  "=&r" (_tmp)				\
-				: "i" (EFLAGS_MASK));			\
-			break;						\
-		case 2:							\
-			__asm__ __volatile__ (				\
-				_PRE_EFLAGS("0", "3", "2")		\
-				_op"w %1; "				\
-				_POST_EFLAGS("0", "3", "2")		\
-				: "=m" (_eflags), "=m" ((_dst).val),	\
-				  "=&r" (_tmp)				\
-				: "i" (EFLAGS_MASK));			\
-			break;						\
-		case 4:							\
-			__asm__ __volatile__ (				\
-				_PRE_EFLAGS("0", "3", "2")		\
-				_op"l %1; "				\
-				_POST_EFLAGS("0", "3", "2")		\
-				: "=m" (_eflags), "=m" ((_dst).val),	\
-				  "=&r" (_tmp)				\
-				: "i" (EFLAGS_MASK));			\
-			break;						\
-		case 8:							\
-			__emulate_1op_8byte(_op, _dst, _eflags);	\
-			break;						\
+		case 1:	__emulate_1op(_op, _dst, _eflags, "b"); break;	\
+		case 2:	__emulate_1op(_op, _dst, _eflags, "w"); break;	\
+		case 4:	__emulate_1op(_op, _dst, _eflags, "l"); break;	\
+		case 8:	ON64(__emulate_1op(_op, _dst, _eflags, "q")); break; \
 		}							\
 	} while (0)
 
@@ -476,19 +467,8 @@ static u16 group2_table[] = {
 			: _qy ((_src).val), "i" (EFLAGS_MASK));		\
 	} while (0)
 
-#define __emulate_1op_8byte(_op, _dst, _eflags)                           \
-	do {								  \
-		__asm__ __volatile__ (					  \
-			_PRE_EFLAGS("0", "3", "2")			  \
-			_op"q %1; "					  \
-			_POST_EFLAGS("0", "3", "2")			  \
-			: "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
-			: "i" (EFLAGS_MASK));				  \
-	} while (0)
-
 #elif defined(__i386__)
 #define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)
-#define __emulate_1op_8byte(_op, _dst, _eflags)
 #endif				/* __i386__ */
 
 /* Fetch next part of the instruction being emulated. */

From 6b7ad61ffb9ca110add6f7fb36cc8a4dd89696a4 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Wed, 26 Nov 2008 15:30:45 +0200
Subject: [PATCH 090/140] KVM: x86 emulator: consolidate emulation of two
 operand instructions

No need to repeat the same assembly block over and over.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86_emulate.c | 79 ++++++++++++++------------------------
 1 file changed, 28 insertions(+), 51 deletions(-)

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 5f87d3e59195..a11af6f74d68 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -365,49 +365,42 @@ static u16 group2_table[] = {
 #define ON64(x)
 #endif
 
+#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix)	\
+	do {								\
+		__asm__ __volatile__ (					\
+			_PRE_EFLAGS("0", "4", "2")			\
+			_op _suffix " %"_x"3,%1; "			\
+			_POST_EFLAGS("0", "4", "2")			\
+			: "=m" (_eflags), "=m" ((_dst).val),		\
+			  "=&r" (_tmp)					\
+			: _y ((_src).val), "i" (EFLAGS_MASK));		\
+	} while (0);
+
+
 /* Raw emulation: instruction has two explicit operands. */
 #define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \
-	do { 								    \
-		unsigned long _tmp;					    \
-									    \
-		switch ((_dst).bytes) {					    \
-		case 2:							    \
-			__asm__ __volatile__ (				    \
-				_PRE_EFLAGS("0", "4", "2")		    \
-				_op"w %"_wx"3,%1; "			    \
-				_POST_EFLAGS("0", "4", "2")		    \
-				: "=m" (_eflags), "=m" ((_dst).val),        \
-				  "=&r" (_tmp)				    \
-				: _wy ((_src).val), "i" (EFLAGS_MASK));     \
-			break;						    \
-		case 4:							    \
-			__asm__ __volatile__ (				    \
-				_PRE_EFLAGS("0", "4", "2")		    \
-				_op"l %"_lx"3,%1; "			    \
-				_POST_EFLAGS("0", "4", "2")		    \
-				: "=m" (_eflags), "=m" ((_dst).val),	    \
-				  "=&r" (_tmp)				    \
-				: _ly ((_src).val), "i" (EFLAGS_MASK));     \
-			break;						    \
-		case 8:							    \
-			__emulate_2op_8byte(_op, _src, _dst,		    \
-					    _eflags, _qx, _qy);		    \
-			break;						    \
-		}							    \
+	do {								\
+		unsigned long _tmp;					\
+									\
+		switch ((_dst).bytes) {					\
+		case 2:							\
+			____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \
+			break;						\
+		case 4:							\
+			____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \
+			break;						\
+		case 8:							\
+			ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \
+			break;						\
+		}							\
 	} while (0)
 
 #define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \
 	do {								     \
-		unsigned long __tmp;					     \
+		unsigned long _tmp;					     \
 		switch ((_dst).bytes) {				             \
 		case 1:							     \
-			__asm__ __volatile__ (				     \
-				_PRE_EFLAGS("0", "4", "2")		     \
-				_op"b %"_bx"3,%1; "			     \
-				_POST_EFLAGS("0", "4", "2")		     \
-				: "=m" (_eflags), "=m" ((_dst).val),	     \
-				  "=&r" (__tmp)				     \
-				: _by ((_src).val), "i" (EFLAGS_MASK));      \
+			____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b");  \
 			break;						     \
 		default:						     \
 			__emulate_2op_nobyte(_op, _src, _dst, _eflags,	     \
@@ -455,22 +448,6 @@ static u16 group2_table[] = {
 		}							\
 	} while (0)
 
-/* Emulate an instruction with quadword operands (x86/64 only). */
-#if defined(CONFIG_X86_64)
-#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)           \
-	do {								  \
-		__asm__ __volatile__ (					  \
-			_PRE_EFLAGS("0", "4", "2")			  \
-			_op"q %"_qx"3,%1; "				  \
-			_POST_EFLAGS("0", "4", "2")			  \
-			: "=m" (_eflags), "=m" ((_dst).val), "=&r" (_tmp) \
-			: _qy ((_src).val), "i" (EFLAGS_MASK));		\
-	} while (0)
-
-#elif defined(__i386__)
-#define __emulate_2op_8byte(_op, _src, _dst, _eflags, _qx, _qy)
-#endif				/* __i386__ */
-
 /* Fetch next part of the instruction being emulated. */
 #define insn_fetch(_type, _size, _eip)                                  \
 ({	unsigned long _x;						\

From d329c035e754156ffabcb64ff75d05bb8e2ddbf5 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Wed, 26 Nov 2008 14:50:27 +0100
Subject: [PATCH 091/140] KVM: s390: Fix refcounting and allow module unload

Currently it is impossible to unload the kvm module on s390.
This patch fixes kvm_arch_destroy_vm to release all cpus.
This make it possible to unload the module.

In addition we stop messing with the module refcount in arch code.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Acked-by: Carsten Otte <cotte@de.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/s390/kvm/kvm-s390.c | 35 +++++++++++++++++++++--------------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 8b00eb2ddf57..3db9e5d45a67 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -185,8 +185,6 @@ struct kvm *kvm_arch_create_vm(void)
 	debug_register_view(kvm->arch.dbf, &debug_sprintf_view);
 	VM_EVENT(kvm, 3, "%s", "vm created");
 
-	try_module_get(THIS_MODULE);
-
 	return kvm;
 out_nodbf:
 	free_page((unsigned long)(kvm->arch.sca));
@@ -196,13 +194,32 @@ out_nokvm:
 	return ERR_PTR(rc);
 }
 
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+	VCPU_EVENT(vcpu, 3, "%s", "free cpu");
+	free_page((unsigned long)(vcpu->arch.sie_block));
+	kfree(vcpu);
+}
+
+static void kvm_free_vcpus(struct kvm *kvm)
+{
+	unsigned int i;
+
+	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+		if (kvm->vcpus[i]) {
+			kvm_arch_vcpu_destroy(kvm->vcpus[i]);
+			kvm->vcpus[i] = NULL;
+		}
+	}
+}
+
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
-	debug_unregister(kvm->arch.dbf);
+	kvm_free_vcpus(kvm);
 	kvm_free_physmem(kvm);
 	free_page((unsigned long)(kvm->arch.sca));
+	debug_unregister(kvm->arch.dbf);
 	kfree(kvm);
-	module_put(THIS_MODULE);
 }
 
 /* Section: vcpu related */
@@ -308,8 +325,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
 	VM_EVENT(kvm, 3, "create cpu %d at %p, sie block at %p", id, vcpu,
 		 vcpu->arch.sie_block);
 
-	try_module_get(THIS_MODULE);
-
 	return vcpu;
 out_free_cpu:
 	kfree(vcpu);
@@ -317,14 +332,6 @@ out_nomem:
 	return ERR_PTR(rc);
 }
 
-void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
-{
-	VCPU_EVENT(vcpu, 3, "%s", "destroy cpu");
-	free_page((unsigned long)(vcpu->arch.sie_block));
-	kfree(vcpu);
-	module_put(THIS_MODULE);
-}
-
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
 	/* kvm common code refers to this, but never calls it */

From 6692cef30b7caf7525ae99670cddbaf28f1f9d40 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Wed, 26 Nov 2008 14:51:08 +0100
Subject: [PATCH 092/140] KVM: s390: Fix memory leak of vcpu->run

The s390 backend of kvm never calls kvm_vcpu_uninit. This causes
a memory leak of vcpu->run pages.
Lets call kvm_vcpu_uninit in kvm_arch_vcpu_destroy to free
the vcpu->run.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Acked-by: Carsten Otte <cotte@de.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/s390/kvm/kvm-s390.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 3db9e5d45a67..76f05ddaef10 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -198,6 +198,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
 	VCPU_EVENT(vcpu, 3, "%s", "free cpu");
 	free_page((unsigned long)(vcpu->arch.sie_block));
+	kvm_vcpu_uninit(vcpu);
 	kfree(vcpu);
 }
 
@@ -230,8 +231,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 
 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 {
-	/* kvm common code refers to this, but does'nt call it */
-	BUG();
+	/* Nothing todo */
 }
 
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)

From b82091824ee4970adf92d5cd6d57b12273171625 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Wed, 26 Nov 2008 19:59:06 +0800
Subject: [PATCH 093/140] KVM: Prevent trace call into unloaded module text

Add marker_synchronize_unregister() before module unloading.
This prevents possible trace calls into unloaded module text.

Signed-off-by: Wu Fengguang <wfg@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_trace.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/virt/kvm/kvm_trace.c b/virt/kvm/kvm_trace.c
index 41dcc845f78c..f59874446440 100644
--- a/virt/kvm/kvm_trace.c
+++ b/virt/kvm/kvm_trace.c
@@ -252,6 +252,7 @@ void kvm_trace_cleanup(void)
 			struct kvm_trace_probe *p = &kvm_trace_probes[i];
 			marker_probe_unregister(p->name, p->probe_func, p);
 		}
+		marker_synchronize_unregister();
 
 		relay_close(kt->rchan);
 		debugfs_remove(kt->lost_file);

From faa5a3ae39483aefc46a78299c811194f953af27 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 27 Nov 2008 17:36:41 +0200
Subject: [PATCH 094/140] KVM: x86 emulator: Extract 'pop' sequence into a
 function

Switch 'pop r/m' instruction to use the new function.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86_emulate.c | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index a11af6f74d68..2555762f4b42 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -1057,20 +1057,33 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
 					       c->regs[VCPU_REGS_RSP]);
 }
 
-static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
-				struct x86_emulate_ops *ops)
+static int emulate_pop(struct x86_emulate_ctxt *ctxt,
+		       struct x86_emulate_ops *ops)
 {
 	struct decode_cache *c = &ctxt->decode;
 	int rc;
 
 	rc = ops->read_std(register_address(c, ss_base(ctxt),
 					    c->regs[VCPU_REGS_RSP]),
-			   &c->dst.val, c->dst.bytes, ctxt->vcpu);
+			   &c->src.val, c->src.bytes, ctxt->vcpu);
 	if (rc != 0)
 		return rc;
 
-	register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->dst.bytes);
+	register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.bytes);
+	return rc;
+}
 
+static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
+				struct x86_emulate_ops *ops)
+{
+	struct decode_cache *c = &ctxt->decode;
+	int rc;
+
+	c->src.bytes = c->dst.bytes;
+	rc = emulate_pop(ctxt, ops);
+	if (rc != 0)
+		return rc;
+	c->dst.val = c->src.val;
 	return 0;
 }
 

From 781d0edc5fc5cfe7491a0c5081734e62f6dc66ee Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 27 Nov 2008 18:00:28 +0200
Subject: [PATCH 095/140] KVM: x86 emulator: allow pop from mmio

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86_emulate.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 2555762f4b42..70242f5f0964 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -1063,9 +1063,9 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
 	struct decode_cache *c = &ctxt->decode;
 	int rc;
 
-	rc = ops->read_std(register_address(c, ss_base(ctxt),
-					    c->regs[VCPU_REGS_RSP]),
-			   &c->src.val, c->src.bytes, ctxt->vcpu);
+	rc = ops->read_emulated(register_address(c, ss_base(ctxt),
+						 c->regs[VCPU_REGS_RSP]),
+				&c->src.val, c->src.bytes, ctxt->vcpu);
 	if (rc != 0)
 		return rc;
 

From 8a09b6877f3100207b3572e7e12ea796493fe914 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Thu, 27 Nov 2008 18:06:33 +0200
Subject: [PATCH 096/140] KVM: x86 emulator: switch 'pop reg' instruction to
 emulate_pop()

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86_emulate.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 70242f5f0964..702de9869c19 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -1389,14 +1389,11 @@ special_insn:
 		break;
 	case 0x58 ... 0x5f: /* pop reg */
 	pop_instruction:
-		if ((rc = ops->read_std(register_address(c, ss_base(ctxt),
-			c->regs[VCPU_REGS_RSP]), c->dst.ptr,
-			c->op_bytes, ctxt->vcpu)) != 0)
+		c->src.bytes = c->op_bytes;
+		rc = emulate_pop(ctxt, ops);
+		if (rc != 0)
 			goto done;
-
-		register_address_increment(c, &c->regs[VCPU_REGS_RSP],
-					   c->op_bytes);
-		c->dst.type = OP_NONE;	/* Disable writeback. */
+		c->dst.val = c->src.val;
 		break;
 	case 0x63:		/* movsxd */
 		if (ctxt->mode != X86EMUL_MODE_PROT64)

From cf5de4f886116871c2ae2eee53524edd741a68ae Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Fri, 28 Nov 2008 00:14:07 +0200
Subject: [PATCH 097/140] KVM: x86 emulator: fix ret emulation

'ret' did not set the operand type or size for the destination, so
writeback ignored it.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86_emulate.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 702de9869c19..72ae86b1b131 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -1650,7 +1650,9 @@ special_insn:
 		emulate_grp2(ctxt);
 		break;
 	case 0xc3: /* ret */
+		c->dst.type = OP_REG;
 		c->dst.ptr = &c->eip;
+		c->dst.bytes = c->op_bytes;
 		goto pop_instruction;
 	case 0xc6 ... 0xc7:	/* mov (sole member of Grp11) */
 	mov:

From 2b48cc75b21431037d6f902b9d583b1aff198490 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sat, 29 Nov 2008 20:36:13 +0200
Subject: [PATCH 098/140] KVM: x86 emulator: fix popf emulation

Set operand type and size to get correct writeback behavior.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86_emulate.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 72ae86b1b131..e8c87ccfe310 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -1552,7 +1552,9 @@ special_insn:
 		emulate_push(ctxt);
 		break;
 	case 0x9d: /* popf */
+		c->dst.type = OP_REG;
 		c->dst.ptr = (unsigned long *) &ctxt->eflags;
+		c->dst.bytes = c->op_bytes;
 		goto pop_instruction;
 	case 0xa0 ... 0xa1:	/* mov */
 		c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];

From f3fd92fbdb7663bd889c136842afc3851351ea8f Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sat, 29 Nov 2008 20:38:12 +0200
Subject: [PATCH 099/140] KVM: Remove extraneous semicolon after do/while

Notices by Guillaume Thouvenin.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86_emulate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index e8c87ccfe310..69b330ba0ad0 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -374,7 +374,7 @@ static u16 group2_table[] = {
 			: "=m" (_eflags), "=m" ((_dst).val),		\
 			  "=&r" (_tmp)					\
 			: _y ((_src).val), "i" (EFLAGS_MASK));		\
-	} while (0);
+	} while (0)
 
 
 /* Raw emulation: instruction has two explicit operands. */

From e8ba5d311d0c4420e84f40ff50f83981f5864a9a Mon Sep 17 00:00:00 2001
From: Hannes Eder <hannes@hanneseder.net>
Date: Fri, 28 Nov 2008 17:02:42 +0100
Subject: [PATCH 100/140] KVM: fix sparse warning

Impact: make global function static

  virt/kvm/kvm_main.c:85:6: warning: symbol 'kvm_rebooting' was not declared. Should it be static?

Signed-off-by: Hannes Eder <hannes@hanneseder.net>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 54d25e6f6d2d..8dab7cedd50b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -82,7 +82,7 @@ struct dentry *kvm_debugfs_dir;
 static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
 			   unsigned long arg);
 
-bool kvm_rebooting;
+static bool kvm_rebooting;
 
 #ifdef KVM_CAP_DEVICE_ASSIGNMENT
 

From efff9e538f6bfa8ee2ca03f7e9a55d98df115186 Mon Sep 17 00:00:00 2001
From: Hannes Eder <hannes@hanneseder.net>
Date: Fri, 28 Nov 2008 17:02:06 +0100
Subject: [PATCH 101/140] KVM: VMX: fix sparse warning

Impact: make global function static

  arch/x86/kvm/vmx.c:134:3: warning: symbol 'vmx_capability' was not declared. Should it be static?

Signed-off-by: Hannes Eder <hannes@hanneseder.net>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7ea485543cf8..e446f232588e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -128,7 +128,7 @@ static struct vmcs_config {
 	u32 vmentry_ctrl;
 } vmcs_config;
 
-struct vmx_capability {
+static struct vmx_capability {
 	u32 ept;
 	u32 vpid;
 } vmx_capability;

From 844c7a9ff404d8fc88bb77b06461644621d2c985 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Mon, 1 Dec 2008 13:57:45 +0000
Subject: [PATCH 102/140] KVM: remove the IRQ ACK notifier assertions

We will obviously never pass a NULL struct kvm_irq_ack_notifier* to
this functions. They are always embedded in the assigned device
structure, so the assertion add nothing.

The irqchip_in_kernel() assertion is very out of place - clearly
this little abstraction needs to know nothing about the upper
layer details.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/irq_comm.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 9fbbdea3d1d5..973df997ea60 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -58,9 +58,6 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi)
 void kvm_register_irq_ack_notifier(struct kvm *kvm,
 				   struct kvm_irq_ack_notifier *kian)
 {
-	/* Must be called with in-kernel IRQ chip, otherwise it's nonsense */
-	ASSERT(irqchip_in_kernel(kvm));
-	ASSERT(kian);
 	hlist_add_head(&kian->link, &kvm->arch.irq_ack_notifier_list);
 }
 

From fdd897e6b5253a45b633f7d334cf3e150bbaf386 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Mon, 1 Dec 2008 13:57:46 +0000
Subject: [PATCH 103/140] KVM: make kvm_unregister_irq_ack_notifier() safe

We never pass a NULL notifier pointer here, but we may well
pass a notifier struct which hasn't previously been
registered.

Guard against this by using hlist_del_init() which will
not do anything if the node hasn't been added to the list
and, when removing the node, will ensure that a subsequent
call to hlist_del_init() will be fine too.

Fixes an oops seen when an assigned device is freed before
and IRQ is assigned to it.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/irq_comm.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 973df997ea60..db75045f22f6 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -63,9 +63,7 @@ void kvm_register_irq_ack_notifier(struct kvm *kvm,
 
 void kvm_unregister_irq_ack_notifier(struct kvm_irq_ack_notifier *kian)
 {
-	if (!kian)
-		return;
-	hlist_del(&kian->link);
+	hlist_del_init(&kian->link);
 }
 
 /* The caller must hold kvm->lock mutex */

From f29b2673d3fc7ae38ec22922e9cdc75ee37386b5 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Mon, 1 Dec 2008 13:57:47 +0000
Subject: [PATCH 104/140] KVM: don't free an unallocated irq source id

Set assigned_dev->irq_source_id to -1 so that we can avoid freeing
a source ID which we never allocated.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8dab7cedd50b..63fd88209439 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -210,7 +210,10 @@ static void kvm_free_assigned_device(struct kvm *kvm,
 		pci_disable_msi(assigned_dev->dev);
 
 	kvm_unregister_irq_ack_notifier(&assigned_dev->ack_notifier);
-	kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
+
+	if (assigned_dev->irq_source_id != -1)
+		kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
+	assigned_dev->irq_source_id = -1;
 
 	if (cancel_work_sync(&assigned_dev->interrupt_work))
 		/* We had pending work. That means we will have to take
@@ -466,7 +469,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
 	match->host_busnr = assigned_dev->busnr;
 	match->host_devfn = assigned_dev->devfn;
 	match->dev = dev;
-
+	match->irq_source_id = -1;
 	match->kvm = kvm;
 
 	list_add(&match->list, &kvm->arch.assigned_dev_head);

From 61552367b2ce5e9bea6b6af670ec80aea386f34e Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Mon, 1 Dec 2008 13:57:48 +0000
Subject: [PATCH 105/140] KVM: add KVM_USERSPACE_IRQ_SOURCE_ID assertions

Make sure kvm_request_irq_source_id() never returns
KVM_USERSPACE_IRQ_SOURCE_ID.

Likewise, check that kvm_free_irq_source_id() never accepts
KVM_USERSPACE_IRQ_SOURCE_ID.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/irq_comm.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index db75045f22f6..aa5d1e5c497e 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -72,11 +72,15 @@ int kvm_request_irq_source_id(struct kvm *kvm)
 	unsigned long *bitmap = &kvm->arch.irq_sources_bitmap;
 	int irq_source_id = find_first_zero_bit(bitmap,
 				sizeof(kvm->arch.irq_sources_bitmap));
+
 	if (irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) {
 		printk(KERN_WARNING "kvm: exhaust allocatable IRQ sources!\n");
-		irq_source_id = -EFAULT;
-	} else
-		set_bit(irq_source_id, bitmap);
+		return -EFAULT;
+	}
+
+	ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
+	set_bit(irq_source_id, bitmap);
+
 	return irq_source_id;
 }
 
@@ -84,7 +88,9 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
 {
 	int i;
 
-	if (irq_source_id <= 0 ||
+	ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
+
+	if (irq_source_id < 0 ||
 	    irq_source_id >= sizeof(kvm->arch.irq_sources_bitmap)) {
 		printk(KERN_ERR "kvm: IRQ source ID out of range!\n");
 		return;

From 4a643be8c9b8d3c1ae8f5ccd377daaa85bd57e0c Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Mon, 1 Dec 2008 13:57:49 +0000
Subject: [PATCH 106/140] KVM: split out kvm_free_assigned_irq()

Split out the logic corresponding to undoing assign_irq() and
clean it up a bit.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 29 ++++++++++++++++++++++-------
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 63fd88209439..e41d39d6b0ef 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -200,14 +200,11 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
 	enable_irq(dev->host_irq);
 }
 
-static void kvm_free_assigned_device(struct kvm *kvm,
-				     struct kvm_assigned_dev_kernel
-				     *assigned_dev)
+static void kvm_free_assigned_irq(struct kvm *kvm,
+				  struct kvm_assigned_dev_kernel *assigned_dev)
 {
-	if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested_type)
-		free_irq(assigned_dev->host_irq, (void *)assigned_dev);
-	if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI)
-		pci_disable_msi(assigned_dev->dev);
+	if (!irqchip_in_kernel(kvm))
+		return;
 
 	kvm_unregister_irq_ack_notifier(&assigned_dev->ack_notifier);
 
@@ -215,12 +212,30 @@ static void kvm_free_assigned_device(struct kvm *kvm,
 		kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
 	assigned_dev->irq_source_id = -1;
 
+	if (!assigned_dev->irq_requested_type)
+		return;
+
 	if (cancel_work_sync(&assigned_dev->interrupt_work))
 		/* We had pending work. That means we will have to take
 		 * care of kvm_put_kvm.
 		 */
 		kvm_put_kvm(kvm);
 
+	free_irq(assigned_dev->host_irq, (void *)assigned_dev);
+
+	if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI)
+		pci_disable_msi(assigned_dev->dev);
+
+	assigned_dev->irq_requested_type = 0;
+}
+
+
+static void kvm_free_assigned_device(struct kvm *kvm,
+				     struct kvm_assigned_dev_kernel
+				     *assigned_dev)
+{
+	kvm_free_assigned_irq(kvm, assigned_dev);
+
 	pci_reset_function(assigned_dev->dev);
 
 	pci_release_regions(assigned_dev->dev);

From 891686188f69d330f7eeeec8e6642ccfb7453106 Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Tue, 2 Dec 2008 15:51:53 -0600
Subject: [PATCH 107/140] KVM: ppc: support large host pages

KVM on 440 has always been able to handle large guest mappings with 4K host
pages -- we must, since the guest kernel uses 256MB mappings.

This patch makes KVM work when the host has large pages too (tested with 64K).

Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/powerpc/include/asm/kvm_ppc.h |  4 +-
 arch/powerpc/kvm/44x_tlb.c         | 71 +++++++++++++++++++++++-------
 arch/powerpc/kvm/booke.c           | 12 ++---
 3 files changed, 64 insertions(+), 23 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 844f683302f6..5bb29267d6a6 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -52,8 +52,8 @@ extern int kvmppc_emulate_instruction(struct kvm_run *run,
 extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu);
 extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
 
-extern void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn,
-                           u64 asid, u32 flags);
+extern void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr,
+                           u64 asid, u32 flags, u32 max_bytes);
 extern void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode);
 extern void kvmppc_mmu_switch_pid(struct kvm_vcpu *vcpu, u32 pid);
 
diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index ee2461860bcf..d49dc66ab3c3 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -28,6 +28,13 @@
 
 #include "44x_tlb.h"
 
+#ifndef PPC44x_TLBE_SIZE
+#define PPC44x_TLBE_SIZE	PPC44x_TLB_4K
+#endif
+
+#define PAGE_SIZE_4K (1<<12)
+#define PAGE_MASK_4K (~(PAGE_SIZE_4K - 1))
+
 #define PPC44x_TLB_UATTR_MASK \
 	(PPC44x_TLB_U0|PPC44x_TLB_U1|PPC44x_TLB_U2|PPC44x_TLB_U3)
 #define PPC44x_TLB_USER_PERM_MASK (PPC44x_TLB_UX|PPC44x_TLB_UR|PPC44x_TLB_UW)
@@ -179,15 +186,26 @@ void kvmppc_tlbe_set_modified(struct kvm_vcpu *vcpu, unsigned int i)
 	vcpu_44x->shadow_tlb_mod[i] = 1;
 }
 
-/* Caller must ensure that the specified guest TLB entry is safe to insert into
- * the shadow TLB. */
-void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
-                    u32 flags)
+/**
+ * kvmppc_mmu_map -- create a host mapping for guest memory
+ *
+ * If the guest wanted a larger page than the host supports, only the first
+ * host page is mapped here and the rest are demand faulted.
+ *
+ * If the guest wanted a smaller page than the host page size, we map only the
+ * guest-size page (i.e. not a full host page mapping).
+ *
+ * Caller must ensure that the specified guest TLB entry is safe to insert into
+ * the shadow TLB.
+ */
+void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr, u64 asid,
+                    u32 flags, u32 max_bytes)
 {
 	struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
 	struct page *new_page;
 	struct kvmppc_44x_tlbe *stlbe;
 	hpa_t hpaddr;
+	gfn_t gfn;
 	unsigned int victim;
 
 	/* Future optimization: don't overwrite the TLB entry containing the
@@ -198,6 +216,7 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
 	stlbe = &vcpu_44x->shadow_tlb[victim];
 
 	/* Get reference to new page. */
+	gfn = gpaddr >> PAGE_SHIFT;
 	new_page = gfn_to_page(vcpu->kvm, gfn);
 	if (is_error_page(new_page)) {
 		printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", gfn);
@@ -220,10 +239,25 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gfn_t gfn, u64 asid,
 	stlbe->tid = !(asid & 0xff);
 
 	/* Force TS=1 for all guest mappings. */
-	/* For now we hardcode 4KB mappings, but it will be important to
-	 * use host large pages in the future. */
-	stlbe->word0 = (gvaddr & PAGE_MASK) | PPC44x_TLB_VALID | PPC44x_TLB_TS
-	               | PPC44x_TLB_4K;
+	stlbe->word0 = PPC44x_TLB_VALID | PPC44x_TLB_TS;
+
+	if (max_bytes >= PAGE_SIZE) {
+		/* Guest mapping is larger than or equal to host page size. We can use
+		 * a "native" host mapping. */
+		stlbe->word0 |= (gvaddr & PAGE_MASK) | PPC44x_TLBE_SIZE;
+	} else {
+		/* Guest mapping is smaller than host page size. We must restrict the
+		 * size of the mapping to be at most the smaller of the two, but for
+		 * simplicity we fall back to a 4K mapping (this is probably what the
+		 * guest is using anyways). */
+		stlbe->word0 |= (gvaddr & PAGE_MASK_4K) | PPC44x_TLB_4K;
+
+		/* 'hpaddr' is a host page, which is larger than the mapping we're
+		 * inserting here. To compensate, we must add the in-page offset to the
+		 * sub-page. */
+		hpaddr |= gpaddr & (PAGE_MASK ^ PAGE_MASK_4K);
+	}
+
 	stlbe->word1 = (hpaddr & 0xfffffc00) | ((hpaddr >> 32) & 0xf);
 	stlbe->word2 = kvmppc_44x_tlb_shadow_attrib(flags,
 	                                            vcpu->arch.msr & MSR_PR);
@@ -322,10 +356,8 @@ static int tlbe_is_host_safe(const struct kvm_vcpu *vcpu,
 int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws)
 {
 	struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
-	u64 eaddr;
-	u64 raddr;
+	gva_t eaddr;
 	u64 asid;
-	u32 flags;
 	struct kvmppc_44x_tlbe *tlbe;
 	unsigned int index;
 
@@ -364,15 +396,22 @@ int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws)
 	}
 
 	if (tlbe_is_host_safe(vcpu, tlbe)) {
+		gpa_t gpaddr;
+		u32 flags;
+		u32 bytes;
+
 		eaddr = get_tlb_eaddr(tlbe);
-		raddr = get_tlb_raddr(tlbe);
+		gpaddr = get_tlb_raddr(tlbe);
+
+		/* Use the advertised page size to mask effective and real addrs. */
+		bytes = get_tlb_bytes(tlbe);
+		eaddr &= ~(bytes - 1);
+		gpaddr &= ~(bytes - 1);
+
 		asid = (tlbe->word0 & PPC44x_TLB_TS) | tlbe->tid;
 		flags = tlbe->word2 & 0xffff;
 
-		/* Create a 4KB mapping on the host. If the guest wanted a
-		 * large page, only the first 4KB is mapped here and the rest
-		 * are mapped on the fly. */
-		kvmppc_mmu_map(vcpu, eaddr, raddr >> PAGE_SHIFT, asid, flags);
+		kvmppc_mmu_map(vcpu, eaddr, gpaddr, asid, flags, bytes);
 	}
 
 	KVMTRACE_5D(GTLB_WRITE, vcpu, index,
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index ec59a6768ec3..924c7b4b1107 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -308,8 +308,8 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			 * b) the guest used a large mapping which we're faking
 			 * Either way, we need to satisfy the fault without
 			 * invoking the guest. */
-			kvmppc_mmu_map(vcpu, eaddr, gfn, gtlbe->tid,
-			               gtlbe->word2);
+			kvmppc_mmu_map(vcpu, eaddr, vcpu->arch.paddr_accessed, gtlbe->tid,
+			               gtlbe->word2, get_tlb_bytes(gtlbe));
 			vcpu->stat.dtlb_virt_miss_exits++;
 			r = RESUME_GUEST;
 		} else {
@@ -325,6 +325,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	case BOOKE_INTERRUPT_ITLB_MISS: {
 		struct kvmppc_44x_tlbe *gtlbe;
 		unsigned long eaddr = vcpu->arch.pc;
+		gpa_t gpaddr;
 		gfn_t gfn;
 
 		r = RESUME_GUEST;
@@ -340,7 +341,8 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
 		vcpu->stat.itlb_virt_miss_exits++;
 
-		gfn = tlb_xlate(gtlbe, eaddr) >> PAGE_SHIFT;
+		gpaddr = tlb_xlate(gtlbe, eaddr);
+		gfn = gpaddr >> PAGE_SHIFT;
 
 		if (kvm_is_visible_gfn(vcpu->kvm, gfn)) {
 			/* The guest TLB had a mapping, but the shadow TLB
@@ -349,8 +351,8 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			 * b) the guest used a large mapping which we're faking
 			 * Either way, we need to satisfy the fault without
 			 * invoking the guest. */
-			kvmppc_mmu_map(vcpu, eaddr, gfn, gtlbe->tid,
-			               gtlbe->word2);
+			kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlbe->tid,
+			               gtlbe->word2, get_tlb_bytes(gtlbe));
 		} else {
 			/* Guest mapped and leaped at non-RAM! */
 			kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_MACHINE_CHECK);

From c0ca609c5f874f7d6ae8e180afe79317e1943d22 Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Tue, 2 Dec 2008 15:51:54 -0600
Subject: [PATCH 108/140] powerpc/44x: declare tlb_44x_index for use in C code

KVM currently ignores the host's round robin TLB eviction selection, instead
maintaining its own TLB state and its own round robin index. However, by
participating in the normal 44x TLB selection, we can drop the alternate TLB
processing in KVM. This results in a significant performance improvement,
since that processing currently must be done on *every* guest exit.

Accordingly, KVM needs to be able to access and increment tlb_44x_index.
(KVM on 440 cannot be a module, so there is no need to export this symbol.)

Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Acked-by: Josh Boyer <jwboyer@linux.vnet.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/powerpc/include/asm/mmu-44x.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/include/asm/mmu-44x.h b/arch/powerpc/include/asm/mmu-44x.h
index 8a97cfb08b7e..27cc6fdcd3b7 100644
--- a/arch/powerpc/include/asm/mmu-44x.h
+++ b/arch/powerpc/include/asm/mmu-44x.h
@@ -56,6 +56,7 @@
 #ifndef __ASSEMBLY__
 
 extern unsigned int tlb_44x_hwater;
+extern unsigned int tlb_44x_index;
 
 typedef struct {
 	unsigned int	id;

From 7924bd41097ae8991c6d38cef8b1e4058e30d198 Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Tue, 2 Dec 2008 15:51:55 -0600
Subject: [PATCH 109/140] KVM: ppc: directly insert shadow mappings into the
 hardware TLB

Formerly, we used to maintain a per-vcpu shadow TLB and on every entry to the
guest would load this array into the hardware TLB. This consumed 1280 bytes of
memory (64 entries of 16 bytes plus a struct page pointer each), and also
required some assembly to loop over the array on every entry.

Instead of saving a copy in memory, we can just store shadow mappings directly
into the hardware TLB, accepting that the host kernel will clobber these as
part of the normal 440 TLB round robin. When we do that we need less than half
the memory, and we have decreased the exit handling time for all guest exits,
at the cost of increased number of TLB misses because the host overwrites some
guest entries.

These savings will be increased on processors with larger TLBs or which
implement intelligent flush instructions like tlbivax (which will avoid the
need to walk arrays in software).

In addition to that and to the code simplification, we have a greater chance of
leaving other host userspace mappings in the TLB, instead of forcing all
subsequent tasks to re-fault all their mappings.

Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/powerpc/include/asm/kvm_44x.h  |  22 ++-
 arch/powerpc/include/asm/kvm_ppc.h  |   3 +-
 arch/powerpc/kernel/asm-offsets.c   |   6 -
 arch/powerpc/kvm/44x.c              |  19 +--
 arch/powerpc/kvm/44x_tlb.c          | 252 ++++++++++++++--------------
 arch/powerpc/kvm/44x_tlb.h          |   7 +-
 arch/powerpc/kvm/booke.c            |  26 +--
 arch/powerpc/kvm/booke_interrupts.S |  48 ------
 8 files changed, 165 insertions(+), 218 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_44x.h b/arch/powerpc/include/asm/kvm_44x.h
index 72e593914adb..e770ea2bbb1c 100644
--- a/arch/powerpc/include/asm/kvm_44x.h
+++ b/arch/powerpc/include/asm/kvm_44x.h
@@ -22,19 +22,25 @@
 
 #include <linux/kvm_host.h>
 
-/* XXX Can't include mmu-44x.h because it redefines struct mm_context. */
 #define PPC44x_TLB_SIZE 64
 
+/* If the guest is expecting it, this can be as large as we like; we'd just
+ * need to find some way of advertising it. */
+#define KVM44x_GUEST_TLB_SIZE 64
+
+struct kvmppc_44x_shadow_ref {
+	struct page *page;
+	u16 gtlb_index;
+	u8 writeable;
+	u8 tid;
+};
+
 struct kvmppc_vcpu_44x {
 	/* Unmodified copy of the guest's TLB. */
-	struct kvmppc_44x_tlbe guest_tlb[PPC44x_TLB_SIZE];
-	/* TLB that's actually used when the guest is running. */
-	struct kvmppc_44x_tlbe shadow_tlb[PPC44x_TLB_SIZE];
-	/* Pages which are referenced in the shadow TLB. */
-	struct page *shadow_pages[PPC44x_TLB_SIZE];
+	struct kvmppc_44x_tlbe guest_tlb[KVM44x_GUEST_TLB_SIZE];
 
-	/* Track which TLB entries we've modified in the current exit. */
-	u8 shadow_tlb_mod[PPC44x_TLB_SIZE];
+	/* References to guest pages in the hardware TLB. */
+	struct kvmppc_44x_shadow_ref shadow_refs[PPC44x_TLB_SIZE];
 
 	struct kvm_vcpu vcpu;
 };
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 5bb29267d6a6..36d2a50a8487 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -53,7 +53,8 @@ extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu);
 extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
 
 extern void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr,
-                           u64 asid, u32 flags, u32 max_bytes);
+                           u64 asid, u32 flags, u32 max_bytes,
+                           unsigned int gtlb_idx);
 extern void kvmppc_mmu_priv_switch(struct kvm_vcpu *vcpu, int usermode);
 extern void kvmppc_mmu_switch_pid(struct kvm_vcpu *vcpu, u32 pid);
 
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 393c7f36a1e8..ba39526d3201 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -359,12 +359,6 @@ int main(void)
 #ifdef CONFIG_KVM
 	DEFINE(TLBE_BYTES, sizeof(struct kvmppc_44x_tlbe));
 
-	DEFINE(VCPU_TO_44X, offsetof(struct kvmppc_vcpu_44x, vcpu));
-	DEFINE(VCPU44x_SHADOW_TLB,
-	       offsetof(struct kvmppc_vcpu_44x, shadow_tlb));
-	DEFINE(VCPU44x_SHADOW_MOD,
-	       offsetof(struct kvmppc_vcpu_44x, shadow_tlb_mod));
-
 	DEFINE(VCPU_HOST_STACK, offsetof(struct kvm_vcpu, arch.host_stack));
 	DEFINE(VCPU_HOST_PID, offsetof(struct kvm_vcpu, arch.host_pid));
 	DEFINE(VCPU_GPRS, offsetof(struct kvm_vcpu, arch.gpr));
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index 22054b164b5a..05d72fc8b478 100644
--- a/arch/powerpc/kvm/44x.c
+++ b/arch/powerpc/kvm/44x.c
@@ -96,21 +96,14 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu)
 
 void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
-	int i;
-
-	/* Mark every guest entry in the shadow TLB entry modified, so that they
-	 * will all be reloaded on the next vcpu run (instead of being
-	 * demand-faulted). */
-	for (i = 0; i <= tlb_44x_hwater; i++)
-		kvmppc_tlbe_set_modified(vcpu, i);
 }
 
 void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
 {
-	/* Don't leave guest TLB entries resident when being de-scheduled. */
-	/* XXX It would be nice to differentiate between heavyweight exit and
-	 * sched_out here, since we could avoid the TLB flush for heavyweight
-	 * exits. */
+	/* XXX Since every guest uses TS=1 TID=0/1 mappings, we can't leave any TLB
+	 * entries around when we're descheduled, so we must completely flush the
+	 * TLB of all guest mappings. On the other hand, if there is only one
+	 * guest, this flush is completely unnecessary. */
 	_tlbia();
 }
 
@@ -130,6 +123,7 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
 	struct kvmppc_44x_tlbe *tlbe = &vcpu_44x->guest_tlb[0];
+	int i;
 
 	tlbe->tid = 0;
 	tlbe->word0 = PPC44x_TLB_16M | PPC44x_TLB_VALID;
@@ -148,6 +142,9 @@ int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
 	 * CCR1[TCS]. */
 	vcpu->arch.ccr1 = mfspr(SPRN_CCR1);
 
+	for (i = 0; i < ARRAY_SIZE(vcpu_44x->shadow_refs); i++)
+		vcpu_44x->shadow_refs[i].gtlb_index = -1;
+
 	return 0;
 }
 
diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index d49dc66ab3c3..2981ebea3d1f 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -22,6 +22,8 @@
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
 #include <linux/highmem.h>
+
+#include <asm/tlbflush.h>
 #include <asm/mmu-44x.h>
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_44x.h>
@@ -40,8 +42,6 @@
 #define PPC44x_TLB_USER_PERM_MASK (PPC44x_TLB_UX|PPC44x_TLB_UR|PPC44x_TLB_UW)
 #define PPC44x_TLB_SUPER_PERM_MASK (PPC44x_TLB_SX|PPC44x_TLB_SR|PPC44x_TLB_SW)
 
-static unsigned int kvmppc_tlb_44x_pos;
-
 #ifdef DEBUG
 void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu)
 {
@@ -52,24 +52,49 @@ void kvmppc_dump_tlbs(struct kvm_vcpu *vcpu)
 	printk("| %2s | %3s | %8s | %8s | %8s |\n",
 			"nr", "tid", "word0", "word1", "word2");
 
-	for (i = 0; i < PPC44x_TLB_SIZE; i++) {
+	for (i = 0; i < ARRAY_SIZE(vcpu_44x->guest_tlb); i++) {
 		tlbe = &vcpu_44x->guest_tlb[i];
 		if (tlbe->word0 & PPC44x_TLB_VALID)
 			printk(" G%2d |  %02X | %08X | %08X | %08X |\n",
 			       i, tlbe->tid, tlbe->word0, tlbe->word1,
 			       tlbe->word2);
 	}
-
-	for (i = 0; i < PPC44x_TLB_SIZE; i++) {
-		tlbe = &vcpu_44x->shadow_tlb[i];
-		if (tlbe->word0 & PPC44x_TLB_VALID)
-			printk(" S%2d | %02X | %08X | %08X | %08X |\n",
-			       i, tlbe->tid, tlbe->word0, tlbe->word1,
-			       tlbe->word2);
-	}
 }
 #endif
 
+static inline void kvmppc_44x_tlbie(unsigned int index)
+{
+	/* 0 <= index < 64, so the V bit is clear and we can use the index as
+	 * word0. */
+	asm volatile(
+		"tlbwe %[index], %[index], 0\n"
+	:
+	: [index] "r"(index)
+	);
+}
+
+static inline void kvmppc_44x_tlbwe(unsigned int index,
+                                    struct kvmppc_44x_tlbe *stlbe)
+{
+	unsigned long tmp;
+
+	asm volatile(
+		"mfspr %[tmp], %[sprn_mmucr]\n"
+		"rlwimi %[tmp], %[tid], 0, 0xff\n"
+		"mtspr %[sprn_mmucr], %[tmp]\n"
+		"tlbwe %[word0], %[index], 0\n"
+		"tlbwe %[word1], %[index], 1\n"
+		"tlbwe %[word2], %[index], 2\n"
+		: [tmp]   "=&r"(tmp)
+		: [word0] "r"(stlbe->word0),
+		  [word1] "r"(stlbe->word1),
+		  [word2] "r"(stlbe->word2),
+		  [tid]   "r"(stlbe->tid),
+		  [index] "r"(index),
+		  [sprn_mmucr] "i"(SPRN_MMUCR)
+	);
+}
+
 static u32 kvmppc_44x_tlb_shadow_attrib(u32 attrib, int usermode)
 {
 	/* We only care about the guest's permission and user bits. */
@@ -99,7 +124,7 @@ int kvmppc_44x_tlb_index(struct kvm_vcpu *vcpu, gva_t eaddr, unsigned int pid,
 	int i;
 
 	/* XXX Replace loop with fancy data structures. */
-	for (i = 0; i < PPC44x_TLB_SIZE; i++) {
+	for (i = 0; i < ARRAY_SIZE(vcpu_44x->guest_tlb); i++) {
 		struct kvmppc_44x_tlbe *tlbe = &vcpu_44x->guest_tlb[i];
 		unsigned int tid;
 
@@ -125,65 +150,53 @@ int kvmppc_44x_tlb_index(struct kvm_vcpu *vcpu, gva_t eaddr, unsigned int pid,
 	return -1;
 }
 
-struct kvmppc_44x_tlbe *kvmppc_44x_itlb_search(struct kvm_vcpu *vcpu,
-                                               gva_t eaddr)
+int kvmppc_44x_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr)
 {
-	struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
 	unsigned int as = !!(vcpu->arch.msr & MSR_IS);
-	unsigned int index;
 
-	index = kvmppc_44x_tlb_index(vcpu, eaddr, vcpu->arch.pid, as);
-	if (index == -1)
-		return NULL;
-	return &vcpu_44x->guest_tlb[index];
+	return kvmppc_44x_tlb_index(vcpu, eaddr, vcpu->arch.pid, as);
 }
 
-struct kvmppc_44x_tlbe *kvmppc_44x_dtlb_search(struct kvm_vcpu *vcpu,
-                                               gva_t eaddr)
+int kvmppc_44x_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr)
 {
-	struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
 	unsigned int as = !!(vcpu->arch.msr & MSR_DS);
-	unsigned int index;
 
-	index = kvmppc_44x_tlb_index(vcpu, eaddr, vcpu->arch.pid, as);
-	if (index == -1)
-		return NULL;
-	return &vcpu_44x->guest_tlb[index];
+	return kvmppc_44x_tlb_index(vcpu, eaddr, vcpu->arch.pid, as);
 }
 
-static int kvmppc_44x_tlbe_is_writable(struct kvmppc_44x_tlbe *tlbe)
+static void kvmppc_44x_shadow_release(struct kvmppc_vcpu_44x *vcpu_44x,
+                                      unsigned int stlb_index)
 {
-	return tlbe->word2 & (PPC44x_TLB_SW|PPC44x_TLB_UW);
-}
+	struct kvmppc_44x_shadow_ref *ref = &vcpu_44x->shadow_refs[stlb_index];
 
-static void kvmppc_44x_shadow_release(struct kvm_vcpu *vcpu,
-                                      unsigned int index)
-{
-	struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
-	struct kvmppc_44x_tlbe *stlbe = &vcpu_44x->shadow_tlb[index];
-	struct page *page = vcpu_44x->shadow_pages[index];
+	if (!ref->page)
+		return;
 
-	if (get_tlb_v(stlbe)) {
-		if (kvmppc_44x_tlbe_is_writable(stlbe))
-			kvm_release_page_dirty(page);
-		else
-			kvm_release_page_clean(page);
-	}
+	/* Discard from the TLB. */
+	/* Note: we could actually invalidate a host mapping, if the host overwrote
+	 * this TLB entry since we inserted a guest mapping. */
+	kvmppc_44x_tlbie(stlb_index);
+
+	/* Now release the page. */
+	if (ref->writeable)
+		kvm_release_page_dirty(ref->page);
+	else
+		kvm_release_page_clean(ref->page);
+
+	ref->page = NULL;
+
+	/* XXX set tlb_44x_index to stlb_index? */
+
+	KVMTRACE_1D(STLB_INVAL, &vcpu_44x->vcpu, stlb_index, handler);
 }
 
 void kvmppc_core_destroy_mmu(struct kvm_vcpu *vcpu)
 {
+	struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
 	int i;
 
 	for (i = 0; i <= tlb_44x_hwater; i++)
-		kvmppc_44x_shadow_release(vcpu, i);
-}
-
-void kvmppc_tlbe_set_modified(struct kvm_vcpu *vcpu, unsigned int i)
-{
-	struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
-
-	vcpu_44x->shadow_tlb_mod[i] = 1;
+		kvmppc_44x_shadow_release(vcpu_44x, i);
 }
 
 /**
@@ -199,21 +212,24 @@ void kvmppc_tlbe_set_modified(struct kvm_vcpu *vcpu, unsigned int i)
  * the shadow TLB.
  */
 void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr, u64 asid,
-                    u32 flags, u32 max_bytes)
+                    u32 flags, u32 max_bytes, unsigned int gtlb_index)
 {
+	struct kvmppc_44x_tlbe stlbe;
 	struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
+	struct kvmppc_44x_shadow_ref *ref;
 	struct page *new_page;
-	struct kvmppc_44x_tlbe *stlbe;
 	hpa_t hpaddr;
 	gfn_t gfn;
 	unsigned int victim;
 
-	/* Future optimization: don't overwrite the TLB entry containing the
-	 * current PC (or stack?). */
-	victim = kvmppc_tlb_44x_pos++;
-	if (kvmppc_tlb_44x_pos > tlb_44x_hwater)
-		kvmppc_tlb_44x_pos = 0;
-	stlbe = &vcpu_44x->shadow_tlb[victim];
+	/* Select TLB entry to clobber. Indirectly guard against races with the TLB
+	 * miss handler by disabling interrupts. */
+	local_irq_disable();
+	victim = ++tlb_44x_index;
+	if (victim > tlb_44x_hwater)
+		victim = 0;
+	tlb_44x_index = victim;
+	local_irq_enable();
 
 	/* Get reference to new page. */
 	gfn = gpaddr >> PAGE_SHIFT;
@@ -225,10 +241,8 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr, u64 asid,
 	}
 	hpaddr = page_to_phys(new_page);
 
-	/* Drop reference to old page. */
-	kvmppc_44x_shadow_release(vcpu, victim);
-
-	vcpu_44x->shadow_pages[victim] = new_page;
+	/* Invalidate any previous shadow mappings. */
+	kvmppc_44x_shadow_release(vcpu_44x, victim);
 
 	/* XXX Make sure (va, size) doesn't overlap any other
 	 * entries. 440x6 user manual says the result would be
@@ -236,21 +250,19 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr, u64 asid,
 
 	/* XXX what about AS? */
 
-	stlbe->tid = !(asid & 0xff);
-
 	/* Force TS=1 for all guest mappings. */
-	stlbe->word0 = PPC44x_TLB_VALID | PPC44x_TLB_TS;
+	stlbe.word0 = PPC44x_TLB_VALID | PPC44x_TLB_TS;
 
 	if (max_bytes >= PAGE_SIZE) {
 		/* Guest mapping is larger than or equal to host page size. We can use
 		 * a "native" host mapping. */
-		stlbe->word0 |= (gvaddr & PAGE_MASK) | PPC44x_TLBE_SIZE;
+		stlbe.word0 |= (gvaddr & PAGE_MASK) | PPC44x_TLBE_SIZE;
 	} else {
 		/* Guest mapping is smaller than host page size. We must restrict the
 		 * size of the mapping to be at most the smaller of the two, but for
 		 * simplicity we fall back to a 4K mapping (this is probably what the
 		 * guest is using anyways). */
-		stlbe->word0 |= (gvaddr & PAGE_MASK_4K) | PPC44x_TLB_4K;
+		stlbe.word0 |= (gvaddr & PAGE_MASK_4K) | PPC44x_TLB_4K;
 
 		/* 'hpaddr' is a host page, which is larger than the mapping we're
 		 * inserting here. To compensate, we must add the in-page offset to the
@@ -258,47 +270,36 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr, u64 asid,
 		hpaddr |= gpaddr & (PAGE_MASK ^ PAGE_MASK_4K);
 	}
 
-	stlbe->word1 = (hpaddr & 0xfffffc00) | ((hpaddr >> 32) & 0xf);
-	stlbe->word2 = kvmppc_44x_tlb_shadow_attrib(flags,
+	stlbe.word1 = (hpaddr & 0xfffffc00) | ((hpaddr >> 32) & 0xf);
+	stlbe.word2 = kvmppc_44x_tlb_shadow_attrib(flags,
 	                                            vcpu->arch.msr & MSR_PR);
-	kvmppc_tlbe_set_modified(vcpu, victim);
+	stlbe.tid = !(asid & 0xff);
 
-	KVMTRACE_5D(STLB_WRITE, vcpu, victim,
-			stlbe->tid, stlbe->word0, stlbe->word1, stlbe->word2,
-			handler);
+	/* Keep track of the reference so we can properly release it later. */
+	ref = &vcpu_44x->shadow_refs[victim];
+	ref->page = new_page;
+	ref->gtlb_index = gtlb_index;
+	ref->writeable = !!(stlbe.word2 & PPC44x_TLB_UW);
+	ref->tid = stlbe.tid;
+
+	/* Insert shadow mapping into hardware TLB. */
+	kvmppc_44x_tlbwe(victim, &stlbe);
+	KVMTRACE_5D(STLB_WRITE, vcpu, victim, stlbe.tid, stlbe.word0, stlbe.word1,
+	            stlbe.word2, handler);
 }
 
-static void kvmppc_mmu_invalidate(struct kvm_vcpu *vcpu, gva_t eaddr,
-                                  gva_t eend, u32 asid)
+/* For a particular guest TLB entry, invalidate the corresponding host TLB
+ * mappings and release the host pages. */
+static void kvmppc_44x_invalidate(struct kvm_vcpu *vcpu,
+                                  unsigned int gtlb_index)
 {
 	struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
-	unsigned int pid = !(asid & 0xff);
 	int i;
 
-	/* XXX Replace loop with fancy data structures. */
-	for (i = 0; i <= tlb_44x_hwater; i++) {
-		struct kvmppc_44x_tlbe *stlbe = &vcpu_44x->shadow_tlb[i];
-		unsigned int tid;
-
-		if (!get_tlb_v(stlbe))
-			continue;
-
-		if (eend < get_tlb_eaddr(stlbe))
-			continue;
-
-		if (eaddr > get_tlb_end(stlbe))
-			continue;
-
-		tid = get_tlb_tid(stlbe);
-		if (tid && (tid != pid))
-			continue;
-
-		kvmppc_44x_shadow_release(vcpu, i);
-		stlbe->word0 = 0;
-		kvmppc_tlbe_set_modified(vcpu, i);
-		KVMTRACE_5D(STLB_INVAL, vcpu, i,
-				stlbe->tid, stlbe->word0, stlbe->word1,
-				stlbe->word2, handler);
+	for (i = 0; i < ARRAY_SIZE(vcpu_44x->shadow_refs); i++) {
+		struct kvmppc_44x_shadow_ref *ref = &vcpu_44x->shadow_refs[i];
+		if (ref->gtlb_index == gtlb_index)
+			kvmppc_44x_shadow_release(vcpu_44x, i);
 	}
 }
 
@@ -321,14 +322,11 @@ void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 new_pid)
 	 * can't access guest kernel mappings (TID=1). When we switch to a new
 	 * guest PID, which will also use host PID=0, we must discard the old guest
 	 * userspace mappings. */
-	for (i = 0; i < ARRAY_SIZE(vcpu_44x->shadow_tlb); i++) {
-		struct kvmppc_44x_tlbe *stlbe = &vcpu_44x->shadow_tlb[i];
+	for (i = 0; i < ARRAY_SIZE(vcpu_44x->shadow_refs); i++) {
+		struct kvmppc_44x_shadow_ref *ref = &vcpu_44x->shadow_refs[i];
 
-		if (get_tlb_tid(stlbe) == 0) {
-			kvmppc_44x_shadow_release(vcpu, i);
-			stlbe->word0 = 0;
-			kvmppc_tlbe_set_modified(vcpu, i);
-		}
+		if (ref->tid == 0)
+			kvmppc_44x_shadow_release(vcpu_44x, i);
 	}
 }
 
@@ -356,26 +354,21 @@ static int tlbe_is_host_safe(const struct kvm_vcpu *vcpu,
 int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws)
 {
 	struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
-	gva_t eaddr;
-	u64 asid;
 	struct kvmppc_44x_tlbe *tlbe;
-	unsigned int index;
+	unsigned int gtlb_index;
 
-	index = vcpu->arch.gpr[ra];
-	if (index > PPC44x_TLB_SIZE) {
-		printk("%s: index %d\n", __func__, index);
+	gtlb_index = vcpu->arch.gpr[ra];
+	if (gtlb_index > KVM44x_GUEST_TLB_SIZE) {
+		printk("%s: index %d\n", __func__, gtlb_index);
 		kvmppc_dump_vcpu(vcpu);
 		return EMULATE_FAIL;
 	}
 
-	tlbe = &vcpu_44x->guest_tlb[index];
+	tlbe = &vcpu_44x->guest_tlb[gtlb_index];
 
-	/* Invalidate shadow mappings for the about-to-be-clobbered TLBE. */
-	if (tlbe->word0 & PPC44x_TLB_VALID) {
-		eaddr = get_tlb_eaddr(tlbe);
-		asid = (tlbe->word0 & PPC44x_TLB_TS) | tlbe->tid;
-		kvmppc_mmu_invalidate(vcpu, eaddr, get_tlb_end(tlbe), asid);
-	}
+	/* Invalidate shadow mappings for the about-to-be-clobbered TLB entry. */
+	if (tlbe->word0 & PPC44x_TLB_VALID)
+		kvmppc_44x_invalidate(vcpu, gtlb_index);
 
 	switch (ws) {
 	case PPC44x_TLB_PAGEID:
@@ -396,6 +389,8 @@ int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws)
 	}
 
 	if (tlbe_is_host_safe(vcpu, tlbe)) {
+		u64 asid;
+		gva_t eaddr;
 		gpa_t gpaddr;
 		u32 flags;
 		u32 bytes;
@@ -411,12 +406,11 @@ int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws)
 		asid = (tlbe->word0 & PPC44x_TLB_TS) | tlbe->tid;
 		flags = tlbe->word2 & 0xffff;
 
-		kvmppc_mmu_map(vcpu, eaddr, gpaddr, asid, flags, bytes);
+		kvmppc_mmu_map(vcpu, eaddr, gpaddr, asid, flags, bytes, gtlb_index);
 	}
 
-	KVMTRACE_5D(GTLB_WRITE, vcpu, index,
-	            tlbe->tid, tlbe->word0, tlbe->word1, tlbe->word2,
-	            handler);
+	KVMTRACE_5D(GTLB_WRITE, vcpu, gtlb_index, tlbe->tid, tlbe->word0,
+	            tlbe->word1, tlbe->word2, handler);
 
 	return EMULATE_DONE;
 }
@@ -424,7 +418,7 @@ int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws)
 int kvmppc_44x_emul_tlbsx(struct kvm_vcpu *vcpu, u8 rt, u8 ra, u8 rb, u8 rc)
 {
 	u32 ea;
-	int index;
+	int gtlb_index;
 	unsigned int as = get_mmucr_sts(vcpu);
 	unsigned int pid = get_mmucr_stid(vcpu);
 
@@ -432,14 +426,14 @@ int kvmppc_44x_emul_tlbsx(struct kvm_vcpu *vcpu, u8 rt, u8 ra, u8 rb, u8 rc)
 	if (ra)
 		ea += vcpu->arch.gpr[ra];
 
-	index = kvmppc_44x_tlb_index(vcpu, ea, pid, as);
+	gtlb_index = kvmppc_44x_tlb_index(vcpu, ea, pid, as);
 	if (rc) {
-		if (index < 0)
+		if (gtlb_index < 0)
 			vcpu->arch.cr &= ~0x20000000;
 		else
 			vcpu->arch.cr |= 0x20000000;
 	}
-	vcpu->arch.gpr[rt] = index;
+	vcpu->arch.gpr[rt] = gtlb_index;
 
 	return EMULATE_DONE;
 }
diff --git a/arch/powerpc/kvm/44x_tlb.h b/arch/powerpc/kvm/44x_tlb.h
index b1029af3de20..772191f29e62 100644
--- a/arch/powerpc/kvm/44x_tlb.h
+++ b/arch/powerpc/kvm/44x_tlb.h
@@ -25,11 +25,8 @@
 
 extern int kvmppc_44x_tlb_index(struct kvm_vcpu *vcpu, gva_t eaddr,
                                 unsigned int pid, unsigned int as);
-extern struct kvmppc_44x_tlbe *kvmppc_44x_dtlb_search(struct kvm_vcpu *vcpu,
-                                                      gva_t eaddr);
-extern struct kvmppc_44x_tlbe *kvmppc_44x_itlb_search(struct kvm_vcpu *vcpu,
-                                                      gva_t eaddr);
-extern void kvmppc_tlbe_set_modified(struct kvm_vcpu *vcpu, unsigned int i);
+extern int kvmppc_44x_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr);
+extern int kvmppc_44x_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr);
 
 extern int kvmppc_44x_emul_tlbsx(struct kvm_vcpu *vcpu, u8 rt, u8 ra, u8 rb,
                                  u8 rc);
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 924c7b4b1107..eb24383c87d2 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -24,10 +24,12 @@
 #include <linux/module.h>
 #include <linux/vmalloc.h>
 #include <linux/fs.h>
+
 #include <asm/cputable.h>
 #include <asm/uaccess.h>
 #include <asm/kvm_ppc.h>
 #include <asm/cacheflush.h>
+#include <asm/kvm_44x.h>
 
 #include "booke.h"
 #include "44x_tlb.h"
@@ -207,10 +209,6 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		 * handled this interrupt the moment we enabled interrupts.
 		 * Now we just offer it a chance to reschedule the guest. */
 
-		/* XXX At this point the TLB still holds our shadow TLB, so if
-		 * we do reschedule the host will fault over it. Perhaps we
-		 * should politely restore the host's entries to minimize
-		 * misses before ceding control. */
 		vcpu->stat.dec_exits++;
 		if (need_resched())
 			cond_resched();
@@ -281,14 +279,17 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		r = RESUME_GUEST;
 		break;
 
+	/* XXX move to a 440-specific file. */
 	case BOOKE_INTERRUPT_DTLB_MISS: {
+		struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
 		struct kvmppc_44x_tlbe *gtlbe;
 		unsigned long eaddr = vcpu->arch.fault_dear;
+		int gtlb_index;
 		gfn_t gfn;
 
 		/* Check the guest TLB. */
-		gtlbe = kvmppc_44x_dtlb_search(vcpu, eaddr);
-		if (!gtlbe) {
+		gtlb_index = kvmppc_44x_dtlb_index(vcpu, eaddr);
+		if (gtlb_index < 0) {
 			/* The guest didn't have a mapping for it. */
 			kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DTLB_MISS);
 			vcpu->arch.dear = vcpu->arch.fault_dear;
@@ -298,6 +299,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			break;
 		}
 
+		gtlbe = &vcpu_44x->guest_tlb[gtlb_index];
 		vcpu->arch.paddr_accessed = tlb_xlate(gtlbe, eaddr);
 		gfn = vcpu->arch.paddr_accessed >> PAGE_SHIFT;
 
@@ -309,7 +311,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			 * Either way, we need to satisfy the fault without
 			 * invoking the guest. */
 			kvmppc_mmu_map(vcpu, eaddr, vcpu->arch.paddr_accessed, gtlbe->tid,
-			               gtlbe->word2, get_tlb_bytes(gtlbe));
+			               gtlbe->word2, get_tlb_bytes(gtlbe), gtlb_index);
 			vcpu->stat.dtlb_virt_miss_exits++;
 			r = RESUME_GUEST;
 		} else {
@@ -322,17 +324,20 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		break;
 	}
 
+	/* XXX move to a 440-specific file. */
 	case BOOKE_INTERRUPT_ITLB_MISS: {
+		struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
 		struct kvmppc_44x_tlbe *gtlbe;
 		unsigned long eaddr = vcpu->arch.pc;
 		gpa_t gpaddr;
 		gfn_t gfn;
+		int gtlb_index;
 
 		r = RESUME_GUEST;
 
 		/* Check the guest TLB. */
-		gtlbe = kvmppc_44x_itlb_search(vcpu, eaddr);
-		if (!gtlbe) {
+		gtlb_index = kvmppc_44x_itlb_index(vcpu, eaddr);
+		if (gtlb_index < 0) {
 			/* The guest didn't have a mapping for it. */
 			kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ITLB_MISS);
 			vcpu->stat.itlb_real_miss_exits++;
@@ -341,6 +346,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
 		vcpu->stat.itlb_virt_miss_exits++;
 
+		gtlbe = &vcpu_44x->guest_tlb[gtlb_index];
 		gpaddr = tlb_xlate(gtlbe, eaddr);
 		gfn = gpaddr >> PAGE_SHIFT;
 
@@ -352,7 +358,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			 * Either way, we need to satisfy the fault without
 			 * invoking the guest. */
 			kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlbe->tid,
-			               gtlbe->word2, get_tlb_bytes(gtlbe));
+			               gtlbe->word2, get_tlb_bytes(gtlbe), gtlb_index);
 		} else {
 			/* Guest mapped and leaped at non-RAM! */
 			kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_MACHINE_CHECK);
diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S
index 8d6929b7fdb6..eb2186823e4e 100644
--- a/arch/powerpc/kvm/booke_interrupts.S
+++ b/arch/powerpc/kvm/booke_interrupts.S
@@ -335,54 +335,6 @@ lightweight_exit:
 	lwz	r3, VCPU_SHADOW_PID(r4)
 	mtspr	SPRN_PID, r3
 
-	/* Prevent all asynchronous TLB updates. */
-	mfmsr	r5
-	lis	r6, (MSR_EE|MSR_CE|MSR_ME|MSR_DE)@h
-	ori	r6, r6, (MSR_EE|MSR_CE|MSR_ME|MSR_DE)@l
-	andc	r6, r5, r6
-	mtmsr	r6
-
-	/* Load the guest mappings, leaving the host's "pinned" kernel mappings
-	 * in place. */
-	mfspr	r10, SPRN_MMUCR			/* Save host MMUCR. */
-	li	r5, PPC44x_TLB_SIZE
-	lis	r5, tlb_44x_hwater@ha
-	lwz	r5, tlb_44x_hwater@l(r5)
-	mtctr	r5
-	addi	r9, r4, -VCPU_TO_44X + VCPU44x_SHADOW_TLB
-	addi	r5, r4, -VCPU_TO_44X + VCPU44x_SHADOW_MOD
-	li	r3, 0
-1:
-	lbzx	r7, r3, r5
-	cmpwi	r7, 0
-	beq	3f
-
-	/* Load guest entry. */
-	mulli	r11, r3, TLBE_BYTES
-	add	r11, r11, r9
-	lwz	r7, 0(r11)
-	mtspr	SPRN_MMUCR, r7
-	lwz	r7, 4(r11)
-	tlbwe	r7, r3, PPC44x_TLB_PAGEID
-	lwz	r7, 8(r11)
-	tlbwe	r7, r3, PPC44x_TLB_XLAT
-	lwz	r7, 12(r11)
-	tlbwe	r7, r3, PPC44x_TLB_ATTRIB
-3:
-	addi	r3, r3, 1                       /* Increment index. */
-	bdnz	1b
-
-	mtspr	SPRN_MMUCR, r10			/* Restore host MMUCR. */
-
-	/* Clear bitmap of modified TLB entries */
-	li	r5, PPC44x_TLB_SIZE>>2
-	mtctr	r5
-	addi	r5, r4, -VCPU_TO_44X + VCPU44x_SHADOW_MOD - 4
-	li	r6, 0
-1:
-	stwu	r6, 4(r5)
-	bdnz	1b
-
 	iccci	0, 0 /* XXX hack */
 
 	/* Load some guest volatiles. */

From c5fbdffbda79254047ec83b09c1a61a3655d052a Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Tue, 2 Dec 2008 15:51:56 -0600
Subject: [PATCH 110/140] KVM: ppc: save and restore guest mappings on context
 switch

Store shadow TLB entries in memory, but only use it on host context switch
(instead of every guest entry). This improves performance for most workloads on
440 by reducing the guest TLB miss rate.

Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/powerpc/include/asm/kvm_44x.h |  6 ++++
 arch/powerpc/kvm/44x.c             |  7 ++--
 arch/powerpc/kvm/44x_tlb.c         | 58 ++++++++++++++++++++++++++++++
 3 files changed, 66 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_44x.h b/arch/powerpc/include/asm/kvm_44x.h
index e770ea2bbb1c..f49031b632ca 100644
--- a/arch/powerpc/include/asm/kvm_44x.h
+++ b/arch/powerpc/include/asm/kvm_44x.h
@@ -42,6 +42,10 @@ struct kvmppc_vcpu_44x {
 	/* References to guest pages in the hardware TLB. */
 	struct kvmppc_44x_shadow_ref shadow_refs[PPC44x_TLB_SIZE];
 
+	/* State of the shadow TLB at guest context switch time. */
+	struct kvmppc_44x_tlbe shadow_tlb[PPC44x_TLB_SIZE];
+	u8 shadow_tlb_mod[PPC44x_TLB_SIZE];
+
 	struct kvm_vcpu vcpu;
 };
 
@@ -51,5 +55,7 @@ static inline struct kvmppc_vcpu_44x *to_44x(struct kvm_vcpu *vcpu)
 }
 
 void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 new_pid);
+void kvmppc_44x_tlb_put(struct kvm_vcpu *vcpu);
+void kvmppc_44x_tlb_load(struct kvm_vcpu *vcpu);
 
 #endif /* __ASM_44X_H__ */
diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
index 05d72fc8b478..a66bec57265a 100644
--- a/arch/powerpc/kvm/44x.c
+++ b/arch/powerpc/kvm/44x.c
@@ -96,15 +96,12 @@ void kvmppc_core_load_guest_debugstate(struct kvm_vcpu *vcpu)
 
 void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
+	kvmppc_44x_tlb_load(vcpu);
 }
 
 void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu)
 {
-	/* XXX Since every guest uses TS=1 TID=0/1 mappings, we can't leave any TLB
-	 * entries around when we're descheduled, so we must completely flush the
-	 * TLB of all guest mappings. On the other hand, if there is only one
-	 * guest, this flush is completely unnecessary. */
-	_tlbia();
+	kvmppc_44x_tlb_put(vcpu);
 }
 
 int kvmppc_core_check_processor_compat(void)
diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index 2981ebea3d1f..ff16d0e38433 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -73,6 +73,25 @@ static inline void kvmppc_44x_tlbie(unsigned int index)
 	);
 }
 
+static inline void kvmppc_44x_tlbre(unsigned int index,
+                                    struct kvmppc_44x_tlbe *tlbe)
+{
+	asm volatile(
+		"tlbre %[word0], %[index], 0\n"
+		"mfspr %[tid], %[sprn_mmucr]\n"
+		"andi. %[tid], %[tid], 0xff\n"
+		"tlbre %[word1], %[index], 1\n"
+		"tlbre %[word2], %[index], 2\n"
+		: [word0] "=r"(tlbe->word0),
+		  [word1] "=r"(tlbe->word1),
+		  [word2] "=r"(tlbe->word2),
+		  [tid]   "=r"(tlbe->tid)
+		: [index] "r"(index),
+		  [sprn_mmucr] "i"(SPRN_MMUCR)
+		: "cc"
+	);
+}
+
 static inline void kvmppc_44x_tlbwe(unsigned int index,
                                     struct kvmppc_44x_tlbe *stlbe)
 {
@@ -116,6 +135,44 @@ static u32 kvmppc_44x_tlb_shadow_attrib(u32 attrib, int usermode)
 	return attrib;
 }
 
+/* Load shadow TLB back into hardware. */
+void kvmppc_44x_tlb_load(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
+	int i;
+
+	for (i = 0; i <= tlb_44x_hwater; i++) {
+		struct kvmppc_44x_tlbe *stlbe = &vcpu_44x->shadow_tlb[i];
+
+		if (get_tlb_v(stlbe) && get_tlb_ts(stlbe))
+			kvmppc_44x_tlbwe(i, stlbe);
+	}
+}
+
+static void kvmppc_44x_tlbe_set_modified(struct kvmppc_vcpu_44x *vcpu_44x,
+                                         unsigned int i)
+{
+	vcpu_44x->shadow_tlb_mod[i] = 1;
+}
+
+/* Save hardware TLB to the vcpu, and invalidate all guest mappings. */
+void kvmppc_44x_tlb_put(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_44x *vcpu_44x = to_44x(vcpu);
+	int i;
+
+	for (i = 0; i <= tlb_44x_hwater; i++) {
+		struct kvmppc_44x_tlbe *stlbe = &vcpu_44x->shadow_tlb[i];
+
+		if (vcpu_44x->shadow_tlb_mod[i])
+			kvmppc_44x_tlbre(i, stlbe);
+
+		if (get_tlb_v(stlbe) && get_tlb_ts(stlbe))
+			kvmppc_44x_tlbie(i);
+	}
+}
+
+
 /* Search the guest TLB for a matching entry. */
 int kvmppc_44x_tlb_index(struct kvm_vcpu *vcpu, gva_t eaddr, unsigned int pid,
                          unsigned int as)
@@ -283,6 +340,7 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr, u64 asid,
 	ref->tid = stlbe.tid;
 
 	/* Insert shadow mapping into hardware TLB. */
+	kvmppc_44x_tlbe_set_modified(vcpu_44x, victim);
 	kvmppc_44x_tlbwe(victim, &stlbe);
 	KVMTRACE_5D(STLB_WRITE, vcpu, victim, stlbe.tid, stlbe.word0, stlbe.word1,
 	            stlbe.word2, handler);

From 73e75b416ffcfa3a84952d8e389a0eca080f00e1 Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Tue, 2 Dec 2008 15:51:57 -0600
Subject: [PATCH 111/140] KVM: ppc: Implement in-kernel exit timing statistics

Existing KVM statistics are either just counters (kvm_stat) reported for
KVM generally or trace based aproaches like kvm_trace.
For KVM on powerpc we had the need to track the timings of the different exit
types. While this could be achieved parsing data created with a kvm_trace
extension this adds too much overhead (at least on embedded PowerPC) slowing
down the workloads we wanted to measure.

Therefore this patch adds a in-kernel exit timing statistic to the powerpc kvm
code. These statistic is available per vm&vcpu under the kvm debugfs directory.
As this statistic is low, but still some overhead it can be enabled via a
.config entry and should be off by default.

Since this patch touched all powerpc kvm_stat code anyway this code is now
merged and simplified together with the exit timing statistic code (still
working with exit timing disabled in .config).

Signed-off-by: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/powerpc/include/asm/kvm_host.h |  56 ++++++
 arch/powerpc/kernel/asm-offsets.c   |  11 ++
 arch/powerpc/kvm/44x_emulate.c      |  11 +-
 arch/powerpc/kvm/44x_tlb.c          |   3 +
 arch/powerpc/kvm/Kconfig            |  11 ++
 arch/powerpc/kvm/Makefile           |   1 +
 arch/powerpc/kvm/booke.c            |  36 ++--
 arch/powerpc/kvm/booke.h            |   5 +-
 arch/powerpc/kvm/booke_interrupts.S |  24 +++
 arch/powerpc/kvm/emulate.c          |   4 +
 arch/powerpc/kvm/powerpc.c          |   8 +-
 arch/powerpc/kvm/timing.c           | 262 ++++++++++++++++++++++++++++
 arch/powerpc/kvm/timing.h           | 102 +++++++++++
 13 files changed, 516 insertions(+), 18 deletions(-)
 create mode 100644 arch/powerpc/kvm/timing.c
 create mode 100644 arch/powerpc/kvm/timing.h

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index a4a7d5ef6cf8..2f5b49f2a98e 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -71,6 +71,49 @@ struct kvmppc_44x_tlbe {
 	u32 word2;
 };
 
+enum kvm_exit_types {
+	MMIO_EXITS,
+	DCR_EXITS,
+	SIGNAL_EXITS,
+	ITLB_REAL_MISS_EXITS,
+	ITLB_VIRT_MISS_EXITS,
+	DTLB_REAL_MISS_EXITS,
+	DTLB_VIRT_MISS_EXITS,
+	SYSCALL_EXITS,
+	ISI_EXITS,
+	DSI_EXITS,
+	EMULATED_INST_EXITS,
+	EMULATED_MTMSRWE_EXITS,
+	EMULATED_WRTEE_EXITS,
+	EMULATED_MTSPR_EXITS,
+	EMULATED_MFSPR_EXITS,
+	EMULATED_MTMSR_EXITS,
+	EMULATED_MFMSR_EXITS,
+	EMULATED_TLBSX_EXITS,
+	EMULATED_TLBWE_EXITS,
+	EMULATED_RFI_EXITS,
+	DEC_EXITS,
+	EXT_INTR_EXITS,
+	HALT_WAKEUP,
+	USR_PR_INST,
+	FP_UNAVAIL,
+	DEBUG_EXITS,
+	TIMEINGUEST,
+	__NUMBER_OF_KVM_EXIT_TYPES
+};
+
+#ifdef CONFIG_KVM_EXIT_TIMING
+/* allow access to big endian 32bit upper/lower parts and 64bit var */
+struct exit_timing {
+	union {
+		u64 tv64;
+		struct {
+			u32 tbu, tbl;
+		} tv32;
+	};
+};
+#endif
+
 struct kvm_arch {
 };
 
@@ -130,6 +173,19 @@ struct kvm_vcpu_arch {
 	u32 dbcr0;
 	u32 dbcr1;
 
+#ifdef CONFIG_KVM_EXIT_TIMING
+	struct exit_timing timing_exit;
+	struct exit_timing timing_last_enter;
+	u32 last_exit_type;
+	u32 timing_count_type[__NUMBER_OF_KVM_EXIT_TYPES];
+	u64 timing_sum_duration[__NUMBER_OF_KVM_EXIT_TYPES];
+	u64 timing_sum_quad_duration[__NUMBER_OF_KVM_EXIT_TYPES];
+	u64 timing_min_duration[__NUMBER_OF_KVM_EXIT_TYPES];
+	u64 timing_max_duration[__NUMBER_OF_KVM_EXIT_TYPES];
+	u64 timing_last_exit;
+	struct dentry *debugfs_exit_timing;
+#endif
+
 	u32 last_inst;
 	ulong fault_dear;
 	ulong fault_esr;
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index ba39526d3201..9937fe44555f 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -383,5 +383,16 @@ int main(void)
 	DEFINE(PTE_T_LOG2, PTE_T_LOG2);
 #endif
 
+#ifdef CONFIG_KVM_EXIT_TIMING
+	DEFINE(VCPU_TIMING_EXIT_TBU, offsetof(struct kvm_vcpu,
+						arch.timing_exit.tv32.tbu));
+	DEFINE(VCPU_TIMING_EXIT_TBL, offsetof(struct kvm_vcpu,
+						arch.timing_exit.tv32.tbl));
+	DEFINE(VCPU_TIMING_LAST_ENTER_TBU, offsetof(struct kvm_vcpu,
+					arch.timing_last_enter.tv32.tbu));
+	DEFINE(VCPU_TIMING_LAST_ENTER_TBL, offsetof(struct kvm_vcpu,
+					arch.timing_last_enter.tv32.tbl));
+#endif
+
 	return 0;
 }
diff --git a/arch/powerpc/kvm/44x_emulate.c b/arch/powerpc/kvm/44x_emulate.c
index 9ef79c78ede9..69f88d53c428 100644
--- a/arch/powerpc/kvm/44x_emulate.c
+++ b/arch/powerpc/kvm/44x_emulate.c
@@ -22,6 +22,7 @@
 #include <asm/dcr-regs.h>
 #include <asm/disassemble.h>
 #include <asm/kvm_44x.h>
+#include "timing.h"
 
 #include "booke.h"
 #include "44x_tlb.h"
@@ -58,11 +59,11 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	int ws;
 
 	switch (get_op(inst)) {
-
 	case OP_RFI:
 		switch (get_xop(inst)) {
 		case XOP_RFI:
 			kvmppc_emul_rfi(vcpu);
+			kvmppc_set_exit_type(vcpu, EMULATED_RFI_EXITS);
 			*advance = 0;
 			break;
 
@@ -78,10 +79,12 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		case XOP_MFMSR:
 			rt = get_rt(inst);
 			vcpu->arch.gpr[rt] = vcpu->arch.msr;
+			kvmppc_set_exit_type(vcpu, EMULATED_MFMSR_EXITS);
 			break;
 
 		case XOP_MTMSR:
 			rs = get_rs(inst);
+			kvmppc_set_exit_type(vcpu, EMULATED_MTMSR_EXITS);
 			kvmppc_set_msr(vcpu, vcpu->arch.gpr[rs]);
 			break;
 
@@ -89,11 +92,13 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			rs = get_rs(inst);
 			vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE)
 							 | (vcpu->arch.gpr[rs] & MSR_EE);
+			kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS);
 			break;
 
 		case XOP_WRTEEI:
 			vcpu->arch.msr = (vcpu->arch.msr & ~MSR_EE)
 							 | (inst & MSR_EE);
+			kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS);
 			break;
 
 		case XOP_MFDCR:
@@ -127,6 +132,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 				run->dcr.is_write = 0;
 				vcpu->arch.io_gpr = rt;
 				vcpu->arch.dcr_needed = 1;
+				account_exit(vcpu, DCR_EXITS);
 				emulated = EMULATE_DO_DCR;
 			}
 
@@ -146,6 +152,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 				run->dcr.data = vcpu->arch.gpr[rs];
 				run->dcr.is_write = 1;
 				vcpu->arch.dcr_needed = 1;
+				account_exit(vcpu, DCR_EXITS);
 				emulated = EMULATE_DO_DCR;
 			}
 
@@ -276,6 +283,7 @@ int kvmppc_core_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
 		return EMULATE_FAIL;
 	}
 
+	kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS);
 	return EMULATE_DONE;
 }
 
@@ -357,6 +365,7 @@ int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt)
 		return EMULATE_FAIL;
 	}
 
+	kvmppc_set_exit_type(vcpu, EMULATED_MFSPR_EXITS);
 	return EMULATE_DONE;
 }
 
diff --git a/arch/powerpc/kvm/44x_tlb.c b/arch/powerpc/kvm/44x_tlb.c
index ff16d0e38433..9a34b8edb9e2 100644
--- a/arch/powerpc/kvm/44x_tlb.c
+++ b/arch/powerpc/kvm/44x_tlb.c
@@ -27,6 +27,7 @@
 #include <asm/mmu-44x.h>
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_44x.h>
+#include "timing.h"
 
 #include "44x_tlb.h"
 
@@ -470,6 +471,7 @@ int kvmppc_44x_emul_tlbwe(struct kvm_vcpu *vcpu, u8 ra, u8 rs, u8 ws)
 	KVMTRACE_5D(GTLB_WRITE, vcpu, gtlb_index, tlbe->tid, tlbe->word0,
 	            tlbe->word1, tlbe->word2, handler);
 
+	kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS);
 	return EMULATE_DONE;
 }
 
@@ -493,5 +495,6 @@ int kvmppc_44x_emul_tlbsx(struct kvm_vcpu *vcpu, u8 rt, u8 ra, u8 rb, u8 rc)
 	}
 	vcpu->arch.gpr[rt] = gtlb_index;
 
+	kvmppc_set_exit_type(vcpu, EMULATED_TLBSX_EXITS);
 	return EMULATE_DONE;
 }
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index e4ab1c7fd925..6dbdc4817d80 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -32,6 +32,17 @@ config KVM_440
 
 	  If unsure, say N.
 
+config KVM_EXIT_TIMING
+	bool "Detailed exit timing"
+	depends on KVM
+	---help---
+	  Calculate elapsed time for every exit/enter cycle. A per-vcpu
+	  report is available in debugfs kvm/vm#_vcpu#_timing.
+	  The overhead is relatively small, however it is not recommended for
+	  production environments.
+
+	  If unsure, say N.
+
 config KVM_TRACE
 	bool "KVM trace support"
 	depends on KVM && MARKERS && SYSFS
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index f045fad0f4f1..df7ba59e6d53 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -9,6 +9,7 @@ common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o)
 common-objs-$(CONFIG_KVM_TRACE)  += $(addprefix ../../../virt/kvm/, kvm_trace.o)
 
 kvm-objs := $(common-objs-y) powerpc.o emulate.o
+obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o
 obj-$(CONFIG_KVM) += kvm.o
 
 AFLAGS_booke_interrupts.o := -I$(obj)
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index eb24383c87d2..0f171248e450 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -28,6 +28,7 @@
 #include <asm/cputable.h>
 #include <asm/uaccess.h>
 #include <asm/kvm_ppc.h>
+#include "timing.h"
 #include <asm/cacheflush.h>
 #include <asm/kvm_44x.h>
 
@@ -185,6 +186,9 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	enum emulation_result er;
 	int r = RESUME_HOST;
 
+	/* update before a new last_exit_type is rewritten */
+	kvmppc_update_timing_stats(vcpu);
+
 	local_irq_enable();
 
 	run->exit_reason = KVM_EXIT_UNKNOWN;
@@ -198,7 +202,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		break;
 
 	case BOOKE_INTERRUPT_EXTERNAL:
-		vcpu->stat.ext_intr_exits++;
+		account_exit(vcpu, EXT_INTR_EXITS);
 		if (need_resched())
 			cond_resched();
 		r = RESUME_GUEST;
@@ -208,8 +212,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		/* Since we switched IVPR back to the host's value, the host
 		 * handled this interrupt the moment we enabled interrupts.
 		 * Now we just offer it a chance to reschedule the guest. */
-
-		vcpu->stat.dec_exits++;
+		account_exit(vcpu, DEC_EXITS);
 		if (need_resched())
 			cond_resched();
 		r = RESUME_GUEST;
@@ -222,20 +225,21 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			vcpu->arch.esr = vcpu->arch.fault_esr;
 			kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_PROGRAM);
 			r = RESUME_GUEST;
+			account_exit(vcpu, USR_PR_INST);
 			break;
 		}
 
 		er = kvmppc_emulate_instruction(run, vcpu);
 		switch (er) {
 		case EMULATE_DONE:
+			/* don't overwrite subtypes, just account kvm_stats */
+			account_exit_stat(vcpu, EMULATED_INST_EXITS);
 			/* Future optimization: only reload non-volatiles if
 			 * they were actually modified by emulation. */
-			vcpu->stat.emulated_inst_exits++;
 			r = RESUME_GUEST_NV;
 			break;
 		case EMULATE_DO_DCR:
 			run->exit_reason = KVM_EXIT_DCR;
-			vcpu->stat.dcr_exits++;
 			r = RESUME_HOST;
 			break;
 		case EMULATE_FAIL:
@@ -255,6 +259,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
 	case BOOKE_INTERRUPT_FP_UNAVAIL:
 		kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_FP_UNAVAIL);
+		account_exit(vcpu, FP_UNAVAIL);
 		r = RESUME_GUEST;
 		break;
 
@@ -262,20 +267,20 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		vcpu->arch.dear = vcpu->arch.fault_dear;
 		vcpu->arch.esr = vcpu->arch.fault_esr;
 		kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DATA_STORAGE);
-		vcpu->stat.dsi_exits++;
+		account_exit(vcpu, DSI_EXITS);
 		r = RESUME_GUEST;
 		break;
 
 	case BOOKE_INTERRUPT_INST_STORAGE:
 		vcpu->arch.esr = vcpu->arch.fault_esr;
 		kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_INST_STORAGE);
-		vcpu->stat.isi_exits++;
+		account_exit(vcpu, ISI_EXITS);
 		r = RESUME_GUEST;
 		break;
 
 	case BOOKE_INTERRUPT_SYSCALL:
 		kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SYSCALL);
-		vcpu->stat.syscall_exits++;
+		account_exit(vcpu, SYSCALL_EXITS);
 		r = RESUME_GUEST;
 		break;
 
@@ -294,7 +299,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DTLB_MISS);
 			vcpu->arch.dear = vcpu->arch.fault_dear;
 			vcpu->arch.esr = vcpu->arch.fault_esr;
-			vcpu->stat.dtlb_real_miss_exits++;
+			account_exit(vcpu, DTLB_REAL_MISS_EXITS);
 			r = RESUME_GUEST;
 			break;
 		}
@@ -312,13 +317,13 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			 * invoking the guest. */
 			kvmppc_mmu_map(vcpu, eaddr, vcpu->arch.paddr_accessed, gtlbe->tid,
 			               gtlbe->word2, get_tlb_bytes(gtlbe), gtlb_index);
-			vcpu->stat.dtlb_virt_miss_exits++;
+			account_exit(vcpu, DTLB_VIRT_MISS_EXITS);
 			r = RESUME_GUEST;
 		} else {
 			/* Guest has mapped and accessed a page which is not
 			 * actually RAM. */
 			r = kvmppc_emulate_mmio(run, vcpu);
-			vcpu->stat.mmio_exits++;
+			account_exit(vcpu, MMIO_EXITS);
 		}
 
 		break;
@@ -340,11 +345,11 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		if (gtlb_index < 0) {
 			/* The guest didn't have a mapping for it. */
 			kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ITLB_MISS);
-			vcpu->stat.itlb_real_miss_exits++;
+			account_exit(vcpu, ITLB_REAL_MISS_EXITS);
 			break;
 		}
 
-		vcpu->stat.itlb_virt_miss_exits++;
+		account_exit(vcpu, ITLB_VIRT_MISS_EXITS);
 
 		gtlbe = &vcpu_44x->guest_tlb[gtlb_index];
 		gpaddr = tlb_xlate(gtlbe, eaddr);
@@ -378,6 +383,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		mtspr(SPRN_DBSR, dbsr);
 
 		run->exit_reason = KVM_EXIT_DEBUG;
+		account_exit(vcpu, DEBUG_EXITS);
 		r = RESUME_HOST;
 		break;
 	}
@@ -398,7 +404,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		if (signal_pending(current)) {
 			run->exit_reason = KVM_EXIT_INTR;
 			r = (-EINTR << 2) | RESUME_HOST | (r & RESUME_FLAG_NV);
-			vcpu->stat.signal_exits++;
+			account_exit(vcpu, SIGNAL_EXITS);
 		}
 	}
 
@@ -418,6 +424,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	 * before it's programmed its own IVPR. */
 	vcpu->arch.ivpr = 0x55550000;
 
+	kvmppc_init_timing_stats(vcpu);
+
 	return kvmppc_core_vcpu_setup(vcpu);
 }
 
diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h
index 48d905fd60ab..cf7c94ca24bf 100644
--- a/arch/powerpc/kvm/booke.h
+++ b/arch/powerpc/kvm/booke.h
@@ -22,6 +22,7 @@
 
 #include <linux/types.h>
 #include <linux/kvm_host.h>
+#include "timing.h"
 
 /* interrupt priortity ordering */
 #define BOOKE_IRQPRIO_DATA_STORAGE 0
@@ -50,8 +51,10 @@ static inline void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr)
 
 	vcpu->arch.msr = new_msr;
 
-	if (vcpu->arch.msr & MSR_WE)
+	if (vcpu->arch.msr & MSR_WE) {
 		kvm_vcpu_block(vcpu);
+		kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS);
+	};
 }
 
 #endif /* __KVM_BOOKE_H__ */
diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S
index eb2186823e4e..084ebcd7dd83 100644
--- a/arch/powerpc/kvm/booke_interrupts.S
+++ b/arch/powerpc/kvm/booke_interrupts.S
@@ -107,6 +107,18 @@ _GLOBAL(kvmppc_resume_host)
 	li	r6, 1
 	slw	r6, r6, r5
 
+#ifdef CONFIG_KVM_EXIT_TIMING
+	/* save exit time */
+1:
+	mfspr	r7, SPRN_TBRU
+	mfspr	r8, SPRN_TBRL
+	mfspr	r9, SPRN_TBRU
+	cmpw	r9, r7
+	bne	1b
+	stw	r8, VCPU_TIMING_EXIT_TBL(r4)
+	stw	r9, VCPU_TIMING_EXIT_TBU(r4)
+#endif
+
 	/* Save the faulting instruction and all GPRs for emulation. */
 	andi.	r7, r6, NEED_INST_MASK
 	beq	..skip_inst_copy
@@ -375,6 +387,18 @@ lightweight_exit:
 	lwz	r3, VCPU_SPRG7(r4)
 	mtspr	SPRN_SPRG7, r3
 
+#ifdef CONFIG_KVM_EXIT_TIMING
+	/* save enter time */
+1:
+	mfspr	r6, SPRN_TBRU
+	mfspr	r7, SPRN_TBRL
+	mfspr	r8, SPRN_TBRU
+	cmpw	r8, r6
+	bne	1b
+	stw	r7, VCPU_TIMING_LAST_ENTER_TBL(r4)
+	stw	r8, VCPU_TIMING_LAST_ENTER_TBU(r4)
+#endif
+
 	/* Finish loading guest volatiles and jump to guest. */
 	lwz	r3, VCPU_CTR(r4)
 	mtctr	r3
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index 4c30fa0c31ea..d1d38daa93fb 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -28,6 +28,7 @@
 #include <asm/byteorder.h>
 #include <asm/kvm_ppc.h>
 #include <asm/disassemble.h>
+#include "timing.h"
 
 void kvmppc_emulate_dec(struct kvm_vcpu *vcpu)
 {
@@ -73,6 +74,9 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	enum emulation_result emulated = EMULATE_DONE;
 	int advance = 1;
 
+	/* this default type might be overwritten by subcategories */
+	kvmppc_set_exit_type(vcpu, EMULATED_INST_EXITS);
+
 	switch (get_op(inst)) {
 	case 3:                                             /* trap */
 		vcpu->arch.esr |= ESR_PTR;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 7ad150e0fbbf..1deda37cb771 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -28,9 +28,9 @@
 #include <asm/uaccess.h>
 #include <asm/kvm_ppc.h>
 #include <asm/tlbflush.h>
+#include "timing.h"
 #include "../mm/mmu_decl.h"
 
-
 gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
 {
 	return gfn;
@@ -171,11 +171,15 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
 
 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
 {
-	return kvmppc_core_vcpu_create(kvm, id);
+	struct kvm_vcpu *vcpu;
+	vcpu = kvmppc_core_vcpu_create(kvm, id);
+	kvmppc_create_vcpu_debugfs(vcpu, id);
+	return vcpu;
 }
 
 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 {
+	kvmppc_remove_vcpu_debugfs(vcpu);
 	kvmppc_core_vcpu_free(vcpu);
 }
 
diff --git a/arch/powerpc/kvm/timing.c b/arch/powerpc/kvm/timing.c
new file mode 100644
index 000000000000..f42d2728a6a5
--- /dev/null
+++ b/arch/powerpc/kvm/timing.c
@@ -0,0 +1,262 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2007
+ *
+ * Authors: Hollis Blanchard <hollisb@us.ibm.com>
+ *          Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+
+#include "timing.h"
+#include <asm/time.h>
+#include <asm-generic/div64.h>
+
+void kvmppc_init_timing_stats(struct kvm_vcpu *vcpu)
+{
+	int i;
+
+	/* pause guest execution to avoid concurrent updates */
+	local_irq_disable();
+	mutex_lock(&vcpu->mutex);
+
+	vcpu->arch.last_exit_type = 0xDEAD;
+	for (i = 0; i < __NUMBER_OF_KVM_EXIT_TYPES; i++) {
+		vcpu->arch.timing_count_type[i] = 0;
+		vcpu->arch.timing_max_duration[i] = 0;
+		vcpu->arch.timing_min_duration[i] = 0xFFFFFFFF;
+		vcpu->arch.timing_sum_duration[i] = 0;
+		vcpu->arch.timing_sum_quad_duration[i] = 0;
+	}
+	vcpu->arch.timing_last_exit = 0;
+	vcpu->arch.timing_exit.tv64 = 0;
+	vcpu->arch.timing_last_enter.tv64 = 0;
+
+	mutex_unlock(&vcpu->mutex);
+	local_irq_enable();
+}
+
+static void add_exit_timing(struct kvm_vcpu *vcpu,
+					u64 duration, int type)
+{
+	u64 old;
+
+	do_div(duration, tb_ticks_per_usec);
+	if (unlikely(duration > 0xFFFFFFFF)) {
+		printk(KERN_ERR"%s - duration too big -> overflow"
+			" duration %lld type %d exit #%d\n",
+			__func__, duration, type,
+			vcpu->arch.timing_count_type[type]);
+		return;
+	}
+
+	vcpu->arch.timing_count_type[type]++;
+
+	/* sum */
+	old = vcpu->arch.timing_sum_duration[type];
+	vcpu->arch.timing_sum_duration[type] += duration;
+	if (unlikely(old > vcpu->arch.timing_sum_duration[type])) {
+		printk(KERN_ERR"%s - wrap adding sum of durations"
+			" old %lld new %lld type %d exit # of type %d\n",
+			__func__, old, vcpu->arch.timing_sum_duration[type],
+			type, vcpu->arch.timing_count_type[type]);
+	}
+
+	/* square sum */
+	old = vcpu->arch.timing_sum_quad_duration[type];
+	vcpu->arch.timing_sum_quad_duration[type] += (duration*duration);
+	if (unlikely(old > vcpu->arch.timing_sum_quad_duration[type])) {
+		printk(KERN_ERR"%s - wrap adding sum of squared durations"
+			" old %lld new %lld type %d exit # of type %d\n",
+			__func__, old,
+			vcpu->arch.timing_sum_quad_duration[type],
+			type, vcpu->arch.timing_count_type[type]);
+	}
+
+	/* set min/max */
+	if (unlikely(duration < vcpu->arch.timing_min_duration[type]))
+		vcpu->arch.timing_min_duration[type] = duration;
+	if (unlikely(duration > vcpu->arch.timing_max_duration[type]))
+		vcpu->arch.timing_max_duration[type] = duration;
+}
+
+void kvmppc_update_timing_stats(struct kvm_vcpu *vcpu)
+{
+	u64 exit = vcpu->arch.timing_last_exit;
+	u64 enter = vcpu->arch.timing_last_enter.tv64;
+
+	/* save exit time, used next exit when the reenter time is known */
+	vcpu->arch.timing_last_exit = vcpu->arch.timing_exit.tv64;
+
+	if (unlikely(vcpu->arch.last_exit_type == 0xDEAD || exit == 0))
+		return; /* skip incomplete cycle (e.g. after reset) */
+
+	/* update statistics for average and standard deviation */
+	add_exit_timing(vcpu, (enter - exit), vcpu->arch.last_exit_type);
+	/* enter -> timing_last_exit is time spent in guest - log this too */
+	add_exit_timing(vcpu, (vcpu->arch.timing_last_exit - enter),
+			TIMEINGUEST);
+}
+
+static const char *kvm_exit_names[__NUMBER_OF_KVM_EXIT_TYPES] = {
+	[MMIO_EXITS] = 			"MMIO",
+	[DCR_EXITS] =			"DCR",
+	[SIGNAL_EXITS] =		"SIGNAL",
+	[ITLB_REAL_MISS_EXITS] =	"ITLBREAL",
+	[ITLB_VIRT_MISS_EXITS] =	"ITLBVIRT",
+	[DTLB_REAL_MISS_EXITS] =	"DTLBREAL",
+	[DTLB_VIRT_MISS_EXITS] =	"DTLBVIRT",
+	[SYSCALL_EXITS] =		"SYSCALL",
+	[ISI_EXITS] =			"ISI",
+	[DSI_EXITS] =			"DSI",
+	[EMULATED_INST_EXITS] =		"EMULINST",
+	[EMULATED_MTMSRWE_EXITS] =	"EMUL_WAIT",
+	[EMULATED_WRTEE_EXITS] =	"EMUL_WRTEE",
+	[EMULATED_MTSPR_EXITS] =	"EMUL_MTSPR",
+	[EMULATED_MFSPR_EXITS] =	"EMUL_MFSPR",
+	[EMULATED_MTMSR_EXITS] =	"EMUL_MTMSR",
+	[EMULATED_MFMSR_EXITS] =	"EMUL_MFMSR",
+	[EMULATED_TLBSX_EXITS] =	"EMUL_TLBSX",
+	[EMULATED_TLBWE_EXITS] =	"EMUL_TLBWE",
+	[EMULATED_RFI_EXITS] =		"EMUL_RFI",
+	[DEC_EXITS] =			"DEC",
+	[EXT_INTR_EXITS] =		"EXTINT",
+	[HALT_WAKEUP] =			"HALT",
+	[USR_PR_INST] =			"USR_PR_INST",
+	[FP_UNAVAIL] =			"FP_UNAVAIL",
+	[DEBUG_EXITS] =			"DEBUG",
+	[TIMEINGUEST] =			"TIMEINGUEST"
+};
+
+static int kvmppc_exit_timing_show(struct seq_file *m, void *private)
+{
+	struct kvm_vcpu *vcpu = m->private;
+	int i;
+	u64 min, max;
+
+	for (i = 0; i < __NUMBER_OF_KVM_EXIT_TYPES; i++) {
+		if (vcpu->arch.timing_min_duration[i] == 0xFFFFFFFF)
+			min = 0;
+		else
+			min = vcpu->arch.timing_min_duration[i];
+		if (vcpu->arch.timing_max_duration[i] == 0)
+			max = 0;
+		else
+			max = vcpu->arch.timing_max_duration[i];
+
+		seq_printf(m, "%12s: count %10d min %10lld "
+			"max %10lld sum %20lld sum_quad %20lld\n",
+			kvm_exit_names[i], vcpu->arch.timing_count_type[i],
+			vcpu->arch.timing_min_duration[i],
+			vcpu->arch.timing_max_duration[i],
+			vcpu->arch.timing_sum_duration[i],
+			vcpu->arch.timing_sum_quad_duration[i]);
+	}
+	return 0;
+}
+
+static ssize_t kvmppc_exit_timing_write(struct file *file,
+				       const char __user *user_buf,
+				       size_t count, loff_t *ppos)
+{
+	size_t len;
+	int err;
+	const char __user *p;
+	char c;
+
+	len = 0;
+	p = user_buf;
+	while (len < count) {
+		if (get_user(c, p++))
+			err = -EFAULT;
+		if (c == 0 || c == '\n')
+			break;
+		len++;
+	}
+
+	if (len > 1) {
+		err = -EINVAL;
+		goto done;
+	}
+
+	if (copy_from_user(&c, user_buf, sizeof(c))) {
+		err = -EFAULT;
+		goto done;
+	}
+
+	if (c == 'c') {
+		struct seq_file *seqf = (struct seq_file *)file->private_data;
+		struct kvm_vcpu *vcpu = seqf->private;
+		/* write does not affect out buffers previsously generated with
+		 * show. Seq file is locked here to prevent races of init with
+		 * a show call */
+		mutex_lock(&seqf->lock);
+		kvmppc_init_timing_stats(vcpu);
+		mutex_unlock(&seqf->lock);
+		err = count;
+	} else {
+		err = -EINVAL;
+		goto done;
+	}
+
+done:
+	return err;
+}
+
+static int kvmppc_exit_timing_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, kvmppc_exit_timing_show, inode->i_private);
+}
+
+static struct file_operations kvmppc_exit_timing_fops = {
+	.owner   = THIS_MODULE,
+	.open    = kvmppc_exit_timing_open,
+	.read    = seq_read,
+	.write   = kvmppc_exit_timing_write,
+	.llseek  = seq_lseek,
+	.release = single_release,
+};
+
+void kvmppc_create_vcpu_debugfs(struct kvm_vcpu *vcpu, unsigned int id)
+{
+	static char dbg_fname[50];
+	struct dentry *debugfs_file;
+
+	snprintf(dbg_fname, sizeof(dbg_fname), "vm%u_vcpu%03u_timing",
+		 current->pid, id);
+	debugfs_file = debugfs_create_file(dbg_fname, 0666,
+					kvm_debugfs_dir, vcpu,
+					&kvmppc_exit_timing_fops);
+
+	if (!debugfs_file) {
+		printk(KERN_ERR"%s: error creating debugfs file %s\n",
+			__func__, dbg_fname);
+		return;
+	}
+
+	vcpu->arch.debugfs_exit_timing = debugfs_file;
+}
+
+void kvmppc_remove_vcpu_debugfs(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->arch.debugfs_exit_timing) {
+		debugfs_remove(vcpu->arch.debugfs_exit_timing);
+		vcpu->arch.debugfs_exit_timing = NULL;
+	}
+}
diff --git a/arch/powerpc/kvm/timing.h b/arch/powerpc/kvm/timing.h
new file mode 100644
index 000000000000..1af7181fa2b5
--- /dev/null
+++ b/arch/powerpc/kvm/timing.h
@@ -0,0 +1,102 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * Authors: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
+ */
+
+#ifndef __POWERPC_KVM_EXITTIMING_H__
+#define __POWERPC_KVM_EXITTIMING_H__
+
+#include <linux/kvm_host.h>
+#include <asm/kvm_host.h>
+
+#ifdef CONFIG_KVM_EXIT_TIMING
+void kvmppc_init_timing_stats(struct kvm_vcpu *vcpu);
+void kvmppc_update_timing_stats(struct kvm_vcpu *vcpu);
+void kvmppc_create_vcpu_debugfs(struct kvm_vcpu *vcpu, unsigned int id);
+void kvmppc_remove_vcpu_debugfs(struct kvm_vcpu *vcpu);
+
+static inline void kvmppc_set_exit_type(struct kvm_vcpu *vcpu, int type)
+{
+	vcpu->arch.last_exit_type = type;
+}
+
+#else
+/* if exit timing is not configured there is no need to build the c file */
+static inline void kvmppc_init_timing_stats(struct kvm_vcpu *vcpu) {}
+static inline void kvmppc_update_timing_stats(struct kvm_vcpu *vcpu) {}
+static inline void kvmppc_create_vcpu_debugfs(struct kvm_vcpu *vcpu,
+						unsigned int id) {}
+static inline void kvmppc_remove_vcpu_debugfs(struct kvm_vcpu *vcpu) {}
+static inline void kvmppc_set_exit_type(struct kvm_vcpu *vcpu, int type) {}
+#endif /* CONFIG_KVM_EXIT_TIMING */
+
+/* account the exit in kvm_stats */
+static inline void account_exit_stat(struct kvm_vcpu *vcpu, int type)
+{
+	/* type has to be known at build time for optimization */
+	BUILD_BUG_ON(__builtin_constant_p(type));
+	switch (type) {
+	case EXT_INTR_EXITS:
+		vcpu->stat.ext_intr_exits++;
+		break;
+	case DEC_EXITS:
+		vcpu->stat.dec_exits++;
+		break;
+	case EMULATED_INST_EXITS:
+		vcpu->stat.emulated_inst_exits++;
+		break;
+	case DCR_EXITS:
+		vcpu->stat.dcr_exits++;
+		break;
+	case DSI_EXITS:
+		vcpu->stat.dsi_exits++;
+		break;
+	case ISI_EXITS:
+		vcpu->stat.isi_exits++;
+		break;
+	case SYSCALL_EXITS:
+		vcpu->stat.syscall_exits++;
+		break;
+	case DTLB_REAL_MISS_EXITS:
+		vcpu->stat.dtlb_real_miss_exits++;
+		break;
+	case DTLB_VIRT_MISS_EXITS:
+		vcpu->stat.dtlb_virt_miss_exits++;
+		break;
+	case MMIO_EXITS:
+		vcpu->stat.mmio_exits++;
+		break;
+	case ITLB_REAL_MISS_EXITS:
+		vcpu->stat.itlb_real_miss_exits++;
+		break;
+	case ITLB_VIRT_MISS_EXITS:
+		vcpu->stat.itlb_virt_miss_exits++;
+		break;
+	case SIGNAL_EXITS:
+		vcpu->stat.signal_exits++;
+		break;
+	}
+}
+
+/* wrapper to set exit time and account for it in kvm_stats */
+static inline void account_exit(struct kvm_vcpu *vcpu, int type)
+{
+	kvmppc_set_exit_type(vcpu, type);
+	account_exit_stat(vcpu, type);
+}
+
+#endif /* __POWERPC_KVM_EXITTIMING_H__ */

From 7b7015914b30ad8d9136d41412c5129b9bc9af70 Mon Sep 17 00:00:00 2001
From: Hollis Blanchard <hollisb@us.ibm.com>
Date: Tue, 2 Dec 2008 15:51:58 -0600
Subject: [PATCH 112/140] KVM: ppc: mostly cosmetic updates to the exit timing
 accounting code

The only significant changes were to kvmppc_exit_timing_write() and
kvmppc_exit_timing_show(), both of which were dramatically simplified.

Signed-off-by: Hollis Blanchard <hollisb@us.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/powerpc/include/asm/kvm_host.h |   8 +-
 arch/powerpc/kvm/44x_emulate.c      |   4 +-
 arch/powerpc/kvm/booke.c            |  30 ++++----
 arch/powerpc/kvm/timing.c           | 109 +++++++++++-----------------
 arch/powerpc/kvm/timing.h           |   6 +-
 5 files changed, 66 insertions(+), 91 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 2f5b49f2a98e..c1e436fe7738 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -102,9 +102,8 @@ enum kvm_exit_types {
 	__NUMBER_OF_KVM_EXIT_TYPES
 };
 
-#ifdef CONFIG_KVM_EXIT_TIMING
 /* allow access to big endian 32bit upper/lower parts and 64bit var */
-struct exit_timing {
+struct kvmppc_exit_timing {
 	union {
 		u64 tv64;
 		struct {
@@ -112,7 +111,6 @@ struct exit_timing {
 		} tv32;
 	};
 };
-#endif
 
 struct kvm_arch {
 };
@@ -174,8 +172,8 @@ struct kvm_vcpu_arch {
 	u32 dbcr1;
 
 #ifdef CONFIG_KVM_EXIT_TIMING
-	struct exit_timing timing_exit;
-	struct exit_timing timing_last_enter;
+	struct kvmppc_exit_timing timing_exit;
+	struct kvmppc_exit_timing timing_last_enter;
 	u32 last_exit_type;
 	u32 timing_count_type[__NUMBER_OF_KVM_EXIT_TYPES];
 	u64 timing_sum_duration[__NUMBER_OF_KVM_EXIT_TYPES];
diff --git a/arch/powerpc/kvm/44x_emulate.c b/arch/powerpc/kvm/44x_emulate.c
index 69f88d53c428..82489a743a6f 100644
--- a/arch/powerpc/kvm/44x_emulate.c
+++ b/arch/powerpc/kvm/44x_emulate.c
@@ -132,7 +132,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 				run->dcr.is_write = 0;
 				vcpu->arch.io_gpr = rt;
 				vcpu->arch.dcr_needed = 1;
-				account_exit(vcpu, DCR_EXITS);
+				kvmppc_account_exit(vcpu, DCR_EXITS);
 				emulated = EMULATE_DO_DCR;
 			}
 
@@ -152,7 +152,7 @@ int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 				run->dcr.data = vcpu->arch.gpr[rs];
 				run->dcr.is_write = 1;
 				vcpu->arch.dcr_needed = 1;
-				account_exit(vcpu, DCR_EXITS);
+				kvmppc_account_exit(vcpu, DCR_EXITS);
 				emulated = EMULATE_DO_DCR;
 			}
 
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 0f171248e450..35485dd6927e 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -202,7 +202,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		break;
 
 	case BOOKE_INTERRUPT_EXTERNAL:
-		account_exit(vcpu, EXT_INTR_EXITS);
+		kvmppc_account_exit(vcpu, EXT_INTR_EXITS);
 		if (need_resched())
 			cond_resched();
 		r = RESUME_GUEST;
@@ -212,7 +212,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		/* Since we switched IVPR back to the host's value, the host
 		 * handled this interrupt the moment we enabled interrupts.
 		 * Now we just offer it a chance to reschedule the guest. */
-		account_exit(vcpu, DEC_EXITS);
+		kvmppc_account_exit(vcpu, DEC_EXITS);
 		if (need_resched())
 			cond_resched();
 		r = RESUME_GUEST;
@@ -225,7 +225,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			vcpu->arch.esr = vcpu->arch.fault_esr;
 			kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_PROGRAM);
 			r = RESUME_GUEST;
-			account_exit(vcpu, USR_PR_INST);
+			kvmppc_account_exit(vcpu, USR_PR_INST);
 			break;
 		}
 
@@ -233,7 +233,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		switch (er) {
 		case EMULATE_DONE:
 			/* don't overwrite subtypes, just account kvm_stats */
-			account_exit_stat(vcpu, EMULATED_INST_EXITS);
+			kvmppc_account_exit_stat(vcpu, EMULATED_INST_EXITS);
 			/* Future optimization: only reload non-volatiles if
 			 * they were actually modified by emulation. */
 			r = RESUME_GUEST_NV;
@@ -259,7 +259,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
 	case BOOKE_INTERRUPT_FP_UNAVAIL:
 		kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_FP_UNAVAIL);
-		account_exit(vcpu, FP_UNAVAIL);
+		kvmppc_account_exit(vcpu, FP_UNAVAIL);
 		r = RESUME_GUEST;
 		break;
 
@@ -267,20 +267,20 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		vcpu->arch.dear = vcpu->arch.fault_dear;
 		vcpu->arch.esr = vcpu->arch.fault_esr;
 		kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DATA_STORAGE);
-		account_exit(vcpu, DSI_EXITS);
+		kvmppc_account_exit(vcpu, DSI_EXITS);
 		r = RESUME_GUEST;
 		break;
 
 	case BOOKE_INTERRUPT_INST_STORAGE:
 		vcpu->arch.esr = vcpu->arch.fault_esr;
 		kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_INST_STORAGE);
-		account_exit(vcpu, ISI_EXITS);
+		kvmppc_account_exit(vcpu, ISI_EXITS);
 		r = RESUME_GUEST;
 		break;
 
 	case BOOKE_INTERRUPT_SYSCALL:
 		kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SYSCALL);
-		account_exit(vcpu, SYSCALL_EXITS);
+		kvmppc_account_exit(vcpu, SYSCALL_EXITS);
 		r = RESUME_GUEST;
 		break;
 
@@ -299,7 +299,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DTLB_MISS);
 			vcpu->arch.dear = vcpu->arch.fault_dear;
 			vcpu->arch.esr = vcpu->arch.fault_esr;
-			account_exit(vcpu, DTLB_REAL_MISS_EXITS);
+			kvmppc_account_exit(vcpu, DTLB_REAL_MISS_EXITS);
 			r = RESUME_GUEST;
 			break;
 		}
@@ -317,13 +317,13 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			 * invoking the guest. */
 			kvmppc_mmu_map(vcpu, eaddr, vcpu->arch.paddr_accessed, gtlbe->tid,
 			               gtlbe->word2, get_tlb_bytes(gtlbe), gtlb_index);
-			account_exit(vcpu, DTLB_VIRT_MISS_EXITS);
+			kvmppc_account_exit(vcpu, DTLB_VIRT_MISS_EXITS);
 			r = RESUME_GUEST;
 		} else {
 			/* Guest has mapped and accessed a page which is not
 			 * actually RAM. */
 			r = kvmppc_emulate_mmio(run, vcpu);
-			account_exit(vcpu, MMIO_EXITS);
+			kvmppc_account_exit(vcpu, MMIO_EXITS);
 		}
 
 		break;
@@ -345,11 +345,11 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		if (gtlb_index < 0) {
 			/* The guest didn't have a mapping for it. */
 			kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ITLB_MISS);
-			account_exit(vcpu, ITLB_REAL_MISS_EXITS);
+			kvmppc_account_exit(vcpu, ITLB_REAL_MISS_EXITS);
 			break;
 		}
 
-		account_exit(vcpu, ITLB_VIRT_MISS_EXITS);
+		kvmppc_account_exit(vcpu, ITLB_VIRT_MISS_EXITS);
 
 		gtlbe = &vcpu_44x->guest_tlb[gtlb_index];
 		gpaddr = tlb_xlate(gtlbe, eaddr);
@@ -383,7 +383,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		mtspr(SPRN_DBSR, dbsr);
 
 		run->exit_reason = KVM_EXIT_DEBUG;
-		account_exit(vcpu, DEBUG_EXITS);
+		kvmppc_account_exit(vcpu, DEBUG_EXITS);
 		r = RESUME_HOST;
 		break;
 	}
@@ -404,7 +404,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		if (signal_pending(current)) {
 			run->exit_reason = KVM_EXIT_INTR;
 			r = (-EINTR << 2) | RESUME_HOST | (r & RESUME_FLAG_NV);
-			account_exit(vcpu, SIGNAL_EXITS);
+			kvmppc_account_exit(vcpu, SIGNAL_EXITS);
 		}
 	}
 
diff --git a/arch/powerpc/kvm/timing.c b/arch/powerpc/kvm/timing.c
index f42d2728a6a5..47ee603f558e 100644
--- a/arch/powerpc/kvm/timing.c
+++ b/arch/powerpc/kvm/timing.c
@@ -12,7 +12,7 @@
  * along with this program; if not, write to the Free Software
  * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  *
- * Copyright IBM Corp. 2007
+ * Copyright IBM Corp. 2008
  *
  * Authors: Hollis Blanchard <hollisb@us.ibm.com>
  *          Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com>
@@ -24,10 +24,11 @@
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
 
-#include "timing.h"
 #include <asm/time.h>
 #include <asm-generic/div64.h>
 
+#include "timing.h"
+
 void kvmppc_init_timing_stats(struct kvm_vcpu *vcpu)
 {
 	int i;
@@ -52,8 +53,7 @@ void kvmppc_init_timing_stats(struct kvm_vcpu *vcpu)
 	local_irq_enable();
 }
 
-static void add_exit_timing(struct kvm_vcpu *vcpu,
-					u64 duration, int type)
+static void add_exit_timing(struct kvm_vcpu *vcpu, u64 duration, int type)
 {
 	u64 old;
 
@@ -115,54 +115,46 @@ void kvmppc_update_timing_stats(struct kvm_vcpu *vcpu)
 }
 
 static const char *kvm_exit_names[__NUMBER_OF_KVM_EXIT_TYPES] = {
-	[MMIO_EXITS] = 			"MMIO",
-	[DCR_EXITS] =			"DCR",
-	[SIGNAL_EXITS] =		"SIGNAL",
-	[ITLB_REAL_MISS_EXITS] =	"ITLBREAL",
-	[ITLB_VIRT_MISS_EXITS] =	"ITLBVIRT",
-	[DTLB_REAL_MISS_EXITS] =	"DTLBREAL",
-	[DTLB_VIRT_MISS_EXITS] =	"DTLBVIRT",
-	[SYSCALL_EXITS] =		"SYSCALL",
-	[ISI_EXITS] =			"ISI",
-	[DSI_EXITS] =			"DSI",
-	[EMULATED_INST_EXITS] =		"EMULINST",
-	[EMULATED_MTMSRWE_EXITS] =	"EMUL_WAIT",
-	[EMULATED_WRTEE_EXITS] =	"EMUL_WRTEE",
-	[EMULATED_MTSPR_EXITS] =	"EMUL_MTSPR",
-	[EMULATED_MFSPR_EXITS] =	"EMUL_MFSPR",
-	[EMULATED_MTMSR_EXITS] =	"EMUL_MTMSR",
-	[EMULATED_MFMSR_EXITS] =	"EMUL_MFMSR",
-	[EMULATED_TLBSX_EXITS] =	"EMUL_TLBSX",
-	[EMULATED_TLBWE_EXITS] =	"EMUL_TLBWE",
-	[EMULATED_RFI_EXITS] =		"EMUL_RFI",
-	[DEC_EXITS] =			"DEC",
-	[EXT_INTR_EXITS] =		"EXTINT",
-	[HALT_WAKEUP] =			"HALT",
-	[USR_PR_INST] =			"USR_PR_INST",
-	[FP_UNAVAIL] =			"FP_UNAVAIL",
-	[DEBUG_EXITS] =			"DEBUG",
-	[TIMEINGUEST] =			"TIMEINGUEST"
+	[MMIO_EXITS] =              "MMIO",
+	[DCR_EXITS] =               "DCR",
+	[SIGNAL_EXITS] =            "SIGNAL",
+	[ITLB_REAL_MISS_EXITS] =    "ITLBREAL",
+	[ITLB_VIRT_MISS_EXITS] =    "ITLBVIRT",
+	[DTLB_REAL_MISS_EXITS] =    "DTLBREAL",
+	[DTLB_VIRT_MISS_EXITS] =    "DTLBVIRT",
+	[SYSCALL_EXITS] =           "SYSCALL",
+	[ISI_EXITS] =               "ISI",
+	[DSI_EXITS] =               "DSI",
+	[EMULATED_INST_EXITS] =     "EMULINST",
+	[EMULATED_MTMSRWE_EXITS] =  "EMUL_WAIT",
+	[EMULATED_WRTEE_EXITS] =    "EMUL_WRTEE",
+	[EMULATED_MTSPR_EXITS] =    "EMUL_MTSPR",
+	[EMULATED_MFSPR_EXITS] =    "EMUL_MFSPR",
+	[EMULATED_MTMSR_EXITS] =    "EMUL_MTMSR",
+	[EMULATED_MFMSR_EXITS] =    "EMUL_MFMSR",
+	[EMULATED_TLBSX_EXITS] =    "EMUL_TLBSX",
+	[EMULATED_TLBWE_EXITS] =    "EMUL_TLBWE",
+	[EMULATED_RFI_EXITS] =      "EMUL_RFI",
+	[DEC_EXITS] =               "DEC",
+	[EXT_INTR_EXITS] =          "EXTINT",
+	[HALT_WAKEUP] =             "HALT",
+	[USR_PR_INST] =             "USR_PR_INST",
+	[FP_UNAVAIL] =              "FP_UNAVAIL",
+	[DEBUG_EXITS] =             "DEBUG",
+	[TIMEINGUEST] =             "TIMEINGUEST"
 };
 
 static int kvmppc_exit_timing_show(struct seq_file *m, void *private)
 {
 	struct kvm_vcpu *vcpu = m->private;
 	int i;
-	u64 min, max;
+
+	seq_printf(m, "%s", "type	count	min	max	sum	sum_squared\n");
 
 	for (i = 0; i < __NUMBER_OF_KVM_EXIT_TYPES; i++) {
-		if (vcpu->arch.timing_min_duration[i] == 0xFFFFFFFF)
-			min = 0;
-		else
-			min = vcpu->arch.timing_min_duration[i];
-		if (vcpu->arch.timing_max_duration[i] == 0)
-			max = 0;
-		else
-			max = vcpu->arch.timing_max_duration[i];
-
-		seq_printf(m, "%12s: count %10d min %10lld "
-			"max %10lld sum %20lld sum_quad %20lld\n",
-			kvm_exit_names[i], vcpu->arch.timing_count_type[i],
+		seq_printf(m, "%12s	%10d	%10lld	%10lld	%20lld	%20lld\n",
+			kvm_exit_names[i],
+			vcpu->arch.timing_count_type[i],
 			vcpu->arch.timing_min_duration[i],
 			vcpu->arch.timing_max_duration[i],
 			vcpu->arch.timing_sum_duration[i],
@@ -171,31 +163,19 @@ static int kvmppc_exit_timing_show(struct seq_file *m, void *private)
 	return 0;
 }
 
+/* Write 'c' to clear the timing statistics. */
 static ssize_t kvmppc_exit_timing_write(struct file *file,
 				       const char __user *user_buf,
 				       size_t count, loff_t *ppos)
 {
-	size_t len;
-	int err;
-	const char __user *p;
+	int err = -EINVAL;
 	char c;
 
-	len = 0;
-	p = user_buf;
-	while (len < count) {
-		if (get_user(c, p++))
-			err = -EFAULT;
-		if (c == 0 || c == '\n')
-			break;
-		len++;
-	}
-
-	if (len > 1) {
-		err = -EINVAL;
+	if (count > 1) {
 		goto done;
 	}
 
-	if (copy_from_user(&c, user_buf, sizeof(c))) {
+	if (get_user(c, user_buf)) {
 		err = -EFAULT;
 		goto done;
 	}
@@ -203,16 +183,13 @@ static ssize_t kvmppc_exit_timing_write(struct file *file,
 	if (c == 'c') {
 		struct seq_file *seqf = (struct seq_file *)file->private_data;
 		struct kvm_vcpu *vcpu = seqf->private;
-		/* write does not affect out buffers previsously generated with
-		 * show. Seq file is locked here to prevent races of init with
+		/* Write does not affect our buffers previously generated with
+		 * show. seq_file is locked here to prevent races of init with
 		 * a show call */
 		mutex_lock(&seqf->lock);
 		kvmppc_init_timing_stats(vcpu);
 		mutex_unlock(&seqf->lock);
 		err = count;
-	} else {
-		err = -EINVAL;
-		goto done;
 	}
 
 done:
@@ -238,7 +215,7 @@ void kvmppc_create_vcpu_debugfs(struct kvm_vcpu *vcpu, unsigned int id)
 	static char dbg_fname[50];
 	struct dentry *debugfs_file;
 
-	snprintf(dbg_fname, sizeof(dbg_fname), "vm%u_vcpu%03u_timing",
+	snprintf(dbg_fname, sizeof(dbg_fname), "vm%u_vcpu%u_timing",
 		 current->pid, id);
 	debugfs_file = debugfs_create_file(dbg_fname, 0666,
 					kvm_debugfs_dir, vcpu,
diff --git a/arch/powerpc/kvm/timing.h b/arch/powerpc/kvm/timing.h
index 1af7181fa2b5..bb13b1f3cd5a 100644
--- a/arch/powerpc/kvm/timing.h
+++ b/arch/powerpc/kvm/timing.h
@@ -45,7 +45,7 @@ static inline void kvmppc_set_exit_type(struct kvm_vcpu *vcpu, int type) {}
 #endif /* CONFIG_KVM_EXIT_TIMING */
 
 /* account the exit in kvm_stats */
-static inline void account_exit_stat(struct kvm_vcpu *vcpu, int type)
+static inline void kvmppc_account_exit_stat(struct kvm_vcpu *vcpu, int type)
 {
 	/* type has to be known at build time for optimization */
 	BUILD_BUG_ON(__builtin_constant_p(type));
@@ -93,10 +93,10 @@ static inline void account_exit_stat(struct kvm_vcpu *vcpu, int type)
 }
 
 /* wrapper to set exit time and account for it in kvm_stats */
-static inline void account_exit(struct kvm_vcpu *vcpu, int type)
+static inline void kvmppc_account_exit(struct kvm_vcpu *vcpu, int type)
 {
 	kvmppc_set_exit_type(vcpu, type);
-	account_exit_stat(vcpu, type);
+	kvmppc_account_exit_stat(vcpu, type);
 }
 
 #endif /* __POWERPC_KVM_EXITTIMING_H__ */

From 6f89724829cfd4ad6771a92fd4b8d59c90c7220c Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@redhat.com>
Date: Wed, 3 Dec 2008 13:40:51 -0200
Subject: [PATCH 113/140] KVM: Really remove a slot when a user ask us so

Right now, KVM does not remove a slot when we do a
register ioctl for size 0 (would be the expected behaviour).

Instead, we only mark it as empty, but keep all bitmaps
and allocated data structures present. It completely
nullifies our chances of reusing that same slot again
for mapping a different piece of memory.

In this patch, we destroy rmaps, and vfree() the
pointers that used to hold the dirty bitmap, rmap
and lpage_info structures.

Signed-off-by: Glauber Costa <glommer@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e41d39d6b0ef..fd9cc79092cb 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1020,7 +1020,10 @@ int __kvm_set_memory_region(struct kvm *kvm,
 		goto out_free;
 	}
 
-	kvm_free_physmem_slot(&old, &new);
+	kvm_free_physmem_slot(&old, npages ? &new : NULL);
+	/* Slot deletion case: we have to update the current slot */
+	if (!npages)
+		*memslot = old;
 #ifdef CONFIG_DMAR
 	/* map the pages in iommu page table */
 	r = kvm_iommu_map_pages(kvm, base_gfn, npages);

From 45ed60b371aeae6ed80f7e9d594a5e6412edc176 Mon Sep 17 00:00:00 2001
From: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Date: Thu, 4 Dec 2008 14:25:38 +0100
Subject: [PATCH 114/140] KVM: x86 emulator: Extend the opcode descriptor

Extend the opcode descriptor to 32 bits. This is needed by the
introduction of a new Src2 operand type.

Signed-off-by: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86_emulate.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 69b330ba0ad0..7a07ca46c8ae 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -76,7 +76,7 @@ enum {
 	Group1A, Group3_Byte, Group3, Group4, Group5, Group7,
 };
 
-static u16 opcode_table[256] = {
+static u32 opcode_table[256] = {
 	/* 0x00 - 0x07 */
 	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
 	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -195,7 +195,7 @@ static u16 opcode_table[256] = {
 	ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
 };
 
-static u16 twobyte_table[256] = {
+static u32 twobyte_table[256] = {
 	/* 0x00 - 0x0F */
 	0, Group | GroupDual | Group7, 0, 0, 0, 0, ImplicitOps, 0,
 	ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0,
@@ -253,7 +253,7 @@ static u16 twobyte_table[256] = {
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 };
 
-static u16 group_table[] = {
+static u32 group_table[] = {
 	[Group1_80*8] =
 	ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
 	ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM,
@@ -297,7 +297,7 @@ static u16 group_table[] = {
 	SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp,
 };
 
-static u16 group2_table[] = {
+static u32 group2_table[] = {
 	[Group7*8] =
 	SrcNone | ModRM, 0, 0, 0,
 	SrcNone | ModRM | DstMem | Mov, 0,

From 0dc8d10f7d848b63c8d32cf6fd31ba7def792ac9 Mon Sep 17 00:00:00 2001
From: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Date: Thu, 4 Dec 2008 14:26:42 +0100
Subject: [PATCH 115/140] KVM: x86 emulator: add Src2 decode set

Instruction like shld has three operands, so we need to add a Src2
decode set. We start with Src2None, Src2CL, and Src2ImmByte, Src2One to
support shld/shrd and we will expand it later.

Signed-off-by: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_x86_emulate.h |  1 +
 arch/x86/kvm/x86_emulate.c             | 29 ++++++++++++++++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_x86_emulate.h
index 16a002655f31..6a159732881a 100644
--- a/arch/x86/include/asm/kvm_x86_emulate.h
+++ b/arch/x86/include/asm/kvm_x86_emulate.h
@@ -123,6 +123,7 @@ struct decode_cache {
 	u8 ad_bytes;
 	u8 rex_prefix;
 	struct operand src;
+	struct operand src2;
 	struct operand dst;
 	bool has_seg_override;
 	u8 seg_override;
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 7a07ca46c8ae..7f5cd62362c5 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -70,6 +70,12 @@
 #define Group       (1<<14)     /* Bits 3:5 of modrm byte extend opcode */
 #define GroupDual   (1<<15)     /* Alternate decoding of mod == 3 */
 #define GroupMask   0xff        /* Group number stored in bits 0:7 */
+/* Source 2 operand type */
+#define Src2None    (0<<29)
+#define Src2CL      (1<<29)
+#define Src2ImmByte (2<<29)
+#define Src2One     (3<<29)
+#define Src2Mask    (7<<29)
 
 enum {
 	Group1_80, Group1_81, Group1_82, Group1_83,
@@ -1000,6 +1006,29 @@ done_prefixes:
 		break;
 	}
 
+	/*
+	 * Decode and fetch the second source operand: register, memory
+	 * or immediate.
+	 */
+	switch (c->d & Src2Mask) {
+	case Src2None:
+		break;
+	case Src2CL:
+		c->src2.bytes = 1;
+		c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8;
+		break;
+	case Src2ImmByte:
+		c->src2.type = OP_IMM;
+		c->src2.ptr = (unsigned long *)c->eip;
+		c->src2.bytes = 1;
+		c->src2.val = insn_fetch(u8, 1, c->eip);
+		break;
+	case Src2One:
+		c->src2.bytes = 1;
+		c->src2.val = 1;
+		break;
+	}
+
 	/* Decode and fetch the destination operand: register or memory. */
 	switch (c->d & DstMask) {
 	case ImplicitOps:

From bfcadf83ec5aafe600e73dd427d997db7bcc1d12 Mon Sep 17 00:00:00 2001
From: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Date: Thu, 4 Dec 2008 14:27:38 +0100
Subject: [PATCH 116/140] KVM: x86 emulator: add a new "implied 1" Src decode
 type

Add SrcOne operand type when we need to decode an implied '1' like with
regular shift instruction

Signed-off-by: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86_emulate.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 7f5cd62362c5..0c75306e7a07 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -58,6 +58,7 @@
 #define SrcMem32    (4<<4)	/* Memory operand (32-bit). */
 #define SrcImm      (5<<4)	/* Immediate operand. */
 #define SrcImmByte  (6<<4)	/* 8-bit sign-extended immediate operand. */
+#define SrcOne      (7<<4)	/* Implied '1' */
 #define SrcMask     (7<<4)
 /* Generic ModRM decode. */
 #define ModRM       (1<<7)
@@ -1004,6 +1005,10 @@ done_prefixes:
 		c->src.bytes = 1;
 		c->src.val = insn_fetch(s8, 1, c->eip);
 		break;
+	case SrcOne:
+		c->src.bytes = 1;
+		c->src.val = 1;
+		break;
 	}
 
 	/*

From d175226a5f54817ba427368c6b739aefa7780fb2 Mon Sep 17 00:00:00 2001
From: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Date: Thu, 4 Dec 2008 14:29:00 +0100
Subject: [PATCH 117/140] KVM: x86 emulator: add the assembler code for three
 operands

Add the assembler code for instruction with three operands and one
operand is stored in ECX register

Signed-off-by: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86_emulate.c | 39 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 0c75306e7a07..9ae6d5b3e962 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -431,6 +431,45 @@ static u32 group2_table[] = {
 	__emulate_2op_nobyte(_op, _src, _dst, _eflags,			\
 			     "w", "r", _LO32, "r", "", "r")
 
+/* Instruction has three operands and one operand is stored in ECX register */
+#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) 	\
+	do {									\
+		unsigned long _tmp;						\
+		_type _clv  = (_cl).val;  					\
+		_type _srcv = (_src).val;    					\
+		_type _dstv = (_dst).val;					\
+										\
+		__asm__ __volatile__ (						\
+			_PRE_EFLAGS("0", "5", "2")				\
+			_op _suffix " %4,%1 \n"					\
+			_POST_EFLAGS("0", "5", "2")				\
+			: "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp)		\
+			: "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK)		\
+			); 							\
+										\
+		(_cl).val  = (unsigned long) _clv;				\
+		(_src).val = (unsigned long) _srcv;				\
+		(_dst).val = (unsigned long) _dstv;				\
+	} while (0)
+
+#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags)				\
+	do {									\
+		switch ((_dst).bytes) {						\
+		case 2:								\
+			__emulate_2op_cl(_op, _cl, _src, _dst, _eflags,  	\
+						"w", unsigned short);         	\
+			break;							\
+		case 4: 							\
+			__emulate_2op_cl(_op, _cl, _src, _dst, _eflags,  	\
+						"l", unsigned int);           	\
+			break;							\
+		case 8:								\
+			ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags,	\
+						"q", unsigned long));  		\
+			break;							\
+		}								\
+	} while (0)
+
 #define __emulate_1op(_op, _dst, _eflags, _suffix)			\
 	do {								\
 		unsigned long _tmp;					\

From 9bf8ea42fe22d7d1c48044148fa658cb9083d49c Mon Sep 17 00:00:00 2001
From: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Date: Thu, 4 Dec 2008 14:30:13 +0100
Subject: [PATCH 118/140] KVM: x86 emulator: add the emulation of shld and shrd
 instructions

Add emulation of shld and shrd instructions

Signed-off-by: Guillaume Thouvenin <guillaume.thouvenin@ext.bull.net>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86_emulate.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 9ae6d5b3e962..219dc3110bf1 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -237,9 +237,14 @@ static u32 twobyte_table[256] = {
 	/* 0x90 - 0x9F */
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	/* 0xA0 - 0xA7 */
-	0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
+	0, 0, 0, DstMem | SrcReg | ModRM | BitOp,
+	DstMem | SrcReg | Src2ImmByte | ModRM,
+	DstMem | SrcReg | Src2CL | ModRM, 0, 0,
 	/* 0xA8 - 0xAF */
-	0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, ModRM, 0,
+	0, 0, 0, DstMem | SrcReg | ModRM | BitOp,
+	DstMem | SrcReg | Src2ImmByte | ModRM,
+	DstMem | SrcReg | Src2CL | ModRM,
+	ModRM, 0,
 	/* 0xB0 - 0xB7 */
 	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
 	    DstMem | SrcReg | ModRM | BitOp,
@@ -2037,12 +2042,20 @@ twobyte_insn:
 		c->src.val &= (c->dst.bytes << 3) - 1;
 		emulate_2op_SrcV_nobyte("bt", c->src, c->dst, ctxt->eflags);
 		break;
+	case 0xa4: /* shld imm8, r, r/m */
+	case 0xa5: /* shld cl, r, r/m */
+		emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
+		break;
 	case 0xab:
 	      bts:		/* bts */
 		/* only subword offset */
 		c->src.val &= (c->dst.bytes << 3) - 1;
 		emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
 		break;
+	case 0xac: /* shrd imm8, r, r/m */
+	case 0xad: /* shrd cl, r, r/m */
+		emulate_2op_cl("shrd", c->src2, c->src, c->dst, ctxt->eflags);
+		break;
 	case 0xae:              /* clflush */
 		break;
 	case 0xb0 ... 0xb1:	/* cmpxchg */

From fbce554e940a983d005e29849636d0ef54b3eb18 Mon Sep 17 00:00:00 2001
From: Amit Shah <amit.shah@redhat.com>
Date: Thu, 4 Dec 2008 11:11:40 +0000
Subject: [PATCH 119/140] KVM: x86 emulator: Fix handling of VMMCALL
 instruction

The VMMCALL instruction doesn't get recognised and isn't processed
by the emulator.

This is seen on an Intel host that tries to execute the VMMCALL
instruction after a guest live migrates from an AMD host.

Signed-off-by: Amit Shah <amit.shah@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86_emulate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 219dc3110bf1..d174db7a3370 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -311,7 +311,7 @@ static u32 group_table[] = {
 
 static u32 group2_table[] = {
 	[Group7*8] =
-	SrcNone | ModRM, 0, 0, 0,
+	SrcNone | ModRM, 0, 0, SrcNone | ModRM,
 	SrcNone | ModRM | DstMem | Mov, 0,
 	SrcMem16 | ModRM | Mov, 0,
 };

From 60c8aec6e2c9923492dabbd6b67e34692bd26c20 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Mon, 1 Dec 2008 22:32:02 -0200
Subject: [PATCH 120/140] KVM: MMU: use page array in unsync walk

Instead of invoking the handler directly collect pages into
an array so the caller can work with it.

Simplifies TLB flush collapsing.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |   2 +-
 arch/x86/kvm/mmu.c              | 197 +++++++++++++++++++++++---------
 2 files changed, 142 insertions(+), 57 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f58f7ebdea81..93d0aed35880 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -200,7 +200,7 @@ struct kvm_mmu_page {
 	int multimapped;         /* More than one parent_pte? */
 	int root_count;          /* Currently serving as active root */
 	bool unsync;
-	bool unsync_children;
+	unsigned int unsync_children;
 	union {
 		u64 *parent_pte;               /* !multimapped */
 		struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index dd20b199a7c0..7ce92f78f337 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -908,8 +908,9 @@ static void kvm_mmu_update_unsync_bitmap(u64 *spte)
 	struct kvm_mmu_page *sp = page_header(__pa(spte));
 
 	index = spte - sp->spt;
-	__set_bit(index, sp->unsync_child_bitmap);
-	sp->unsync_children = 1;
+	if (!__test_and_set_bit(index, sp->unsync_child_bitmap))
+		sp->unsync_children++;
+	WARN_ON(!sp->unsync_children);
 }
 
 static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
@@ -936,7 +937,6 @@ static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
 
 static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
-	sp->unsync_children = 1;
 	kvm_mmu_update_parents_unsync(sp);
 	return 1;
 }
@@ -967,18 +967,41 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
 {
 }
 
+#define KVM_PAGE_ARRAY_NR 16
+
+struct kvm_mmu_pages {
+	struct mmu_page_and_offset {
+		struct kvm_mmu_page *sp;
+		unsigned int idx;
+	} page[KVM_PAGE_ARRAY_NR];
+	unsigned int nr;
+};
+
 #define for_each_unsync_children(bitmap, idx)		\
 	for (idx = find_first_bit(bitmap, 512);		\
 	     idx < 512;					\
 	     idx = find_next_bit(bitmap, 512, idx+1))
 
-static int mmu_unsync_walk(struct kvm_mmu_page *sp,
-			   struct kvm_unsync_walk *walker)
+int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
+		   int idx)
 {
-	int i, ret;
+	int i;
 
-	if (!sp->unsync_children)
-		return 0;
+	if (sp->unsync)
+		for (i=0; i < pvec->nr; i++)
+			if (pvec->page[i].sp == sp)
+				return 0;
+
+	pvec->page[pvec->nr].sp = sp;
+	pvec->page[pvec->nr].idx = idx;
+	pvec->nr++;
+	return (pvec->nr == KVM_PAGE_ARRAY_NR);
+}
+
+static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
+			   struct kvm_mmu_pages *pvec)
+{
+	int i, ret, nr_unsync_leaf = 0;
 
 	for_each_unsync_children(sp->unsync_child_bitmap, i) {
 		u64 ent = sp->spt[i];
@@ -988,17 +1011,22 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,
 			child = page_header(ent & PT64_BASE_ADDR_MASK);
 
 			if (child->unsync_children) {
-				ret = mmu_unsync_walk(child, walker);
-				if (ret)
+				if (mmu_pages_add(pvec, child, i))
+					return -ENOSPC;
+
+				ret = __mmu_unsync_walk(child, pvec);
+				if (!ret)
+					__clear_bit(i, sp->unsync_child_bitmap);
+				else if (ret > 0)
+					nr_unsync_leaf += ret;
+				else
 					return ret;
-				__clear_bit(i, sp->unsync_child_bitmap);
 			}
 
 			if (child->unsync) {
-				ret = walker->entry(child, walker);
-				__clear_bit(i, sp->unsync_child_bitmap);
-				if (ret)
-					return ret;
+				nr_unsync_leaf++;
+				if (mmu_pages_add(pvec, child, i))
+					return -ENOSPC;
 			}
 		}
 	}
@@ -1006,7 +1034,17 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,
 	if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
 		sp->unsync_children = 0;
 
-	return 0;
+	return nr_unsync_leaf;
+}
+
+static int mmu_unsync_walk(struct kvm_mmu_page *sp,
+			   struct kvm_mmu_pages *pvec)
+{
+	if (!sp->unsync_children)
+		return 0;
+
+	mmu_pages_add(pvec, sp, 0);
+	return __mmu_unsync_walk(sp, pvec);
 }
 
 static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
@@ -1056,30 +1094,81 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 	return 0;
 }
 
-struct sync_walker {
-	struct kvm_vcpu *vcpu;
-	struct kvm_unsync_walk walker;
+struct mmu_page_path {
+	struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
+	unsigned int idx[PT64_ROOT_LEVEL-1];
 };
 
-static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
-{
-	struct sync_walker *sync_walk = container_of(walk, struct sync_walker,
-						     walker);
-	struct kvm_vcpu *vcpu = sync_walk->vcpu;
+#define for_each_sp(pvec, sp, parents, i)			\
+		for (i = mmu_pages_next(&pvec, &parents, -1),	\
+			sp = pvec.page[i].sp;			\
+			i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});	\
+			i = mmu_pages_next(&pvec, &parents, i))
 
-	kvm_sync_page(vcpu, sp);
-	return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock));
+int mmu_pages_next(struct kvm_mmu_pages *pvec, struct mmu_page_path *parents,
+		   int i)
+{
+	int n;
+
+	for (n = i+1; n < pvec->nr; n++) {
+		struct kvm_mmu_page *sp = pvec->page[n].sp;
+
+		if (sp->role.level == PT_PAGE_TABLE_LEVEL) {
+			parents->idx[0] = pvec->page[n].idx;
+			return n;
+		}
+
+		parents->parent[sp->role.level-2] = sp;
+		parents->idx[sp->role.level-1] = pvec->page[n].idx;
+	}
+
+	return n;
 }
 
-static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+void mmu_pages_clear_parents(struct mmu_page_path *parents)
 {
-	struct sync_walker walker = {
-		.walker = { .entry = mmu_sync_fn, },
-		.vcpu = vcpu,
-	};
+	struct kvm_mmu_page *sp;
+	unsigned int level = 0;
 
-	while (mmu_unsync_walk(sp, &walker.walker))
+	do {
+		unsigned int idx = parents->idx[level];
+
+		sp = parents->parent[level];
+		if (!sp)
+			return;
+
+		--sp->unsync_children;
+		WARN_ON((int)sp->unsync_children < 0);
+		__clear_bit(idx, sp->unsync_child_bitmap);
+		level++;
+	} while (level < PT64_ROOT_LEVEL-1 && !sp->unsync_children);
+}
+
+static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
+			       struct mmu_page_path *parents,
+			       struct kvm_mmu_pages *pvec)
+{
+	parents->parent[parent->role.level-1] = NULL;
+	pvec->nr = 0;
+}
+
+static void mmu_sync_children(struct kvm_vcpu *vcpu,
+			      struct kvm_mmu_page *parent)
+{
+	int i;
+	struct kvm_mmu_page *sp;
+	struct mmu_page_path parents;
+	struct kvm_mmu_pages pages;
+
+	kvm_mmu_pages_init(parent, &parents, &pages);
+	while (mmu_unsync_walk(parent, &pages)) {
+		for_each_sp(pages, sp, parents, i) {
+			kvm_sync_page(vcpu, sp);
+			mmu_pages_clear_parents(&parents);
+		}
 		cond_resched_lock(&vcpu->kvm->mmu_lock);
+		kvm_mmu_pages_init(parent, &parents, &pages);
+	}
 }
 
 static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
@@ -1245,33 +1334,29 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
 	}
 }
 
-struct zap_walker {
-	struct kvm_unsync_walk walker;
-	struct kvm *kvm;
-	int zapped;
-};
-
-static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
+static int mmu_zap_unsync_children(struct kvm *kvm,
+				   struct kvm_mmu_page *parent)
 {
-	struct zap_walker *zap_walk = container_of(walk, struct zap_walker,
-						     walker);
-	kvm_mmu_zap_page(zap_walk->kvm, sp);
-	zap_walk->zapped = 1;
-	return 0;
-}
+	int i, zapped = 0;
+	struct mmu_page_path parents;
+	struct kvm_mmu_pages pages;
 
-static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
-	struct zap_walker walker = {
-		.walker = { .entry = mmu_zap_fn, },
-		.kvm = kvm,
-		.zapped = 0,
-	};
-
-	if (sp->role.level == PT_PAGE_TABLE_LEVEL)
+	if (parent->role.level == PT_PAGE_TABLE_LEVEL)
 		return 0;
-	mmu_unsync_walk(sp, &walker.walker);
-	return walker.zapped;
+
+	kvm_mmu_pages_init(parent, &parents, &pages);
+	while (mmu_unsync_walk(parent, &pages)) {
+		struct kvm_mmu_page *sp;
+
+		for_each_sp(pages, sp, parents, i) {
+			kvm_mmu_zap_page(kvm, sp);
+			mmu_pages_clear_parents(&parents);
+		}
+		zapped += pages.nr;
+		kvm_mmu_pages_init(parent, &parents, &pages);
+	}
+
+	return zapped;
 }
 
 static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)

From b1a368218ad5b6e62380c8f206f16e6f18bf154c Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Mon, 1 Dec 2008 22:32:03 -0200
Subject: [PATCH 121/140] KVM: MMU: collapse remote TLB flushes on root sync

Collapse remote TLB flushes on root sync.

kernbench is 2.7% faster on 4-way guest. Improvements have been seen
with other loads such as AIM7.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7ce92f78f337..58c35dead321 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -621,7 +621,7 @@ static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
 	return NULL;
 }
 
-static void rmap_write_protect(struct kvm *kvm, u64 gfn)
+static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 {
 	unsigned long *rmapp;
 	u64 *spte;
@@ -667,8 +667,7 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
 		spte = rmap_next(kvm, rmapp, spte);
 	}
 
-	if (write_protected)
-		kvm_flush_remote_tlbs(kvm);
+	return write_protected;
 }
 
 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
@@ -1083,7 +1082,8 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		return 1;
 	}
 
-	rmap_write_protect(vcpu->kvm, sp->gfn);
+	if (rmap_write_protect(vcpu->kvm, sp->gfn))
+		kvm_flush_remote_tlbs(vcpu->kvm);
 	kvm_unlink_unsync_page(vcpu->kvm, sp);
 	if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
 		kvm_mmu_zap_page(vcpu->kvm, sp);
@@ -1162,6 +1162,14 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
 
 	kvm_mmu_pages_init(parent, &parents, &pages);
 	while (mmu_unsync_walk(parent, &pages)) {
+		int protected = 0;
+
+		for_each_sp(pages, sp, parents, i)
+			protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
+
+		if (protected)
+			kvm_flush_remote_tlbs(vcpu->kvm);
+
 		for_each_sp(pages, sp, parents, i) {
 			kvm_sync_page(vcpu, sp);
 			mmu_pages_clear_parents(&parents);
@@ -1226,7 +1234,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	sp->role = role;
 	hlist_add_head(&sp->hash_link, bucket);
 	if (!metaphysical) {
-		rmap_write_protect(vcpu->kvm, gfn);
+		if (rmap_write_protect(vcpu->kvm, gfn))
+			kvm_flush_remote_tlbs(vcpu->kvm);
 		account_shadowed(vcpu->kvm, gfn);
 	}
 	if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)

From 6cffe8ca4a2adf1ac5003d9cad08fe4434d6eee0 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Mon, 1 Dec 2008 22:32:04 -0200
Subject: [PATCH 122/140] KVM: MMU: skip global pgtables on sync due to cr3
 switch

Skip syncing global pages on cr3 switch (but not on cr4/cr0). This is
important for Linux 32-bit guests with PAE, where the kmap page is
marked as global.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  6 ++++
 arch/x86/kvm/mmu.c              | 53 +++++++++++++++++++++++++++++----
 arch/x86/kvm/paging_tmpl.h      | 10 ++++---
 arch/x86/kvm/x86.c              |  4 +++
 4 files changed, 63 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 93d0aed35880..65b1ed295698 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -182,6 +182,8 @@ struct kvm_mmu_page {
 	struct list_head link;
 	struct hlist_node hash_link;
 
+	struct list_head oos_link;
+
 	/*
 	 * The following two entries are used to key the shadow page in the
 	 * hash table.
@@ -200,6 +202,7 @@ struct kvm_mmu_page {
 	int multimapped;         /* More than one parent_pte? */
 	int root_count;          /* Currently serving as active root */
 	bool unsync;
+	bool global;
 	unsigned int unsync_children;
 	union {
 		u64 *parent_pte;               /* !multimapped */
@@ -356,6 +359,7 @@ struct kvm_arch{
 	 */
 	struct list_head active_mmu_pages;
 	struct list_head assigned_dev_head;
+	struct list_head oos_global_pages;
 	struct dmar_domain *intel_iommu_domain;
 	struct kvm_pic *vpic;
 	struct kvm_ioapic *vioapic;
@@ -385,6 +389,7 @@ struct kvm_vm_stat {
 	u32 mmu_recycled;
 	u32 mmu_cache_miss;
 	u32 mmu_unsync;
+	u32 mmu_unsync_global;
 	u32 remote_tlb_flush;
 	u32 lpages;
 };
@@ -603,6 +608,7 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
 int kvm_mmu_load(struct kvm_vcpu *vcpu);
 void kvm_mmu_unload(struct kvm_vcpu *vcpu);
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
+void kvm_mmu_sync_global(struct kvm_vcpu *vcpu);
 
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 58c35dead321..cbac9e4b156f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -793,9 +793,11 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
 	sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
 	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
+	INIT_LIST_HEAD(&sp->oos_link);
 	ASSERT(is_empty_shadow_page(sp->spt));
 	bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
 	sp->multimapped = 0;
+	sp->global = 1;
 	sp->parent_pte = parent_pte;
 	--vcpu->kvm->arch.n_free_mmu_pages;
 	return sp;
@@ -1066,10 +1068,18 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
 	return NULL;
 }
 
+static void kvm_unlink_unsync_global(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+	list_del(&sp->oos_link);
+	--kvm->stat.mmu_unsync_global;
+}
+
 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	WARN_ON(!sp->unsync);
 	sp->unsync = 0;
+	if (sp->global)
+		kvm_unlink_unsync_global(kvm, sp);
 	--kvm->stat.mmu_unsync;
 }
 
@@ -1615,9 +1625,15 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		if (s->role.word != sp->role.word)
 			return 1;
 	}
-	kvm_mmu_mark_parents_unsync(vcpu, sp);
 	++vcpu->kvm->stat.mmu_unsync;
 	sp->unsync = 1;
+
+	if (sp->global) {
+		list_add(&sp->oos_link, &vcpu->kvm->arch.oos_global_pages);
+		++vcpu->kvm->stat.mmu_unsync_global;
+	} else
+		kvm_mmu_mark_parents_unsync(vcpu, sp);
+
 	mmu_convert_notrap(sp);
 	return 0;
 }
@@ -1643,12 +1659,21 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
 static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 		    unsigned pte_access, int user_fault,
 		    int write_fault, int dirty, int largepage,
-		    gfn_t gfn, pfn_t pfn, bool speculative,
+		    int global, gfn_t gfn, pfn_t pfn, bool speculative,
 		    bool can_unsync)
 {
 	u64 spte;
 	int ret = 0;
 	u64 mt_mask = shadow_mt_mask;
+	struct kvm_mmu_page *sp = page_header(__pa(shadow_pte));
+
+	if (!global && sp->global) {
+		sp->global = 0;
+		if (sp->unsync) {
+			kvm_unlink_unsync_global(vcpu->kvm, sp);
+			kvm_mmu_mark_parents_unsync(vcpu, sp);
+		}
+	}
 
 	/*
 	 * We don't set the accessed bit, since we sometimes want to see
@@ -1717,8 +1742,8 @@ set_pte:
 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 			 unsigned pt_access, unsigned pte_access,
 			 int user_fault, int write_fault, int dirty,
-			 int *ptwrite, int largepage, gfn_t gfn,
-			 pfn_t pfn, bool speculative)
+			 int *ptwrite, int largepage, int global,
+			 gfn_t gfn, pfn_t pfn, bool speculative)
 {
 	int was_rmapped = 0;
 	int was_writeble = is_writeble_pte(*shadow_pte);
@@ -1751,7 +1776,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 		}
 	}
 	if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
-		      dirty, largepage, gfn, pfn, speculative, true)) {
+		      dirty, largepage, global, gfn, pfn, speculative, true)) {
 		if (write_fault)
 			*ptwrite = 1;
 		kvm_x86_ops->tlb_flush(vcpu);
@@ -1808,7 +1833,7 @@ static int direct_map_entry(struct kvm_shadow_walk *_walk,
 	    || (walk->largepage && level == PT_DIRECTORY_LEVEL)) {
 		mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL,
 			     0, walk->write, 1, &walk->pt_write,
-			     walk->largepage, gfn, walk->pfn, false);
+			     walk->largepage, 0, gfn, walk->pfn, false);
 		++vcpu->stat.pf_fixed;
 		return 1;
 	}
@@ -1995,6 +2020,15 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 	}
 }
 
+static void mmu_sync_global(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_mmu_page *sp, *n;
+
+	list_for_each_entry_safe(sp, n, &kvm->arch.oos_global_pages, oos_link)
+		kvm_sync_page(vcpu, sp);
+}
+
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 {
 	spin_lock(&vcpu->kvm->mmu_lock);
@@ -2002,6 +2036,13 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 	spin_unlock(&vcpu->kvm->mmu_lock);
 }
 
+void kvm_mmu_sync_global(struct kvm_vcpu *vcpu)
+{
+	spin_lock(&vcpu->kvm->mmu_lock);
+	mmu_sync_global(vcpu);
+	spin_unlock(&vcpu->kvm->mmu_lock);
+}
+
 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
 {
 	return vaddr;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 84eee43bbe74..e644d81979b6 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -274,7 +274,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 		return;
 	kvm_get_pfn(pfn);
 	mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
-		     gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),
+		     gpte & PT_DIRTY_MASK, NULL, largepage,
+		     gpte & PT_GLOBAL_MASK, gpte_to_gfn(gpte),
 		     pfn, true);
 }
 
@@ -301,8 +302,9 @@ static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw,
 		mmu_set_spte(vcpu, sptep, access, gw->pte_access & access,
 			     sw->user_fault, sw->write_fault,
 			     gw->ptes[gw->level-1] & PT_DIRTY_MASK,
-			     sw->ptwrite, sw->largepage, gw->gfn, sw->pfn,
-			     false);
+			     sw->ptwrite, sw->largepage,
+			     gw->ptes[gw->level-1] & PT_GLOBAL_MASK,
+			     gw->gfn, sw->pfn, false);
 		sw->sptep = sptep;
 		return 1;
 	}
@@ -580,7 +582,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		nr_present++;
 		pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
 		set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
-			 is_dirty_pte(gpte), 0, gfn,
+			 is_dirty_pte(gpte), 0, gpte & PT_GLOBAL_MASK, gfn,
 			 spte_to_pfn(sp->spt[i]), true, false);
 	}
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7a2aeba0bfbd..774db00d2db6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -104,6 +104,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "mmu_recycled", VM_STAT(mmu_recycled) },
 	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
 	{ "mmu_unsync", VM_STAT(mmu_unsync) },
+	{ "mmu_unsync_global", VM_STAT(mmu_unsync_global) },
 	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
 	{ "largepages", VM_STAT(lpages) },
 	{ NULL }
@@ -315,6 +316,7 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 	kvm_x86_ops->set_cr0(vcpu, cr0);
 	vcpu->arch.cr0 = cr0;
 
+	kvm_mmu_sync_global(vcpu);
 	kvm_mmu_reset_context(vcpu);
 	return;
 }
@@ -358,6 +360,7 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 	}
 	kvm_x86_ops->set_cr4(vcpu, cr4);
 	vcpu->arch.cr4 = cr4;
+	kvm_mmu_sync_global(vcpu);
 	kvm_mmu_reset_context(vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr4);
@@ -4113,6 +4116,7 @@ struct  kvm *kvm_arch_create_vm(void)
 		return ERR_PTR(-ENOMEM);
 
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
+	INIT_LIST_HEAD(&kvm->arch.oos_global_pages);
 	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 
 	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */

From ad218f85e388e8ca816ff09d91c246cd014c53a8 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Mon, 1 Dec 2008 22:32:05 -0200
Subject: [PATCH 123/140] KVM: MMU: prepopulate the shadow on invlpg

If the guest executes invlpg, peek into the pagetable and attempt to
prepopulate the shadow entry.

Also stop dirty fault updates from interfering with the fork detector.

2% improvement on RHEL3/AIM7.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  3 ++-
 arch/x86/kvm/mmu.c              | 25 +++++++++++++------------
 arch/x86/kvm/paging_tmpl.h      | 25 ++++++++++++++++++++++++-
 arch/x86/kvm/x86.c              |  2 +-
 4 files changed, 40 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 65b1ed295698..97215a458e5f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -602,7 +602,8 @@ unsigned long segment_base(u16 selector);
 
 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-		       const u8 *new, int bytes);
+		       const u8 *new, int bytes,
+		       bool guest_initiated);
 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
 int kvm_mmu_load(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index cbac9e4b156f..863baf70506e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2441,7 +2441,8 @@ static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
 }
 
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-		       const u8 *new, int bytes)
+		       const u8 *new, int bytes,
+		       bool guest_initiated)
 {
 	gfn_t gfn = gpa >> PAGE_SHIFT;
 	struct kvm_mmu_page *sp;
@@ -2467,15 +2468,17 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	kvm_mmu_free_some_pages(vcpu);
 	++vcpu->kvm->stat.mmu_pte_write;
 	kvm_mmu_audit(vcpu, "pre pte write");
-	if (gfn == vcpu->arch.last_pt_write_gfn
-	    && !last_updated_pte_accessed(vcpu)) {
-		++vcpu->arch.last_pt_write_count;
-		if (vcpu->arch.last_pt_write_count >= 3)
-			flooded = 1;
-	} else {
-		vcpu->arch.last_pt_write_gfn = gfn;
-		vcpu->arch.last_pt_write_count = 1;
-		vcpu->arch.last_pte_updated = NULL;
+	if (guest_initiated) {
+		if (gfn == vcpu->arch.last_pt_write_gfn
+		    && !last_updated_pte_accessed(vcpu)) {
+			++vcpu->arch.last_pt_write_count;
+			if (vcpu->arch.last_pt_write_count >= 3)
+				flooded = 1;
+		} else {
+			vcpu->arch.last_pt_write_gfn = gfn;
+			vcpu->arch.last_pt_write_count = 1;
+			vcpu->arch.last_pte_updated = NULL;
+		}
 	}
 	index = kvm_page_table_hashfn(gfn);
 	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
@@ -2615,9 +2618,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
 
 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
 {
-	spin_lock(&vcpu->kvm->mmu_lock);
 	vcpu->arch.mmu.invlpg(vcpu, gva);
-	spin_unlock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_flush_tlb(vcpu);
 	++vcpu->stat.invlpg;
 }
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index e644d81979b6..d20640154216 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -82,6 +82,7 @@ struct shadow_walker {
 	int *ptwrite;
 	pfn_t pfn;
 	u64 *sptep;
+	gpa_t pte_gpa;
 };
 
 static gfn_t gpte_to_gfn(pt_element_t gpte)
@@ -222,7 +223,7 @@ walk:
 		if (ret)
 			goto walk;
 		pte |= PT_DIRTY_MASK;
-		kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
+		kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte), 0);
 		walker->ptes[walker->level - 1] = pte;
 	}
 
@@ -468,8 +469,15 @@ static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
 				      struct kvm_vcpu *vcpu, u64 addr,
 				      u64 *sptep, int level)
 {
+	struct shadow_walker *sw =
+		container_of(_sw, struct shadow_walker, walker);
 
 	if (level == PT_PAGE_TABLE_LEVEL) {
+		struct kvm_mmu_page *sp = page_header(__pa(sptep));
+
+		sw->pte_gpa = (sp->gfn << PAGE_SHIFT);
+		sw->pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
+
 		if (is_shadow_present_pte(*sptep))
 			rmap_remove(vcpu->kvm, sptep);
 		set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
@@ -482,11 +490,26 @@ static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
 
 static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 {
+	pt_element_t gpte;
 	struct shadow_walker walker = {
 		.walker = { .entry = FNAME(shadow_invlpg_entry), },
+		.pte_gpa = -1,
 	};
 
+	spin_lock(&vcpu->kvm->mmu_lock);
 	walk_shadow(&walker.walker, vcpu, gva);
+	spin_unlock(&vcpu->kvm->mmu_lock);
+	if (walker.pte_gpa == -1)
+		return;
+	if (kvm_read_guest_atomic(vcpu->kvm, walker.pte_gpa, &gpte,
+				  sizeof(pt_element_t)))
+		return;
+	if (is_present_pte(gpte) && (gpte & PT_ACCESSED_MASK)) {
+		if (mmu_topup_memory_caches(vcpu))
+			return;
+		kvm_mmu_pte_write(vcpu, walker.pte_gpa, (const u8 *)&gpte,
+				  sizeof(pt_element_t), 0);
+	}
 }
 
 static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 774db00d2db6..ba102879de33 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2046,7 +2046,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 	ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes);
 	if (ret < 0)
 		return 0;
-	kvm_mmu_pte_write(vcpu, gpa, val, bytes);
+	kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1);
 	return 1;
 }
 

From e93353c93a3ba4215633ce930784f40a4e94e3f9 Mon Sep 17 00:00:00 2001
From: Eduardo Habkost <ehabkost@redhat.com>
Date: Fri, 5 Dec 2008 18:36:45 -0200
Subject: [PATCH 124/140] x86: KVM guest: kvm_get_tsc_khz: return khz, not lpj

kvm_get_tsc_khz() currently returns the previously-calculated preset_lpj
value, but it is in loops-per-jiffy, not kHz. The current code works
correctly only when HZ=1000.

Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kernel/kvmclock.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index b38e801014e3..652fce6d2cce 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -89,17 +89,17 @@ static cycle_t kvm_clock_read(void)
  */
 static unsigned long kvm_get_tsc_khz(void)
 {
-	return preset_lpj;
+	struct pvclock_vcpu_time_info *src;
+	src = &per_cpu(hv_clock, 0);
+	return pvclock_tsc_khz(src);
 }
 
 static void kvm_get_preset_lpj(void)
 {
-	struct pvclock_vcpu_time_info *src;
 	unsigned long khz;
 	u64 lpj;
 
-	src = &per_cpu(hv_clock, 0);
-	khz = pvclock_tsc_khz(src);
+	khz = kvm_get_tsc_khz();
 
 	lpj = ((u64)khz * 1000);
 	do_div(lpj, HZ);

From e3a2a0d4e5ace731e60e2eff4fb7056ecb34adc1 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Tue, 2 Dec 2008 11:16:03 +0100
Subject: [PATCH 125/140] anon_inodes: use fops->owner for module refcount

There is an imbalance for anonymous inodes. If the fops->owner field is set,
the module reference count of owner is decreases on release.
("filp_close" --> "__fput" ---> "fops_put")

On the other hand, anon_inode_getfd does not increase the module reference
count of owner. This causes two problems:

- if owner is set, the module refcount goes negative
- if owner is not set, the module can be unloaded while code is running

This patch changes anon_inode_getfd to be symmetric regarding fops->owner
handling.

I have checked all existing users of anon_inode_getfd. Noone sets fops->owner,
thats why nobody has seen the module refcount negative. The refcounting was
tested with a patched and unpatched KVM module.(see patch 2/2) I also did an
epoll_open/close test.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Reviewed-by: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 fs/anon_inodes.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index c16d9be1b017..3bbdb9d02376 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -79,9 +79,12 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops,
 	if (IS_ERR(anon_inode_inode))
 		return -ENODEV;
 
+	if (fops->owner && !try_module_get(fops->owner))
+		return -ENOENT;
+
 	error = get_unused_fd_flags(flags);
 	if (error < 0)
-		return error;
+		goto err_module;
 	fd = error;
 
 	/*
@@ -128,6 +131,8 @@ err_dput:
 	dput(dentry);
 err_put_unused_fd:
 	put_unused_fd(fd);
+err_module:
+	module_put(fops->owner);
 	return error;
 }
 EXPORT_SYMBOL_GPL(anon_inode_getfd);

From 3d3aab1b973b01bd2a1aa46307e94a1380b1d802 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Tue, 2 Dec 2008 11:17:32 +0100
Subject: [PATCH 126/140] KVM: set owner of cpu and vm file operations

There is a race between a "close of the file descriptors" and module
unload in the kvm module.

You can easily trigger this problem by applying this debug patch:
>--- kvm.orig/virt/kvm/kvm_main.c
>+++ kvm/virt/kvm/kvm_main.c
>@@ -648,10 +648,14 @@ void kvm_free_physmem(struct kvm *kvm)
>                kvm_free_physmem_slot(&kvm->memslots[i], NULL);
> }
>
>+#include <linux/delay.h>
> static void kvm_destroy_vm(struct kvm *kvm)
> {
>        struct mm_struct *mm = kvm->mm;
>
>+       printk("off1\n");
>+       msleep(5000);
>+       printk("off2\n");
>        spin_lock(&kvm_lock);
>        list_del(&kvm->vm_list);
>        spin_unlock(&kvm_lock);

and killing the userspace, followed by an rmmod.

The problem is that kvm_destroy_vm can run while the module count
is 0. That means, you can remove the module while kvm_destroy_vm
is running. But kvm_destroy_vm is part of the module text. This
causes a kerneloops. The race exists without the msleep but is much
harder to trigger.

This patch requires the fix for anon_inodes (anon_inodes: use fops->owner
for module refcount).
With this patch, we can set the owner of all anonymous KVM inodes file
operations. The VFS will then control the KVM module refcount as long as there
is an open file. kvm_destroy_vm will be called by the release function of the
last closed file - before the VFS drops the module refcount.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index fd9cc79092cb..484c903544f4 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1498,7 +1498,7 @@ static int kvm_vcpu_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-static const struct file_operations kvm_vcpu_fops = {
+static struct file_operations kvm_vcpu_fops = {
 	.release        = kvm_vcpu_release,
 	.unlocked_ioctl = kvm_vcpu_ioctl,
 	.compat_ioctl   = kvm_vcpu_ioctl,
@@ -1892,7 +1892,7 @@ static int kvm_vm_mmap(struct file *file, struct vm_area_struct *vma)
 	return 0;
 }
 
-static const struct file_operations kvm_vm_fops = {
+static struct file_operations kvm_vm_fops = {
 	.release        = kvm_vm_release,
 	.unlocked_ioctl = kvm_vm_ioctl,
 	.compat_ioctl   = kvm_vm_ioctl,
@@ -2256,6 +2256,8 @@ int kvm_init(void *opaque, unsigned int vcpu_size,
 	}
 
 	kvm_chardev_ops.owner = module;
+	kvm_vm_fops.owner = module;
+	kvm_vcpu_fops.owner = module;
 
 	r = misc_register(&kvm_dev);
 	if (r) {

From 498468961ed6f62a306eb90c49125776c526fa40 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 8 Dec 2008 20:26:24 +1030
Subject: [PATCH 127/140] KVM: Extract core of
 kvm_flush_remote_tlbs/kvm_reload_remote_mmus

Avi said:
> Wow, code duplication from Rusty. Things must be bad.

Something about glass houses comes to mind.  But instead, a patch.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 44 +++++++++++++++-----------------------------
 1 file changed, 15 insertions(+), 29 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 484c903544f4..ba4275d06fb7 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -552,10 +552,11 @@ static void ack_flush(void *_completed)
 {
 }
 
-void kvm_flush_remote_tlbs(struct kvm *kvm)
+static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
 {
 	int i, cpu, me;
 	cpumask_t cpus;
+	bool called = false;
 	struct kvm_vcpu *vcpu;
 
 	me = get_cpu();
@@ -564,46 +565,31 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
 		vcpu = kvm->vcpus[i];
 		if (!vcpu)
 			continue;
-		if (test_and_set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
+		if (test_and_set_bit(req, &vcpu->requests))
 			continue;
 		cpu = vcpu->cpu;
 		if (cpu != -1 && cpu != me)
 			cpu_set(cpu, cpus);
 	}
-	if (cpus_empty(cpus))
-		goto out;
-	++kvm->stat.remote_tlb_flush;
-	smp_call_function_mask(cpus, ack_flush, NULL, 1);
-out:
+	if (!cpus_empty(cpus)) {
+		smp_call_function_mask(cpus, ack_flush, NULL, 1);
+		called = true;
+	}
 	put_cpu();
+	return called;
+}
+
+void kvm_flush_remote_tlbs(struct kvm *kvm)
+{
+	if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
+		++kvm->stat.remote_tlb_flush;
 }
 
 void kvm_reload_remote_mmus(struct kvm *kvm)
 {
-	int i, cpu, me;
-	cpumask_t cpus;
-	struct kvm_vcpu *vcpu;
-
-	me = get_cpu();
-	cpus_clear(cpus);
-	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-		vcpu = kvm->vcpus[i];
-		if (!vcpu)
-			continue;
-		if (test_and_set_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
-			continue;
-		cpu = vcpu->cpu;
-		if (cpu != -1 && cpu != me)
-			cpu_set(cpu, cpus);
-	}
-	if (cpus_empty(cpus))
-		goto out;
-	smp_call_function_mask(cpus, ack_flush, NULL, 1);
-out:
-	put_cpu();
+	make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
 }
 
-
 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 {
 	struct page *page;

From 6ef7a1bc45f80fe0a263119d404688c596ea5031 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 8 Dec 2008 20:28:04 +1030
Subject: [PATCH 128/140] KVM: use modern cpumask primitives, no cpumask_t on
 stack

We're getting rid on on-stack cpumasks for large NR_CPUS.

1) Use cpumask_var_t/alloc_cpumask_var.
2) smp_call_function_mask -> smp_call_function_many
3) cpus_clear, cpus_empty, cpu_set -> cpumask_clear, cpumask_empty,
   cpumask_set_cpu.

This actually generates slightly smaller code than the old one with
CONFIG_CPUMASKS_OFFSTACK=n.  (gcc knows that cpus cannot be NULL in
that case, where cpumask_var_t is cpumask_t[1]).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ba4275d06fb7..2d6ca7968d65 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -555,12 +555,14 @@ static void ack_flush(void *_completed)
 static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
 {
 	int i, cpu, me;
-	cpumask_t cpus;
-	bool called = false;
+	cpumask_var_t cpus;
+	bool called = true;
 	struct kvm_vcpu *vcpu;
 
+	if (alloc_cpumask_var(&cpus, GFP_ATOMIC))
+		cpumask_clear(cpus);
+
 	me = get_cpu();
-	cpus_clear(cpus);
 	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
 		vcpu = kvm->vcpus[i];
 		if (!vcpu)
@@ -568,14 +570,17 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
 		if (test_and_set_bit(req, &vcpu->requests))
 			continue;
 		cpu = vcpu->cpu;
-		if (cpu != -1 && cpu != me)
-			cpu_set(cpu, cpus);
-	}
-	if (!cpus_empty(cpus)) {
-		smp_call_function_mask(cpus, ack_flush, NULL, 1);
-		called = true;
+		if (cpus != NULL && cpu != -1 && cpu != me)
+			cpumask_set_cpu(cpu, cpus);
 	}
+	if (unlikely(cpus == NULL))
+		smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1);
+	else if (!cpumask_empty(cpus))
+		smp_call_function_many(cpus, ack_flush, NULL, 1);
+	else
+		called = false;
 	put_cpu();
+	free_cpumask_var(cpus);
 	return called;
 }
 

From 7f59f492da722eb3551bbe1f8f4450a21896f05d Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Sun, 7 Dec 2008 21:25:45 +1030
Subject: [PATCH 129/140] KVM: use cpumask_var_t for cpus_hardware_enabled

This changes cpus_hardware_enabled from a cpumask_t to a cpumask_var_t:
equivalent for CONFIG_CPUMASKS_OFFSTACK=n, otherwise dynamically allocated.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2d6ca7968d65..e7644b90667e 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -70,7 +70,7 @@ module_param(msi2intx, bool, 0);
 DEFINE_SPINLOCK(kvm_lock);
 LIST_HEAD(vm_list);
 
-static cpumask_t cpus_hardware_enabled;
+static cpumask_var_t cpus_hardware_enabled;
 
 struct kmem_cache *kvm_vcpu_cache;
 EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
@@ -1965,9 +1965,9 @@ static void hardware_enable(void *junk)
 {
 	int cpu = raw_smp_processor_id();
 
-	if (cpu_isset(cpu, cpus_hardware_enabled))
+	if (cpumask_test_cpu(cpu, cpus_hardware_enabled))
 		return;
-	cpu_set(cpu, cpus_hardware_enabled);
+	cpumask_set_cpu(cpu, cpus_hardware_enabled);
 	kvm_arch_hardware_enable(NULL);
 }
 
@@ -1975,9 +1975,9 @@ static void hardware_disable(void *junk)
 {
 	int cpu = raw_smp_processor_id();
 
-	if (!cpu_isset(cpu, cpus_hardware_enabled))
+	if (!cpumask_test_cpu(cpu, cpus_hardware_enabled))
 		return;
-	cpu_clear(cpu, cpus_hardware_enabled);
+	cpumask_clear_cpu(cpu, cpus_hardware_enabled);
 	kvm_arch_hardware_disable(NULL);
 }
 
@@ -2211,9 +2211,14 @@ int kvm_init(void *opaque, unsigned int vcpu_size,
 
 	bad_pfn = page_to_pfn(bad_page);
 
+	if (!alloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
+		r = -ENOMEM;
+		goto out_free_0;
+	}
+
 	r = kvm_arch_hardware_setup();
 	if (r < 0)
-		goto out_free_0;
+		goto out_free_0a;
 
 	for_each_online_cpu(cpu) {
 		smp_call_function_single(cpu,
@@ -2277,6 +2282,8 @@ out_free_2:
 	on_each_cpu(hardware_disable, NULL, 1);
 out_free_1:
 	kvm_arch_hardware_unsetup();
+out_free_0a:
+	free_cpumask_var(cpus_hardware_enabled);
 out_free_0:
 	__free_page(bad_page);
 out:
@@ -2300,6 +2307,7 @@ void kvm_exit(void)
 	kvm_arch_hardware_unsetup();
 	kvm_arch_exit();
 	kvm_exit_debug();
+	free_cpumask_var(cpus_hardware_enabled);
 	__free_page(bad_page);
 }
 EXPORT_SYMBOL_GPL(kvm_exit);

From 1a811b6167089bcdb84284f2dc9fd0b4d0f1899d Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 8 Dec 2008 18:25:27 +0200
Subject: [PATCH 130/140] KVM: Advertise the bug in memory region destruction
 as fixed

Userspace might need to act differently.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 include/linux/kvm.h |  2 ++
 virt/kvm/kvm_main.c | 13 ++++++++++++-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 0997e6f5490c..48807767e726 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -395,6 +395,8 @@ struct kvm_trace_rec {
 #if defined(CONFIG_X86)
 #define KVM_CAP_DEVICE_MSI 20
 #endif
+/* Bug in KVM_SET_USER_MEMORY_REGION fixed: */
+#define KVM_CAP_DESTROY_MEMORY_REGION_WORKS 21
 
 /*
  * ioctls for VM fds
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e7644b90667e..e066eb125e55 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1905,6 +1905,17 @@ static int kvm_dev_ioctl_create_vm(void)
 	return fd;
 }
 
+static long kvm_dev_ioctl_check_extension_generic(long arg)
+{
+	switch (arg) {
+	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
+		return 1;
+	default:
+		break;
+	}
+	return kvm_dev_ioctl_check_extension(arg);
+}
+
 static long kvm_dev_ioctl(struct file *filp,
 			  unsigned int ioctl, unsigned long arg)
 {
@@ -1924,7 +1935,7 @@ static long kvm_dev_ioctl(struct file *filp,
 		r = kvm_dev_ioctl_create_vm();
 		break;
 	case KVM_CHECK_EXTENSION:
-		r = kvm_dev_ioctl_check_extension(arg);
+		r = kvm_dev_ioctl_check_extension_generic(arg);
 		break;
 	case KVM_GET_VCPU_MMAP_SIZE:
 		r = -EINVAL;

From ca9edaee1aea34ebd9adb48910aba0b3d64b1b22 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 8 Dec 2008 18:29:29 +0200
Subject: [PATCH 131/140] KVM: Consolidate userspace memory capability
 reporting into common code

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/kvm-ia64.c   | 1 -
 arch/powerpc/kvm/powerpc.c | 3 ---
 arch/s390/kvm/kvm-s390.c   | 2 --
 arch/x86/kvm/x86.c         | 1 -
 virt/kvm/kvm_main.c        | 1 +
 5 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index b4d24e2cce40..d2eb9691d61d 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -180,7 +180,6 @@ int kvm_dev_ioctl_check_extension(long ext)
 
 	switch (ext) {
 	case KVM_CAP_IRQCHIP:
-	case KVM_CAP_USER_MEMORY:
 	case KVM_CAP_MP_STATE:
 
 		r = 1;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 1deda37cb771..2822c8ccfaaf 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -137,9 +137,6 @@ int kvm_dev_ioctl_check_extension(long ext)
 	int r;
 
 	switch (ext) {
-	case KVM_CAP_USER_MEMORY:
-		r = 1;
-		break;
 	case KVM_CAP_COALESCED_MMIO:
 		r = KVM_COALESCED_MMIO_PAGE_OFFSET;
 		break;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 76f05ddaef10..be8497186b96 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -113,8 +113,6 @@ long kvm_arch_dev_ioctl(struct file *filp,
 int kvm_dev_ioctl_check_extension(long ext)
 {
 	switch (ext) {
-	case KVM_CAP_USER_MEMORY:
-		return 1;
 	default:
 		return 0;
 	}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ba102879de33..10302d3bd415 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -964,7 +964,6 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_IRQCHIP:
 	case KVM_CAP_HLT:
 	case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
-	case KVM_CAP_USER_MEMORY:
 	case KVM_CAP_SET_TSS_ADDR:
 	case KVM_CAP_EXT_CPUID:
 	case KVM_CAP_CLOCKSOURCE:
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e066eb125e55..eb70ca6c7145 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1908,6 +1908,7 @@ static int kvm_dev_ioctl_create_vm(void)
 static long kvm_dev_ioctl_check_extension_generic(long arg)
 {
 	switch (arg) {
+	case KVM_CAP_USER_MEMORY:
 	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
 		return 1;
 	default:

From eb64f1e8cd5c3cae912db30a77d062367f7a11a6 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 9 Dec 2008 16:07:22 +0100
Subject: [PATCH 132/140] KVM: MMU: check for present pdptr shadow page in
 walk_shadow

walk_shadow assumes the caller verified validity of the pdptr pointer in
question, which is not the case for the invlpg handler.

Fixes oops during Solaris 10 install.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 863baf70506e..641c07844e6e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1269,6 +1269,8 @@ static int walk_shadow(struct kvm_shadow_walk *walker,
 	if (level == PT32E_ROOT_LEVEL) {
 		shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
 		shadow_addr &= PT64_BASE_ADDR_MASK;
+		if (!shadow_addr)
+			return 1;
 		--level;
 	}
 

From defaf1587c5d7dff828f6f11c8941e5bcef00f50 Mon Sep 17 00:00:00 2001
From: Mark McLoughlin <markmc@redhat.com>
Date: Tue, 2 Dec 2008 12:16:33 +0000
Subject: [PATCH 133/140] KVM: fix handling of ACK from shared guest IRQ

If an assigned device shares a guest irq with an emulated
device then we currently interpret an ack generated by the
emulated device as originating from the assigned device
leading to e.g. "Unbalanced enable for IRQ 4347" from the
enable_irq() in kvm_assigned_dev_ack_irq().

The fix is fairly simple - don't enable the physical device
irq unless it was previously disabled.

Of course, this can still lead to a situation where a
non-assigned device ACK can cause the physical device irq to
be reenabled before the device was serviced. However, being
level sensitive, the interrupt will merely be regenerated.

Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 include/linux/kvm_host.h |  1 +
 virt/kvm/kvm_main.c      | 15 ++++++++++++++-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8091a4d90ddf..eafabd5c66b2 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -307,6 +307,7 @@ struct kvm_assigned_dev_kernel {
 	int host_busnr;
 	int host_devfn;
 	int host_irq;
+	bool host_irq_disabled;
 	int guest_irq;
 	struct msi_msg guest_msi;
 #define KVM_ASSIGNED_DEV_GUEST_INTX	(1 << 0)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index eb70ca6c7145..fc6127cbea1f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -170,6 +170,7 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
 				KVM_ASSIGNED_DEV_GUEST_MSI) {
 		assigned_device_msi_dispatch(assigned_dev);
 		enable_irq(assigned_dev->host_irq);
+		assigned_dev->host_irq_disabled = false;
 	}
 	mutex_unlock(&assigned_dev->kvm->lock);
 	kvm_put_kvm(assigned_dev->kvm);
@@ -181,8 +182,12 @@ static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
 		(struct kvm_assigned_dev_kernel *) dev_id;
 
 	kvm_get_kvm(assigned_dev->kvm);
+
 	schedule_work(&assigned_dev->interrupt_work);
+
 	disable_irq_nosync(irq);
+	assigned_dev->host_irq_disabled = true;
+
 	return IRQ_HANDLED;
 }
 
@@ -196,8 +201,16 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
 
 	dev = container_of(kian, struct kvm_assigned_dev_kernel,
 			   ack_notifier);
+
 	kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0);
-	enable_irq(dev->host_irq);
+
+	/* The guest irq may be shared so this ack may be
+	 * from another device.
+	 */
+	if (dev->host_irq_disabled) {
+		enable_irq(dev->host_irq);
+		dev->host_irq_disabled = false;
+	}
 }
 
 static void kvm_free_assigned_irq(struct kvm *kvm,

From 264ff01d55b456932cef03082448b41d2edeb6a1 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Mon, 24 Nov 2008 12:26:19 +0100
Subject: [PATCH 134/140] KVM: VMX: Fix pending NMI-vs.-IRQ race for user space
 irqchip

As with the kernel irqchip, don't allow an NMI to stomp over an already
injected IRQ; instead wait for the IRQ injection to be completed.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e446f232588e..487e1dcdce33 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2486,7 +2486,9 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
 	vmx_update_window_states(vcpu);
 
 	if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
-		if (vcpu->arch.nmi_window_open) {
+		if (vcpu->arch.interrupt.pending) {
+			enable_nmi_window(vcpu);
+		} else if (vcpu->arch.nmi_window_open) {
 			vcpu->arch.nmi_pending = false;
 			vcpu->arch.nmi_injected = true;
 		} else {

From 4531220b71f0399e71cda0c4cf749e7281a7416a Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Thu, 11 Dec 2008 16:54:54 +0100
Subject: [PATCH 135/140] KVM: x86: Rework user space NMI injection as
 KVM_CAP_USER_NMI

There is no point in doing the ready_for_nmi_injection/
request_nmi_window dance with user space. First, we don't do this for
in-kernel irqchip anyway, while the code path is the same as for user
space irqchip mode. And second, there is nothing to loose if a pending
NMI is overwritten by another one (in contrast to IRQs where we have to
save the number). Actually, there is even the risk of raising spurious
NMIs this way because the reason for the held-back NMI might already be
handled while processing the first one.

Therefore this patch creates a simplified user space NMI injection
interface, exporting it under KVM_CAP_USER_NMI and dropping the old
KVM_CAP_NMI capability. And this time we also take care to provide the
interface only on archs supporting NMIs via KVM (right now only x86).

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c  | 24 ++----------------------
 arch/x86/kvm/x86.c  | 28 ++--------------------------
 include/linux/kvm.h | 11 +++++------
 3 files changed, 9 insertions(+), 54 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 487e1dcdce33..6259d7467648 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2498,15 +2498,13 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
 	}
 	if (vcpu->arch.nmi_injected) {
 		vmx_inject_nmi(vcpu);
-		if (vcpu->arch.nmi_pending || kvm_run->request_nmi_window)
+		if (vcpu->arch.nmi_pending)
 			enable_nmi_window(vcpu);
 		else if (vcpu->arch.irq_summary
 			 || kvm_run->request_interrupt_window)
 			enable_irq_window(vcpu);
 		return;
 	}
-	if (!vcpu->arch.nmi_window_open || kvm_run->request_nmi_window)
-		enable_nmi_window(vcpu);
 
 	if (vcpu->arch.interrupt_window_open) {
 		if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
@@ -3040,14 +3038,6 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
 	++vcpu->stat.nmi_window_exits;
 
-	/*
-	 * If the user space waits to inject a NMI, exit as soon as possible
-	 */
-	if (kvm_run->request_nmi_window && !vcpu->arch.nmi_pending) {
-		kvm_run->exit_reason = KVM_EXIT_NMI_WINDOW_OPEN;
-		return 0;
-	}
-
 	return 1;
 }
 
@@ -3162,7 +3152,7 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 			vmx->soft_vnmi_blocked = 0;
 			vcpu->arch.nmi_window_open = 1;
 		} else if (vmx->vnmi_blocked_time > 1000000000LL &&
-		    (kvm_run->request_nmi_window || vcpu->arch.nmi_pending)) {
+			   vcpu->arch.nmi_pending) {
 			/*
 			 * This CPU don't support us in finding the end of an
 			 * NMI-blocked window if the guest runs with IRQs
@@ -3175,16 +3165,6 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 			vmx->soft_vnmi_blocked = 0;
 			vmx->vcpu.arch.nmi_window_open = 1;
 		}
-
-		/*
-		 * If the user space waits to inject an NNI, exit ASAP
-		 */
-		if (vcpu->arch.nmi_window_open && kvm_run->request_nmi_window
-		    && !vcpu->arch.nmi_pending) {
-			kvm_run->exit_reason = KVM_EXIT_NMI_WINDOW_OPEN;
-			++vcpu->stat.nmi_window_exits;
-			return 0;
-		}
 	}
 
 	if (exit_reason < kvm_vmx_max_exit_handlers
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 10302d3bd415..0e6aa8141dcd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2887,37 +2887,18 @@ static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
 		(kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
 }
 
-/*
- * Check if userspace requested a NMI window, and that the NMI window
- * is open.
- *
- * No need to exit to userspace if we already have a NMI queued.
- */
-static int dm_request_for_nmi_injection(struct kvm_vcpu *vcpu,
-					struct kvm_run *kvm_run)
-{
-	return (!vcpu->arch.nmi_pending &&
-		kvm_run->request_nmi_window &&
-		vcpu->arch.nmi_window_open);
-}
-
 static void post_kvm_run_save(struct kvm_vcpu *vcpu,
 			      struct kvm_run *kvm_run)
 {
 	kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
 	kvm_run->cr8 = kvm_get_cr8(vcpu);
 	kvm_run->apic_base = kvm_get_apic_base(vcpu);
-	if (irqchip_in_kernel(vcpu->kvm)) {
+	if (irqchip_in_kernel(vcpu->kvm))
 		kvm_run->ready_for_interrupt_injection = 1;
-		kvm_run->ready_for_nmi_injection = 1;
-	} else {
+	else
 		kvm_run->ready_for_interrupt_injection =
 					(vcpu->arch.interrupt_window_open &&
 					 vcpu->arch.irq_summary == 0);
-		kvm_run->ready_for_nmi_injection =
-					(vcpu->arch.nmi_window_open &&
-					 vcpu->arch.nmi_pending == 0);
-	}
 }
 
 static void vapic_enter(struct kvm_vcpu *vcpu)
@@ -3093,11 +3074,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		}
 
 		if (r > 0) {
-			if (dm_request_for_nmi_injection(vcpu, kvm_run)) {
-				r = -EINTR;
-				kvm_run->exit_reason = KVM_EXIT_NMI;
-				++vcpu->stat.request_nmi_exits;
-			}
 			if (dm_request_for_irq_injection(vcpu, kvm_run)) {
 				r = -EINTR;
 				kvm_run->exit_reason = KVM_EXIT_INTR;
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 48807767e726..35525ac63337 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -84,21 +84,18 @@ struct kvm_irqchip {
 #define KVM_EXIT_S390_RESET       14
 #define KVM_EXIT_DCR              15
 #define KVM_EXIT_NMI              16
-#define KVM_EXIT_NMI_WINDOW_OPEN  17
 
 /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */
 struct kvm_run {
 	/* in */
 	__u8 request_interrupt_window;
-	__u8 request_nmi_window;
-	__u8 padding1[6];
+	__u8 padding1[7];
 
 	/* out */
 	__u32 exit_reason;
 	__u8 ready_for_interrupt_injection;
 	__u8 if_flag;
-	__u8 ready_for_nmi_injection;
-	__u8 padding2;
+	__u8 padding2[2];
 
 	/* in (pre_kvm_run), out (post_kvm_run) */
 	__u64 cr8;
@@ -391,12 +388,14 @@ struct kvm_trace_rec {
 #define KVM_CAP_DEVICE_ASSIGNMENT 17
 #endif
 #define KVM_CAP_IOMMU 18
-#define KVM_CAP_NMI 19
 #if defined(CONFIG_X86)
 #define KVM_CAP_DEVICE_MSI 20
 #endif
 /* Bug in KVM_SET_USER_MEMORY_REGION fixed: */
 #define KVM_CAP_DESTROY_MEMORY_REGION_WORKS 21
+#if defined(CONFIG_X86)
+#define KVM_CAP_USER_NMI 22
+#endif
 
 /*
  * ioctls for VM fds

From 042b26edf0bc1b0f03238a71aed71cca4593848c Mon Sep 17 00:00:00 2001
From: Jes Sorensen <jes@sgi.com>
Date: Tue, 16 Dec 2008 16:45:47 +0100
Subject: [PATCH 136/140] KVM: ia64: Fix kvm_arch_vcpu_ioctl_[gs]et_regs()

Fix kvm_arch_vcpu_ioctl_[gs]et_regs() to do something meaningful on
ia64. Old versions could never have worked since they required
pointers to be set in the ioctl payload which were never being set by
the ioctl handler for get_regs.

In addition reserve extra space for future extensions.

The change of layout of struct kvm_regs doesn't require adding a new
CAP since get/set regs never worked on ia64 until now.

This version doesn't support copying the KVM kernel stack in/out of
the kernel. This should be implemented in a seperate ioctl call if
ever needed.

Signed-off-by: Jes Sorensen <jes@sgi.com>
Acked-by : Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/include/asm/kvm.h |  6 ++++--
 arch/ia64/kvm/kvm-ia64.c    | 40 ++++++++++---------------------------
 2 files changed, 14 insertions(+), 32 deletions(-)

diff --git a/arch/ia64/include/asm/kvm.h b/arch/ia64/include/asm/kvm.h
index f38472ac2267..68aa6da807c1 100644
--- a/arch/ia64/include/asm/kvm.h
+++ b/arch/ia64/include/asm/kvm.h
@@ -166,8 +166,6 @@ struct saved_vpd {
 };
 
 struct kvm_regs {
-	char *saved_guest;
-	char *saved_stack;
 	struct saved_vpd vpd;
 	/*Arch-regs*/
 	int mp_state;
@@ -200,6 +198,10 @@ struct kvm_regs {
 	unsigned long fp_psr;       /*used for lazy float register */
 	unsigned long saved_gp;
 	/*for phycial  emulation */
+
+	union context saved_guest;
+
+	unsigned long reserved[64];	/* for future use */
 };
 
 struct kvm_sregs {
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index d2eb9691d61d..0f5ebd948437 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -831,9 +831,8 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
 
 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
-	int i;
 	struct vpd *vpd = to_host(vcpu->kvm, vcpu->arch.vpd);
-	int r;
+	int i;
 
 	vcpu_load(vcpu);
 
@@ -850,18 +849,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 
 	vpd->vpr = regs->vpd.vpr;
 
-	r = -EFAULT;
-	r = copy_from_user(&vcpu->arch.guest, regs->saved_guest,
-						sizeof(union context));
-	if (r)
-		goto out;
-	r = copy_from_user(vcpu + 1, regs->saved_stack +
-			sizeof(struct kvm_vcpu),
-			KVM_STK_OFFSET - sizeof(struct kvm_vcpu));
-	if (r)
-		goto out;
-	vcpu->arch.exit_data =
-		((struct kvm_vcpu *)(regs->saved_stack))->arch.exit_data;
+	memcpy(&vcpu->arch.guest, &regs->saved_guest, sizeof(union context));
 
 	RESTORE_REGS(mp_state);
 	RESTORE_REGS(vmm_rr);
@@ -895,9 +883,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	set_bit(KVM_REQ_RESUME, &vcpu->requests);
 
 	vcpu_put(vcpu);
-	r = 0;
-out:
-	return r;
+
+	return 0;
 }
 
 long kvm_arch_vm_ioctl(struct file *filp,
@@ -1378,9 +1365,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
-	int i;
-	int r;
 	struct vpd *vpd = to_host(vcpu->kvm, vcpu->arch.vpd);
+	int i;
+
 	vcpu_load(vcpu);
 
 	for (i = 0; i < 16; i++) {
@@ -1395,14 +1382,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	regs->vpd.vpsr = vpd->vpsr;
 	regs->vpd.vpr = vpd->vpr;
 
-	r = -EFAULT;
-	r = copy_to_user(regs->saved_guest, &vcpu->arch.guest,
-					sizeof(union context));
-	if (r)
-		goto out;
-	r = copy_to_user(regs->saved_stack, (void *)vcpu, KVM_STK_OFFSET);
-	if (r)
-		goto out;
+	memcpy(&regs->saved_guest, &vcpu->arch.guest, sizeof(union context));
+
 	SAVE_REGS(mp_state);
 	SAVE_REGS(vmm_rr);
 	memcpy(regs->itrs, vcpu->arch.itrs, sizeof(struct thash_data) * NITRS);
@@ -1430,10 +1411,9 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	SAVE_REGS(metaphysical_saved_rr4);
 	SAVE_REGS(fp_psr);
 	SAVE_REGS(saved_gp);
+
 	vcpu_put(vcpu);
-	r = 0;
-out:
-	return r;
+	return 0;
 }
 
 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)

From fe634fd46ff643d98fdbcd153847e08c3c076e6e Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Wed, 17 Dec 2008 09:38:14 +0800
Subject: [PATCH 137/140] MAINTAINERS: Maintainership changes for kvm/ia64

Anthony Xu no longer works on kvm.

Cc:  "Luck, Tony"  <tony.luck@intel.com>
Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 MAINTAINERS | 2 --
 1 file changed, 2 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index ceb32ee51f9d..4209b7148758 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2541,8 +2541,6 @@ W:	http://kvm.qumranet.com
 S:	Supported
 
 KERNEL VIRTUAL MACHINE For Itanium (KVM/IA64)
-P:	Anthony Xu
-M:	anthony.xu@intel.com
 P:	Xiantao Zhang
 M:	xiantao.zhang@intel.com
 L:	kvm-ia64@vger.kernel.org

From 25e2343246fe135fce672f41abe61e9d2c38caac Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 21 Dec 2008 18:31:10 +0200
Subject: [PATCH 138/140] KVM: MMU: Don't treat a global pte as such if cr4.pge
 is cleared

The pte.g bit is meaningless if global pages are disabled; deferring
mmu page synchronization on these ptes will lead to the guest using stale
shadow ptes.

Fixes Vista x86 smp bootloader failure.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 641c07844e6e..d50ebac6a07f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1669,6 +1669,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 	u64 mt_mask = shadow_mt_mask;
 	struct kvm_mmu_page *sp = page_header(__pa(shadow_pte));
 
+	if (!(vcpu->arch.cr4 & X86_CR4_PGE))
+		global = 0;
 	if (!global && sp->global) {
 		sp->global = 0;
 		if (sp->unsync) {

From 3f353858c98dbe0240dac558a89870f4600f81bb Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@qumranet.com>
Date: Sun, 21 Dec 2008 22:48:32 +0200
Subject: [PATCH 139/140] KVM: Add locking to virtual i8259 interrupt
 controller

While most accesses to the i8259 are with the kvm mutex taken, the call
to kvm_pic_read_irq() is not.  We can't easily take the kvm mutex there
since the function is called with interrupts disabled.

Fix by adding a spinlock to the virtual interrupt controller.  Since we
can't send an IPI under the spinlock (we also take the same spinlock in
an irq disabled context), we defer the IPI until the spinlock is released.
Similarly, we defer irq ack notifications until after spinlock release to
avoid lock recursion.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/i8259.c | 52 ++++++++++++++++++++++++++++++++++++++++----
 arch/x86/kvm/irq.h   |  5 +++++
 2 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 17e41e165f1a..179dcb0103fd 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -26,10 +26,40 @@
  *   Port from Qemu.
  */
 #include <linux/mm.h>
+#include <linux/bitops.h>
 #include "irq.h"
 
 #include <linux/kvm_host.h>
 
+static void pic_lock(struct kvm_pic *s)
+{
+	spin_lock(&s->lock);
+}
+
+static void pic_unlock(struct kvm_pic *s)
+{
+	struct kvm *kvm = s->kvm;
+	unsigned acks = s->pending_acks;
+	bool wakeup = s->wakeup_needed;
+	struct kvm_vcpu *vcpu;
+
+	s->pending_acks = 0;
+	s->wakeup_needed = false;
+
+	spin_unlock(&s->lock);
+
+	while (acks) {
+		kvm_notify_acked_irq(kvm, __ffs(acks));
+		acks &= acks - 1;
+	}
+
+	if (wakeup) {
+		vcpu = s->kvm->vcpus[0];
+		if (vcpu)
+			kvm_vcpu_kick(vcpu);
+	}
+}
+
 static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
 {
 	s->isr &= ~(1 << irq);
@@ -136,17 +166,21 @@ static void pic_update_irq(struct kvm_pic *s)
 
 void kvm_pic_update_irq(struct kvm_pic *s)
 {
+	pic_lock(s);
 	pic_update_irq(s);
+	pic_unlock(s);
 }
 
 void kvm_pic_set_irq(void *opaque, int irq, int level)
 {
 	struct kvm_pic *s = opaque;
 
+	pic_lock(s);
 	if (irq >= 0 && irq < PIC_NUM_PINS) {
 		pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
 		pic_update_irq(s);
 	}
+	pic_unlock(s);
 }
 
 /*
@@ -172,6 +206,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
 	int irq, irq2, intno;
 	struct kvm_pic *s = pic_irqchip(kvm);
 
+	pic_lock(s);
 	irq = pic_get_irq(&s->pics[0]);
 	if (irq >= 0) {
 		pic_intack(&s->pics[0], irq);
@@ -196,6 +231,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
 		intno = s->pics[0].irq_base + irq;
 	}
 	pic_update_irq(s);
+	pic_unlock(s);
 	kvm_notify_acked_irq(kvm, irq);
 
 	return intno;
@@ -203,7 +239,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
 
 void kvm_pic_reset(struct kvm_kpic_state *s)
 {
-	int irq, irqbase;
+	int irq, irqbase, n;
 	struct kvm *kvm = s->pics_state->irq_request_opaque;
 	struct kvm_vcpu *vcpu0 = kvm->vcpus[0];
 
@@ -214,8 +250,10 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
 
 	for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
 		if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
-			if (s->irr & (1 << irq) || s->isr & (1 << irq))
-				kvm_notify_acked_irq(kvm, irq+irqbase);
+			if (s->irr & (1 << irq) || s->isr & (1 << irq)) {
+				n = irq + irqbase;
+				s->pics_state->pending_acks |= 1 << n;
+			}
 	}
 	s->last_irr = 0;
 	s->irr = 0;
@@ -406,6 +444,7 @@ static void picdev_write(struct kvm_io_device *this,
 			printk(KERN_ERR "PIC: non byte write\n");
 		return;
 	}
+	pic_lock(s);
 	switch (addr) {
 	case 0x20:
 	case 0x21:
@@ -418,6 +457,7 @@ static void picdev_write(struct kvm_io_device *this,
 		elcr_ioport_write(&s->pics[addr & 1], addr, data);
 		break;
 	}
+	pic_unlock(s);
 }
 
 static void picdev_read(struct kvm_io_device *this,
@@ -431,6 +471,7 @@ static void picdev_read(struct kvm_io_device *this,
 			printk(KERN_ERR "PIC: non byte read\n");
 		return;
 	}
+	pic_lock(s);
 	switch (addr) {
 	case 0x20:
 	case 0x21:
@@ -444,6 +485,7 @@ static void picdev_read(struct kvm_io_device *this,
 		break;
 	}
 	*(unsigned char *)val = data;
+	pic_unlock(s);
 }
 
 /*
@@ -459,7 +501,7 @@ static void pic_irq_request(void *opaque, int level)
 	s->output = level;
 	if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
 		s->pics[0].isr_ack &= ~(1 << irq);
-		kvm_vcpu_kick(vcpu);
+		s->wakeup_needed = true;
 	}
 }
 
@@ -469,6 +511,8 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
 	s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
 	if (!s)
 		return NULL;
+	spin_lock_init(&s->lock);
+	s->kvm = kvm;
 	s->pics[0].elcr_mask = 0xf8;
 	s->pics[1].elcr_mask = 0xde;
 	s->irq_request = pic_irq_request;
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index b9e9051650ea..2bf32a03ceec 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -25,6 +25,7 @@
 #include <linux/mm_types.h>
 #include <linux/hrtimer.h>
 #include <linux/kvm_host.h>
+#include <linux/spinlock.h>
 
 #include "iodev.h"
 #include "ioapic.h"
@@ -59,6 +60,10 @@ struct kvm_kpic_state {
 };
 
 struct kvm_pic {
+	spinlock_t lock;
+	bool wakeup_needed;
+	unsigned pending_acks;
+	struct kvm *kvm;
 	struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
 	irq_request_func *irq_request;
 	void *irq_request_opaque;

From 87917239204d67a316cb89751750f86c9ed3640b Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Mon, 22 Dec 2008 18:49:30 -0200
Subject: [PATCH 140/140] KVM: MMU: handle large host sptes on invlpg/resync

The invlpg and sync walkers lack knowledge of large host sptes,
descending to non-existant pagetable level.

Stop at directory level in such case.

Fixes SMP Windows XP with hugepages.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c         | 2 +-
 arch/x86/kvm/paging_tmpl.h | 9 +++++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d50ebac6a07f..83f11c7474a1 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1007,7 +1007,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
 	for_each_unsync_children(sp->unsync_child_bitmap, i) {
 		u64 ent = sp->spt[i];
 
-		if (is_shadow_present_pte(ent)) {
+		if (is_shadow_present_pte(ent) && !is_large_pte(ent)) {
 			struct kvm_mmu_page *child;
 			child = page_header(ent & PT64_BASE_ADDR_MASK);
 
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index d20640154216..9fd78b6e17ad 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -472,14 +472,19 @@ static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
 	struct shadow_walker *sw =
 		container_of(_sw, struct shadow_walker, walker);
 
-	if (level == PT_PAGE_TABLE_LEVEL) {
+	/* FIXME: properly handle invlpg on large guest pages */
+	if (level == PT_PAGE_TABLE_LEVEL ||
+	    ((level == PT_DIRECTORY_LEVEL) && is_large_pte(*sptep))) {
 		struct kvm_mmu_page *sp = page_header(__pa(sptep));
 
 		sw->pte_gpa = (sp->gfn << PAGE_SHIFT);
 		sw->pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
 
-		if (is_shadow_present_pte(*sptep))
+		if (is_shadow_present_pte(*sptep)) {
 			rmap_remove(vcpu->kvm, sptep);
+			if (is_large_pte(*sptep))
+				--vcpu->kvm->stat.lpages;
+		}
 		set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
 		return 1;
 	}