From 817fa998362d6ea9fabd5e97af8e9e2eb5f0e6f2 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 1 Jun 2023 18:01:37 -0700
Subject: [PATCH 1/5] KVM: x86/mmu: Grab memslot for correct address space in
 NX recovery worker

Factor in the address space (non-SMM vs. SMM) of the target shadow page
when recovering potential NX huge pages, otherwise KVM will retrieve the
wrong memslot when zapping shadow pages that were created for SMM.  The
bug most visibly manifests as a WARN on the memslot being non-NULL, but
the worst case scenario is that KVM could unaccount the shadow page
without ensuring KVM won't install a huge page, i.e. if the non-SMM slot
is being dirty logged, but the SMM slot is not.

 ------------[ cut here ]------------
 WARNING: CPU: 1 PID: 3911 at arch/x86/kvm/mmu/mmu.c:7015
 kvm_nx_huge_page_recovery_worker+0x38c/0x3d0 [kvm]
 CPU: 1 PID: 3911 Comm: kvm-nx-lpage-re
 RIP: 0010:kvm_nx_huge_page_recovery_worker+0x38c/0x3d0 [kvm]
 RSP: 0018:ffff99b284f0be68 EFLAGS: 00010246
 RAX: 0000000000000000 RBX: ffff99b284edd000 RCX: 0000000000000000
 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
 RBP: ffff9271397024e0 R08: 0000000000000000 R09: ffff927139702450
 R10: 0000000000000000 R11: 0000000000000001 R12: ffff99b284f0be98
 R13: 0000000000000000 R14: ffff9270991fcd80 R15: 0000000000000003
 FS:  0000000000000000(0000) GS:ffff927f9f640000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 00007f0aacad3ae0 CR3: 000000088fc2c005 CR4: 00000000003726e0
 Call Trace:
  <TASK>
__pfx_kvm_nx_huge_page_recovery_worker+0x10/0x10 [kvm]
  kvm_vm_worker_thread+0x106/0x1c0 [kvm]
  kthread+0xd9/0x100
  ret_from_fork+0x2c/0x50
  </TASK>
 ---[ end trace 0000000000000000 ]---

This bug was exposed by commit edbdb43fc96b ("KVM: x86: Preserve TDP MMU
roots until they are explicitly invalidated"), which allowed KVM to retain
SMM TDP MMU roots effectively indefinitely.  Before commit edbdb43fc96b,
KVM would zap all SMM TDP MMU roots and thus all SMM TDP MMU shadow pages
once all vCPUs exited SMM, which made the window where this bug (recovering
an SMM NX huge page) could be encountered quite tiny.  To hit the bug, the
NX recovery thread would have to run while at least one vCPU was in SMM.
Most VMs typically only use SMM during boot, and so the problematic shadow
pages were gone by the time the NX recovery thread ran.

Now that KVM preserves TDP MMU roots until they are explicitly invalidated
(e.g. by a memslot deletion), the window to trigger the bug is effectively
never closed because most VMMs don't delete memslots after boot (except
for a handful of special scenarios).

Fixes: eb298605705a ("KVM: x86/mmu: Do not recover dirty-tracked NX Huge Pages")
Reported-by: Fabio Coatti <fabio.coatti@gmail.com>
Closes: https://lore.kernel.org/all/CADpTngX9LESCdHVu_2mQkNGena_Ng2CphWNwsRGSMxzDsTjU2A@mail.gmail.com
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20230602010137.784664-1-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/mmu/mmu.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index c8961f45e3b1..6eaa3d6994ae 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -7091,7 +7091,10 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm)
 		 */
 		slot = NULL;
 		if (atomic_read(&kvm->nr_memslots_dirty_logging)) {
-			slot = gfn_to_memslot(kvm, sp->gfn);
+			struct kvm_memslots *slots;
+
+			slots = kvm_memslots_for_spte_role(kvm, sp->role);
+			slot = __gfn_to_memslot(slots, sp->gfn);
 			WARN_ON_ONCE(!slot);
 		}
 

From b2ce89978889b848a6dd652695dd1887e416f9d2 Mon Sep 17 00:00:00 2001
From: "Maciej S. Szmigiero" <maciej.szmigiero@oracle.com>
Date: Fri, 19 May 2023 13:26:18 +0200
Subject: [PATCH 2/5] KVM: SVM: vNMI pending bit is V_NMI_PENDING_MASK not
 V_NMI_BLOCKING_MASK

While testing Hyper-V enabled Windows Server 2019 guests on Zen4 hardware
I noticed that with vCPU count large enough (> 16) they sometimes froze at
boot.
With vCPU count of 64 they never booted successfully - suggesting some kind
of a race condition.

Since adding "vnmi=0" module parameter made these guests boot successfully
it was clear that the problem is most likely (v)NMI-related.

Running kvm-unit-tests quickly showed failing NMI-related tests cases, like
"multiple nmi" and "pending nmi" from apic-split, x2apic and xapic tests
and the NMI parts of eventinj test.

The issue was that once one NMI was being serviced no other NMI was allowed
to be set pending (NMI limit = 0), which was traced to
svm_is_vnmi_pending() wrongly testing for the "NMI blocked" flag rather
than for the "NMI pending" flag.

Fix this by testing for the right flag in svm_is_vnmi_pending().
Once this is done, the NMI-related kvm-unit-tests pass successfully and
the Windows guest no longer freezes at boot.

Fixes: fa4c027a7956 ("KVM: x86: Add support for SVM's Virtual NMI")
Signed-off-by: Maciej S. Szmigiero <maciej.szmigiero@oracle.com>
Reviewed-by: Sean Christopherson <seanjc@google.com>
Link: https://lore.kernel.org/r/be4ca192eb0c1e69a210db3009ca984e6a54ae69.1684495380.git.maciej.szmigiero@oracle.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/svm/svm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index ca32389f3c36..54089f990c8f 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -3510,7 +3510,7 @@ static bool svm_is_vnmi_pending(struct kvm_vcpu *vcpu)
 	if (!is_vnmi_enabled(svm))
 		return false;
 
-	return !!(svm->vmcb->control.int_ctl & V_NMI_BLOCKING_MASK);
+	return !!(svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK);
 }
 
 static bool svm_set_vnmi_pending(struct kvm_vcpu *vcpu)

From 8b703a49c9df5e74870381ad7ba9c85d8a74ed2c Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 1 Jun 2023 18:19:19 -0700
Subject: [PATCH 3/5] KVM: x86: Account fastpath-only VM-Exits in vCPU stats

Increment vcpu->stat.exits when handling a fastpath VM-Exit without
going through any part of the "slow" path.  Not bumping the exits stat
can result in wildly misleading exit counts, e.g. if the primary reason
the guest is exiting is to program the TSC deadline timer.

Fixes: 404d5d7bff0d ("KVM: X86: Introduce more exit_fastpath_completion enum values")
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20230602011920.787844-2-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/x86.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c0778ca39650..04b57a336b34 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10758,6 +10758,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
 			break;
 		}
+
+		/* Note, VM-Exits that go down the "slow" path are accounted below. */
+		++vcpu->stat.exits;
 	}
 
 	/*

From 4364b287982bd05bfafa461c80650c732001974b Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Fri, 2 Jun 2023 16:32:48 -0700
Subject: [PATCH 4/5] KVM: x86: Bail from kvm_recalculate_phys_map() if x2APIC
 ID is out-of-bounds

Bail from kvm_recalculate_phys_map() and disable the optimized map if the
target vCPU's x2APIC ID is out-of-bounds, i.e. if the vCPU was added
and/or enabled its local APIC after the map was allocated.  This fixes an
out-of-bounds access bug in the !x2apic_format path where KVM would write
beyond the end of phys_map.

Check the x2APIC ID regardless of whether or not x2APIC is enabled,
as KVM's hardcodes x2APIC ID to be the vCPU ID, i.e. it can't change, and
the map allocation in kvm_recalculate_apic_map() doesn't check for x2APIC
being enabled, i.e. the check won't get false postivies.

Note, this also affects the x2apic_format path, which previously just
ignored the "x2apic_id > new->max_apic_id" case.  That too is arguably a
bug fix, as ignoring the vCPU meant that KVM would not send interrupts to
the vCPU until the next map recalculation.  In practice, that "bug" is
likely benign as a newly present vCPU/APIC would immediately trigger a
recalc.  But, there's no functional downside to disabling the map, and
a future patch will gracefully handle the -E2BIG case by retrying instead
of simply disabling the optimized map.

Opportunistically add a sanity check on the xAPIC ID size, along with a
comment explaining why the xAPIC ID is guaranteed to be "good".

Reported-by: Michal Luczaj <mhal@rbox.co>
Fixes: 5b84b0291702 ("KVM: x86: Honor architectural behavior for aliased 8-bit APIC IDs")
Cc: stable@vger.kernel.org
Link: https://lore.kernel.org/r/20230602233250.1014316-2-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/lapic.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index e542cf285b51..3c300a196bdf 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -228,6 +228,23 @@ static int kvm_recalculate_phys_map(struct kvm_apic_map *new,
 	u32 xapic_id = kvm_xapic_id(apic);
 	u32 physical_id;
 
+	/*
+	 * For simplicity, KVM always allocates enough space for all possible
+	 * xAPIC IDs.  Yell, but don't kill the VM, as KVM can continue on
+	 * without the optimized map.
+	 */
+	if (WARN_ON_ONCE(xapic_id > new->max_apic_id))
+		return -EINVAL;
+
+	/*
+	 * Bail if a vCPU was added and/or enabled its APIC between allocating
+	 * the map and doing the actual calculations for the map.  Note, KVM
+	 * hardcodes the x2APIC ID to vcpu_id, i.e. there's no TOCTOU bug if
+	 * the compiler decides to reload x2apic_id after this check.
+	 */
+	if (x2apic_id > new->max_apic_id)
+		return -E2BIG;
+
 	/*
 	 * Deliberately truncate the vCPU ID when detecting a mismatched APIC
 	 * ID to avoid false positives if the vCPU ID, i.e. x2APIC ID, is a
@@ -253,8 +270,7 @@ static int kvm_recalculate_phys_map(struct kvm_apic_map *new,
 	 */
 	if (vcpu->kvm->arch.x2apic_format) {
 		/* See also kvm_apic_match_physical_addr(). */
-		if ((apic_x2apic_mode(apic) || x2apic_id > 0xff) &&
-			x2apic_id <= new->max_apic_id)
+		if (apic_x2apic_mode(apic) || x2apic_id > 0xff)
 			new->phys_map[x2apic_id] = apic;
 
 		if (!apic_x2apic_mode(apic) && !new->phys_map[xapic_id])

From 47d2804bc99ca873470df17c20737b28225a320d Mon Sep 17 00:00:00 2001
From: Michal Luczaj <mhal@rbox.co>
Date: Fri, 2 Jun 2023 16:32:50 -0700
Subject: [PATCH 5/5] KVM: selftests: Add test for race in
 kvm_recalculate_apic_map()

Keep switching between LAPIC_MODE_X2APIC and LAPIC_MODE_DISABLED during
APIC map construction to hunt for TOCTOU bugs in KVM.  KVM's optimized map
recalc makes multiple passes over the list of vCPUs, and the calculations
ignore vCPU's whose APIC is hardware-disabled, i.e. there's a window where
toggling LAPIC_MODE_DISABLED is quite interesting.

Signed-off-by: Michal Luczaj <mhal@rbox.co>
Co-developed-by: Sean Christopherson <seanjc@google.com>
Link: https://lore.kernel.org/r/20230602233250.1014316-4-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 tools/testing/selftests/kvm/Makefile          |  1 +
 .../kvm/x86_64/recalc_apic_map_test.c         | 74 +++++++++++++++++++
 2 files changed, 75 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/x86_64/recalc_apic_map_test.c

diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index 7a5ff646e7e7..4761b768b773 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -116,6 +116,7 @@ TEST_GEN_PROGS_x86_64 += x86_64/sev_migrate_tests
 TEST_GEN_PROGS_x86_64 += x86_64/amx_test
 TEST_GEN_PROGS_x86_64 += x86_64/max_vcpuid_cap_test
 TEST_GEN_PROGS_x86_64 += x86_64/triple_fault_event_test
+TEST_GEN_PROGS_x86_64 += x86_64/recalc_apic_map_test
 TEST_GEN_PROGS_x86_64 += access_tracking_perf_test
 TEST_GEN_PROGS_x86_64 += demand_paging_test
 TEST_GEN_PROGS_x86_64 += dirty_log_test
diff --git a/tools/testing/selftests/kvm/x86_64/recalc_apic_map_test.c b/tools/testing/selftests/kvm/x86_64/recalc_apic_map_test.c
new file mode 100644
index 000000000000..4c416ebe7d66
--- /dev/null
+++ b/tools/testing/selftests/kvm/x86_64/recalc_apic_map_test.c
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Test edge cases and race conditions in kvm_recalculate_apic_map().
+ */
+
+#include <sys/ioctl.h>
+#include <pthread.h>
+#include <time.h>
+
+#include "processor.h"
+#include "test_util.h"
+#include "kvm_util.h"
+#include "apic.h"
+
+#define TIMEOUT		5	/* seconds */
+
+#define LAPIC_DISABLED	0
+#define LAPIC_X2APIC	(MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE)
+#define MAX_XAPIC_ID	0xff
+
+static void *race(void *arg)
+{
+	struct kvm_lapic_state lapic = {};
+	struct kvm_vcpu *vcpu = arg;
+
+	while (1) {
+		/* Trigger kvm_recalculate_apic_map(). */
+		vcpu_ioctl(vcpu, KVM_SET_LAPIC, &lapic);
+		pthread_testcancel();
+	}
+
+	return NULL;
+}
+
+int main(void)
+{
+	struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
+	struct kvm_vcpu *vcpuN;
+	struct kvm_vm *vm;
+	pthread_t thread;
+	time_t t;
+	int i;
+
+	kvm_static_assert(KVM_MAX_VCPUS > MAX_XAPIC_ID);
+
+	/*
+	 * Create the max number of vCPUs supported by selftests so that KVM
+	 * has decent amount of work to do when recalculating the map, i.e. to
+	 * make the problematic window large enough to hit.
+	 */
+	vm = vm_create_with_vcpus(KVM_MAX_VCPUS, NULL, vcpus);
+
+	/*
+	 * Enable x2APIC on all vCPUs so that KVM doesn't bail from the recalc
+	 * due to vCPUs having aliased xAPIC IDs (truncated to 8 bits).
+	 */
+	for (i = 0; i < KVM_MAX_VCPUS; i++)
+		vcpu_set_msr(vcpus[i], MSR_IA32_APICBASE, LAPIC_X2APIC);
+
+	ASSERT_EQ(pthread_create(&thread, NULL, race, vcpus[0]), 0);
+
+	vcpuN = vcpus[KVM_MAX_VCPUS - 1];
+	for (t = time(NULL) + TIMEOUT; time(NULL) < t;) {
+		vcpu_set_msr(vcpuN, MSR_IA32_APICBASE, LAPIC_X2APIC);
+		vcpu_set_msr(vcpuN, MSR_IA32_APICBASE, LAPIC_DISABLED);
+	}
+
+	ASSERT_EQ(pthread_cancel(thread), 0);
+	ASSERT_EQ(pthread_join(thread, NULL), 0);
+
+	kvm_vm_free(vm);
+
+	return 0;
+}