Remove from guest code the handling of task migration during a
pvclock read; instead use the correct protocol in KVM. This removes
the need for task migration notifiers in core scheduler code.
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v2.0.22 (GNU/Linux)

iQEcBAABAgAGBQJVQkUWAAoJEL/70l94x66DhfcH/A8RTHUOELtoy+v2weahn21m
FFWEnEUlCWzYgmiddgFdlr6+ub386W3ryFsXKPqjrn/8LVv3yS7tK1NJF8d03LQw
n7HtIsrF01E9UI8CIWO4S/mUxWQev6vEJ9NXtNrsJcRmhSeLaIZkPjTH8Zqyx4i9
ZvG4731WHXmxvbJ03bfJU9Y8OwHXe55GMi614aTxPndVBGdvIRu2Oj6aTfQTeab/
7tEujub0MKWp74a7eyNU4GItcvIAXZCQt2wMc5dN1VK3ma5FTOnHIOuhAb8mACFF
qEeGhtxAnOf7W+s9J8i7zVBdA5MOS0vUKng361ZOVGDb0OLqcVADW7GpuTZfRAM=
=2A7v
-----END PGP SIGNATURE-----

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm changes from Paolo Bonzini:
 "Remove from guest code the handling of task migration during a
  pvclock read; instead use the correct protocol in KVM.

  This removes the need for task migration notifiers in core scheduler
  code"

[ The scheduler people really hated the migration notifiers, so this
  was kind of required - Linus ]

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  x86: pvclock: Really remove the sched notifier for cross-cpu migrations
  kvm: x86: fix kvmclock update protocol

commit 9dbbe3cfc3
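Before the diff, a minimal editorial sketch of the protocol at stake (not part
of the commit; struct and function names are illustrative). kvmclock publishes
per-vCPU time data through shared memory guarded by a seqcount-style version
field: the hypervisor makes the version odd while the data is in flux and even
once it is consistent, and the guest retries until the same even version
brackets its reads.

#include <stdint.h>

struct time_info {
        volatile uint32_t version;     /* odd => update in progress */
        volatile uint64_t system_time; /* stand-in for the payload  */
};

/* Hypervisor side: bracket the payload write with version bumps. */
static void publish(struct time_info *ti, uint64_t now)
{
        ti->version++;            /* odd: readers must retry      */
        __sync_synchronize();     /* version store before payload */
        ti->system_time = now;
        __sync_synchronize();     /* payload before final version */
        ti->version++;            /* even: data consistent again  */
}

/* Guest side: retry until the same even version is seen twice. */
static uint64_t read_time(const struct time_info *ti)
{
        uint32_t v;
        uint64_t t;

        do {
                v = ti->version;
                __sync_synchronize();
                t = ti->system_time;
                __sync_synchronize();
        } while ((v & 1) || v != ti->version);

        return t;
}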
arch/x86/include/asm/pvclock.h

@@ -95,7 +95,6 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
 
 struct pvclock_vsyscall_time_info {
         struct pvclock_vcpu_time_info pvti;
-        u32 migrate_count;
 } __attribute__((__aligned__(SMP_CACHE_BYTES)));
 
 #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
arch/x86/kernel/pvclock.c

@@ -141,46 +141,7 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
         set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
 }
 
-static struct pvclock_vsyscall_time_info *pvclock_vdso_info;
-
-static struct pvclock_vsyscall_time_info *
-pvclock_get_vsyscall_user_time_info(int cpu)
-{
-        if (!pvclock_vdso_info) {
-                BUG();
-                return NULL;
-        }
-
-        return &pvclock_vdso_info[cpu];
-}
-
-struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
-{
-        return &pvclock_get_vsyscall_user_time_info(cpu)->pvti;
-}
-
 #ifdef CONFIG_X86_64
-static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l,
-                        void *v)
-{
-        struct task_migration_notifier *mn = v;
-        struct pvclock_vsyscall_time_info *pvti;
-
-        pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu);
-
-        /* this is NULL when pvclock vsyscall is not initialized */
-        if (unlikely(pvti == NULL))
-                return NOTIFY_DONE;
-
-        pvti->migrate_count++;
-
-        return NOTIFY_DONE;
-}
-
-static struct notifier_block pvclock_migrate = {
-        .notifier_call = pvclock_task_migrate,
-};
-
 /*
  * Initialize the generic pvclock vsyscall state. This will allocate
  * a/some page(s) for the per-vcpu pvclock information, set up a
@@ -194,17 +155,12 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
 
         WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
 
-        pvclock_vdso_info = i;
-
         for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
                 __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
                              __pa(i) + (idx*PAGE_SIZE),
                              PAGE_KERNEL_VVAR);
         }
 
-
-        register_task_migration_notifier(&pvclock_migrate);
-
         return 0;
 }
 #endif
arch/x86/kvm/x86.c

@@ -1669,12 +1669,28 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
                 &guest_hv_clock, sizeof(guest_hv_clock))))
                 return 0;
 
-        /*
-         * The interface expects us to write an even number signaling that the
-         * update is finished. Since the guest won't see the intermediate
-         * state, we just increase by 2 at the end.
+        /* This VCPU is paused, but it's legal for a guest to read another
+         * VCPU's kvmclock, so we really have to follow the specification where
+         * it says that version is odd if data is being modified, and even after
+         * it is consistent.
+         *
+         * Version field updates must be kept separate.  This is because
+         * kvm_write_guest_cached might use a "rep movs" instruction, and
+         * writes within a string instruction are weakly ordered.  So there
+         * are three writes overall.
+         *
+         * As a small optimization, only write the version field in the first
+         * and third write.  The vcpu->pv_time cache is still valid, because the
+         * version field is the first in the struct.
          */
-        vcpu->hv_clock.version = guest_hv_clock.version + 2;
+        BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
+
+        vcpu->hv_clock.version = guest_hv_clock.version + 1;
+        kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+                                &vcpu->hv_clock,
+                                sizeof(vcpu->hv_clock.version));
+
+        smp_wmb();
 
         /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
         pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
@@ -1695,6 +1711,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
         kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
                                 &vcpu->hv_clock,
                                 sizeof(vcpu->hv_clock));
+
+        smp_wmb();
+
+        vcpu->hv_clock.version++;
+        kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+                                &vcpu->hv_clock,
+                                sizeof(vcpu->hv_clock.version));
         return 0;
 }
 
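The "three writes" comment above is the heart of the protocol fix, and it is
worth seeing in isolation. Below is an editorial user-space analogue (not the
kernel's code; pvclock_shared and publish3 are illustrative names, and memcpy
stands in for kvm_write_guest_cached): because a bulk copy may be implemented
with a weakly ordered string instruction such as "rep movs", the version field
has to be written on its own, once before and once after the payload copy.

#include <stdint.h>
#include <string.h>

struct pvclock_shared {
        uint32_t version;      /* first member, so a short write hits it alone
                                  (the role of the BUILD_BUG_ON above)         */
        uint64_t system_time;  /* stand-in for the rest of the payload         */
};

/*
 * 'guest' models the guest-visible copy; 'local' is the hypervisor's
 * private copy. Three writes, fenced, mirroring the sequence the hunks
 * above add to kvm_guest_time_update().
 */
static void publish3(struct pvclock_shared *guest,
                     struct pvclock_shared *local)
{
        local->version = guest->version + 1;          /* make version odd  */
        memcpy(guest, local, sizeof(local->version)); /* write 1: version  */
        __sync_synchronize();                         /* smp_wmb()         */
        memcpy(guest, local, sizeof(*local));         /* write 2: payload
                                                         (internally weakly
                                                         ordered)          */
        __sync_synchronize();                         /* smp_wmb()         */
        local->version++;                             /* back to even      */
        memcpy(guest, local, sizeof(local->version)); /* write 3: version  */
}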
arch/x86/vdso/vclock_gettime.c

@@ -82,15 +82,18 @@ static notrace cycle_t vread_pvclock(int *mode)
         cycle_t ret;
         u64 last;
         u32 version;
-        u32 migrate_count;
         u8 flags;
         unsigned cpu, cpu1;
 
 
         /*
-         * When looping to get a consistent (time-info, tsc) pair, we
-         * also need to deal with the possibility we can switch vcpus,
-         * so make sure we always re-fetch time-info for the current vcpu.
+         * Note: hypervisor must guarantee that:
+         * 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
+         * 2. that per-CPU pvclock time info is updated if the
+         *    underlying CPU changes.
+         * 3. that version is increased whenever underlying CPU
+         *    changes.
+         *
          */
         do {
                 cpu = __getcpu() & VGETCPU_CPU_MASK;
@@ -99,27 +102,20 @@ static notrace cycle_t vread_pvclock(int *mode)
                  * __getcpu() calls (Gleb).
                  */
 
-                /* Make sure migrate_count will change if we leave the VCPU. */
-                do {
-                        pvti = get_pvti(cpu);
-                        migrate_count = pvti->migrate_count;
-
-                        cpu1 = cpu;
-                        cpu = __getcpu() & VGETCPU_CPU_MASK;
-                } while (unlikely(cpu != cpu1));
+                pvti = get_pvti(cpu);
 
                 version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
 
                 /*
                  * Test we're still on the cpu as well as the version.
-                 * - We must read TSC of pvti's VCPU.
-                 * - KVM doesn't follow the versioning protocol, so data could
-                 *   change before version if we left the VCPU.
+                 * We could have been migrated just after the first
+                 * vgetcpu but before fetching the version, so we
+                 * wouldn't notice a version change.
                  */
-                smp_rmb();
-        } while (unlikely((pvti->pvti.version & 1) ||
-                          pvti->pvti.version != version ||
-                          pvti->migrate_count != migrate_count));
+                cpu1 = __getcpu() & VGETCPU_CPU_MASK;
+        } while (unlikely(cpu != cpu1 ||
+                          (pvti->pvti.version & 1) ||
+                          pvti->pvti.version != version));
 
         if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
                 *mode = VCLOCK_NONE;
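The rewritten loop above replaces migrate_count with a simpler discipline:
sample the CPU, read the time info, then re-sample the CPU after the reads and
retry if anything moved. An editorial user-space analogue of that discipline
(sched_getcpu() stands in for __getcpu(); per_cpu_pvti and read_pvclock are
hypothetical names, not kernel APIs):

#define _GNU_SOURCE
#include <sched.h>      /* sched_getcpu() */
#include <stdint.h>

struct pvti {
        volatile uint32_t version;
        volatile uint64_t system_time;
};

extern struct pvti per_cpu_pvti[];  /* hypothetical per-CPU array */

static uint64_t read_pvclock(void)
{
        unsigned cpu, cpu1;
        uint32_t version;
        uint64_t t;
        struct pvti *p;

        do {
                cpu = (unsigned)sched_getcpu();
                p = &per_cpu_pvti[cpu];

                version = p->version;
                __sync_synchronize();
                t = p->system_time;
                __sync_synchronize();

                /*
                 * Re-sample the CPU last: a migration between the first
                 * sched_getcpu() and the version read would otherwise go
                 * unnoticed, which is exactly the race the exit test
                 * closes.
                 */
                cpu1 = (unsigned)sched_getcpu();
        } while (cpu != cpu1 || (version & 1) || p->version != version);

        return t;
}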
include/linux/sched.h

@@ -175,14 +175,6 @@ extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
 extern void calc_global_load(unsigned long ticks);
 extern void update_cpu_load_nohz(void);
 
-/* Notifier for when a task gets migrated to a new CPU */
-struct task_migration_notifier {
-        struct task_struct *task;
-        int from_cpu;
-        int to_cpu;
-};
-extern void register_task_migration_notifier(struct notifier_block *n);
-
 extern unsigned long get_parent_ip(unsigned long addr);
 
 extern void dump_cpu_task(int cpu);
kernel/sched/core.c

@@ -1016,13 +1016,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
                 rq_clock_skip_update(rq, true);
 }
 
-static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
-
-void register_task_migration_notifier(struct notifier_block *n)
-{
-        atomic_notifier_chain_register(&task_migration_notifier, n);
-}
-
 #ifdef CONFIG_SMP
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 {
@@ -1053,18 +1046,10 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
         trace_sched_migrate_task(p, new_cpu);
 
         if (task_cpu(p) != new_cpu) {
-                struct task_migration_notifier tmn;
-
                 if (p->sched_class->migrate_task_rq)
                         p->sched_class->migrate_task_rq(p, new_cpu);
                 p->se.nr_migrations++;
                 perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
-
-                tmn.task = p;
-                tmn.from_cpu = task_cpu(p);
-                tmn.to_cpu = new_cpu;
-
-                atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
         }
 
         __set_task_cpu(p, new_cpu);