Remove from guest code the handling of task migration during a
pvclock read; instead use the correct protocol in KVM. This removes
the need for task migration notifiers in core scheduler code.
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v2.0.22 (GNU/Linux)

iQEcBAABAgAGBQJVQkUWAAoJEL/70l94x66DhfcH/A8RTHUOELtoy+v2weahn21m
FFWEnEUlCWzYgmiddgFdlr6+ub386W3ryFsXKPqjrn/8LVv3yS7tK1NJF8d03LQw
n7HtIsrF01E9UI8CIWO4S/mUxWQev6vEJ9NXtNrsJcRmhSeLaIZkPjTH8Zqyx4i9
ZvG4731WHXmxvbJ03bfJU9Y8OwHXe55GMi614aTxPndVBGdvIRu2Oj6aTfQTeab/
7tEujub0MKWp74a7eyNU4GItcvIAXZCQt2wMc5dN1VK3ma5FTOnHIOuhAb8mACFF
qEeGhtxAnOf7W+s9J8i7zVBdA5MOS0vUKng361ZOVGDb0OLqcVADW7GpuTZfRAM=
=2A7v
-----END PGP SIGNATURE-----

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm changes from Paolo Bonzini:
 "Remove from guest code the handling of task migration during a
  pvclock read; instead use the correct protocol in KVM.

  This removes the need for task migration notifiers in core scheduler
  code"

[ The scheduler people really hated the migration notifiers, so this
  was kind of required - Linus ]

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  x86: pvclock: Really remove the sched notifier for cross-cpu migrations
  kvm: x86: fix kvmclock update protocol

commit 9dbbe3cfc3
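Before the diff, a minimal editorial sketch of the protocol at stake (not part
of the commit; struct and function names are illustrative). kvmclock publishes
per-vCPU time data through shared memory guarded by a seqcount-style version
field: the hypervisor makes the version odd while the data is in flux and even
once it is consistent, and the guest retries until the same even version
brackets its reads.

#include <stdint.h>

struct time_info {
        volatile uint32_t version;     /* odd => update in progress */
        volatile uint64_t system_time; /* stand-in for the payload  */
};

/* Hypervisor side: bracket the payload write with version bumps. */
static void publish(struct time_info *ti, uint64_t now)
{
        ti->version++;            /* odd: readers must retry      */
        __sync_synchronize();     /* version store before payload */
        ti->system_time = now;
        __sync_synchronize();     /* payload before final version */
        ti->version++;            /* even: data consistent again  */
}

/* Guest side: retry until the same even version is seen twice. */
static uint64_t read_time(const struct time_info *ti)
{
        uint32_t v;
        uint64_t t;

        do {
                v = ti->version;
                __sync_synchronize();
                t = ti->system_time;
                __sync_synchronize();
        } while ((v & 1) || v != ti->version);

        return t;
}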
arch/x86/include/asm/pvclock.h

@@ -95,7 +95,6 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
 
 struct pvclock_vsyscall_time_info {
         struct pvclock_vcpu_time_info pvti;
-        u32 migrate_count;
 } __attribute__((__aligned__(SMP_CACHE_BYTES)));
 
 #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
arch/x86/kernel/pvclock.c

@@ -141,46 +141,7 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
         set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
 }
 
-static struct pvclock_vsyscall_time_info *pvclock_vdso_info;
-
-static struct pvclock_vsyscall_time_info *
-pvclock_get_vsyscall_user_time_info(int cpu)
-{
-        if (!pvclock_vdso_info) {
-                BUG();
-                return NULL;
-        }
-
-        return &pvclock_vdso_info[cpu];
-}
-
-struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
-{
-        return &pvclock_get_vsyscall_user_time_info(cpu)->pvti;
-}
-
 #ifdef CONFIG_X86_64
-static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l,
-                        void *v)
-{
-        struct task_migration_notifier *mn = v;
-        struct pvclock_vsyscall_time_info *pvti;
-
-        pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu);
-
-        /* this is NULL when pvclock vsyscall is not initialized */
-        if (unlikely(pvti == NULL))
-                return NOTIFY_DONE;
-
-        pvti->migrate_count++;
-
-        return NOTIFY_DONE;
-}
-
-static struct notifier_block pvclock_migrate = {
-        .notifier_call = pvclock_task_migrate,
-};
-
 /*
  * Initialize the generic pvclock vsyscall state. This will allocate
  * a/some page(s) for the per-vcpu pvclock information, set up a
@@ -194,17 +155,12 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
 
         WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
 
-        pvclock_vdso_info = i;
-
         for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
                 __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
                              __pa(i) + (idx*PAGE_SIZE),
                              PAGE_KERNEL_VVAR);
         }
 
-
-        register_task_migration_notifier(&pvclock_migrate);
-
         return 0;
 }
 #endif
arch/x86/kvm/x86.c

@@ -1669,12 +1669,28 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
                 &guest_hv_clock, sizeof(guest_hv_clock))))
                 return 0;
 
-        /*
-         * The interface expects us to write an even number signaling that the
-         * update is finished. Since the guest won't see the intermediate
-         * state, we just increase by 2 at the end.
+        /* This VCPU is paused, but it's legal for a guest to read another
+         * VCPU's kvmclock, so we really have to follow the specification where
+         * it says that version is odd if data is being modified, and even after
+         * it is consistent.
+         *
+         * Version field updates must be kept separate.  This is because
+         * kvm_write_guest_cached might use a "rep movs" instruction, and
+         * writes within a string instruction are weakly ordered.  So there
+         * are three writes overall.
+         *
+         * As a small optimization, only write the version field in the first
+         * and third write.  The vcpu->pv_time cache is still valid, because the
+         * version field is the first in the struct.
          */
-        vcpu->hv_clock.version = guest_hv_clock.version + 2;
+        BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
+
+        vcpu->hv_clock.version = guest_hv_clock.version + 1;
+        kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+                                &vcpu->hv_clock,
+                                sizeof(vcpu->hv_clock.version));
+
+        smp_wmb();
 
         /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
         pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
@@ -1695,6 +1711,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
         kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
                                 &vcpu->hv_clock,
                                 sizeof(vcpu->hv_clock));
+
+        smp_wmb();
+
+        vcpu->hv_clock.version++;
+        kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+                                &vcpu->hv_clock,
+                                sizeof(vcpu->hv_clock.version));
         return 0;
 }
 
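The "three writes" comment above is the heart of the protocol fix, and it is
worth seeing in isolation. Below is an editorial user-space analogue (not the
kernel's code; pvclock_shared and publish3 are illustrative names, and memcpy
stands in for kvm_write_guest_cached): because a bulk copy may be implemented
with a weakly ordered string instruction such as "rep movs", the version field
has to be written on its own, once before and once after the payload copy.

#include <stdint.h>
#include <string.h>

struct pvclock_shared {
        uint32_t version;      /* first member, so a short write hits it alone
                                  (the role of the BUILD_BUG_ON above)         */
        uint64_t system_time;  /* stand-in for the rest of the payload         */
};

/*
 * 'guest' models the guest-visible copy; 'local' is the hypervisor's
 * private copy. Three writes, fenced, mirroring the sequence the hunks
 * above add to kvm_guest_time_update().
 */
static void publish3(struct pvclock_shared *guest,
                     struct pvclock_shared *local)
{
        local->version = guest->version + 1;          /* make version odd  */
        memcpy(guest, local, sizeof(local->version)); /* write 1: version  */
        __sync_synchronize();                         /* smp_wmb()         */
        memcpy(guest, local, sizeof(*local));         /* write 2: payload
                                                         (internally weakly
                                                         ordered)          */
        __sync_synchronize();                         /* smp_wmb()         */
        local->version++;                             /* back to even      */
        memcpy(guest, local, sizeof(local->version)); /* write 3: version  */
}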
arch/x86/vdso/vclock_gettime.c

@@ -82,15 +82,18 @@ static notrace cycle_t vread_pvclock(int *mode)
         cycle_t ret;
         u64 last;
         u32 version;
-        u32 migrate_count;
         u8 flags;
         unsigned cpu, cpu1;
 
 
         /*
-         * When looping to get a consistent (time-info, tsc) pair, we
-         * also need to deal with the possibility we can switch vcpus,
-         * so make sure we always re-fetch time-info for the current vcpu.
+         * Note: hypervisor must guarantee that:
+         * 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
+         * 2. that per-CPU pvclock time info is updated if the
+         *    underlying CPU changes.
+         * 3. that version is increased whenever underlying CPU
+         *    changes.
+         *
          */
         do {
                 cpu = __getcpu() & VGETCPU_CPU_MASK;
@@ -99,27 +102,20 @@ static notrace cycle_t vread_pvclock(int *mode)
                  * __getcpu() calls (Gleb).
                  */
 
-                /* Make sure migrate_count will change if we leave the VCPU. */
-                do {
-                        pvti = get_pvti(cpu);
-                        migrate_count = pvti->migrate_count;
-
-                        cpu1 = cpu;
-                        cpu = __getcpu() & VGETCPU_CPU_MASK;
-                } while (unlikely(cpu != cpu1));
+                pvti = get_pvti(cpu);
 
                 version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
 
                 /*
                  * Test we're still on the cpu as well as the version.
-                 * - We must read TSC of pvti's VCPU.
-                 * - KVM doesn't follow the versioning protocol, so data could
-                 *   change before version if we left the VCPU.
+                 * We could have been migrated just after the first
+                 * vgetcpu but before fetching the version, so we
+                 * wouldn't notice a version change.
                  */
-                smp_rmb();
-        } while (unlikely((pvti->pvti.version & 1) ||
-                          pvti->pvti.version != version ||
-                          pvti->migrate_count != migrate_count));
+                cpu1 = __getcpu() & VGETCPU_CPU_MASK;
+        } while (unlikely(cpu != cpu1 ||
+                          (pvti->pvti.version & 1) ||
+                          pvti->pvti.version != version));
 
         if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
                 *mode = VCLOCK_NONE;
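The rewritten loop above replaces migrate_count with a simpler discipline:
sample the CPU, read the time info, then re-sample the CPU after the reads and
retry if anything moved. An editorial user-space analogue of that discipline
(sched_getcpu() stands in for __getcpu(); per_cpu_pvti and read_pvclock are
hypothetical names, not kernel APIs):

#define _GNU_SOURCE
#include <sched.h>      /* sched_getcpu() */
#include <stdint.h>

struct pvti {
        volatile uint32_t version;
        volatile uint64_t system_time;
};

extern struct pvti per_cpu_pvti[];  /* hypothetical per-CPU array */

static uint64_t read_pvclock(void)
{
        unsigned cpu, cpu1;
        uint32_t version;
        uint64_t t;
        struct pvti *p;

        do {
                cpu = (unsigned)sched_getcpu();
                p = &per_cpu_pvti[cpu];

                version = p->version;
                __sync_synchronize();
                t = p->system_time;
                __sync_synchronize();

                /*
                 * Re-sample the CPU last: a migration between the first
                 * sched_getcpu() and the version read would otherwise go
                 * unnoticed, which is exactly the race the exit test
                 * closes.
                 */
                cpu1 = (unsigned)sched_getcpu();
        } while (cpu != cpu1 || (version & 1) || p->version != version);

        return t;
}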
include/linux/sched.h

@@ -175,14 +175,6 @@ extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
 extern void calc_global_load(unsigned long ticks);
 extern void update_cpu_load_nohz(void);
 
-/* Notifier for when a task gets migrated to a new CPU */
-struct task_migration_notifier {
-        struct task_struct *task;
-        int from_cpu;
-        int to_cpu;
-};
-extern void register_task_migration_notifier(struct notifier_block *n);
-
 extern unsigned long get_parent_ip(unsigned long addr);
 
 extern void dump_cpu_task(int cpu);
kernel/sched/core.c

@@ -1016,13 +1016,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
                 rq_clock_skip_update(rq, true);
 }
 
-static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
-
-void register_task_migration_notifier(struct notifier_block *n)
-{
-        atomic_notifier_chain_register(&task_migration_notifier, n);
-}
-
 #ifdef CONFIG_SMP
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 {
@@ -1053,18 +1046,10 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
         trace_sched_migrate_task(p, new_cpu);
 
         if (task_cpu(p) != new_cpu) {
-                struct task_migration_notifier tmn;
-
                 if (p->sched_class->migrate_task_rq)
                         p->sched_class->migrate_task_rq(p, new_cpu);
                 p->se.nr_migrations++;
                 perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
-
-                tmn.task = p;
-                tmn.from_cpu = task_cpu(p);
-                tmn.to_cpu = new_cpu;
-
-                atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
         }
 
         __set_task_cpu(p, new_cpu);