Merge branch 'kvm-updates/2.6.39' of git://git.kernel.org/pub/scm/virt/kvm/kvm

* 'kvm-updates/2.6.39' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (55 commits)
  KVM: unbreak userspace that does not sets tss address
  KVM: MMU: cleanup pte write path
  KVM: MMU: introduce a common function to get no-dirty-logged slot
  KVM: fix rcu usage in init_rmode_* functions
  KVM: fix kvmclock regression due to missing clock update
  KVM: emulator: Fix permission checking in io permission bitmap
  KVM: emulator: Fix io permission checking for 64bit guest
  KVM: SVM: Load %gs earlier if CONFIG_X86_32_LAZY_GS=n
  KVM: x86: Remove useless regs_page pointer from kvm_lapic
  KVM: improve comment on rcu use in irqfd_deassign
  KVM: MMU: remove unused macros
  KVM: MMU: cleanup page alloc and free
  KVM: MMU: do not record gfn in kvm_mmu_pte_write
  KVM: MMU: move mmu pages calculated out of mmu lock
  KVM: MMU: set spte accessed bit properly
  KVM: MMU: fix kvm_mmu_slot_remove_write_access dropping intermediate W bits
  KVM: Start lock documentation
  KVM: better readability of efer_reserved_bits
  KVM: Clear async page fault hash after switching to real mode
  KVM: VMX: Initialize vm86 TSS only once.
  ...
This commit is contained in:
Linus Torvalds 2011-03-17 18:40:35 -07:00
Родитель 804f185369 776e58ea3d
Коммит ec0afc9311
33 изменённых файлов: 555 добавлений и 401 удалений

Просмотреть файл

@ -0,0 +1,25 @@
KVM Lock Overview
=================
1. Acquisition Orders
---------------------
(to be written)
2. Reference
------------
Name: kvm_lock
Type: raw_spinlock
Arch: any
Protects: - vm_list
          - hardware virtualization enable/disable
Comment: 'raw' because hardware enabling/disabling must be atomic with
         respect to migration.
Name: kvm_arch::tsc_write_lock
Type: raw_spinlock
Arch: x86
Protects: - kvm_arch::{last_tsc_write,last_tsc_nsec,last_tsc_offset}
          - tsc offset in vmcb
Comment: 'raw' because updating the tsc offsets must not be preempted.

Просмотреть файл

@ -122,4 +122,6 @@
#define ERFKILL 138 /* Operation not possible due to RF-kill */ #define ERFKILL 138 /* Operation not possible due to RF-kill */
#define EHWPOISON 139 /* Memory page has hardware error */
#endif #endif

Просмотреть файл

@ -662,6 +662,7 @@ again:
goto vcpu_run_fail; goto vcpu_run_fail;
srcu_read_unlock(&vcpu->kvm->srcu, idx); srcu_read_unlock(&vcpu->kvm->srcu, idx);
vcpu->mode = IN_GUEST_MODE;
kvm_guest_enter(); kvm_guest_enter();
/* /*
@ -683,6 +684,7 @@ again:
*/ */
barrier(); barrier();
kvm_guest_exit(); kvm_guest_exit();
vcpu->mode = OUTSIDE_GUEST_MODE;
preempt_enable(); preempt_enable();
idx = srcu_read_lock(&vcpu->kvm->srcu); idx = srcu_read_lock(&vcpu->kvm->srcu);

Просмотреть файл

@ -121,6 +121,8 @@
#define ERFKILL 167 /* Operation not possible due to RF-kill */ #define ERFKILL 167 /* Operation not possible due to RF-kill */
#define EHWPOISON 168 /* Memory page has hardware error */
#define EDQUOT 1133 /* Quota exceeded */ #define EDQUOT 1133 /* Quota exceeded */
#ifdef __KERNEL__ #ifdef __KERNEL__

Просмотреть файл

@ -122,4 +122,6 @@
#define ERFKILL 256 /* Operation not possible due to RF-kill */ #define ERFKILL 256 /* Operation not possible due to RF-kill */
#define EHWPOISON 257 /* Memory page has hardware error */
#endif #endif

Просмотреть файл

@ -1141,9 +1141,10 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
regs->sprg1 = vcpu->arch.shared->sprg1; regs->sprg1 = vcpu->arch.shared->sprg1;
regs->sprg2 = vcpu->arch.shared->sprg2; regs->sprg2 = vcpu->arch.shared->sprg2;
regs->sprg3 = vcpu->arch.shared->sprg3; regs->sprg3 = vcpu->arch.shared->sprg3;
regs->sprg5 = vcpu->arch.sprg4; regs->sprg4 = vcpu->arch.sprg4;
regs->sprg6 = vcpu->arch.sprg5; regs->sprg5 = vcpu->arch.sprg5;
regs->sprg7 = vcpu->arch.sprg6; regs->sprg6 = vcpu->arch.sprg6;
regs->sprg7 = vcpu->arch.sprg7;
for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
regs->gpr[i] = kvmppc_get_gpr(vcpu, i); regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
@ -1167,9 +1168,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
vcpu->arch.shared->sprg1 = regs->sprg1; vcpu->arch.shared->sprg1 = regs->sprg1;
vcpu->arch.shared->sprg2 = regs->sprg2; vcpu->arch.shared->sprg2 = regs->sprg2;
vcpu->arch.shared->sprg3 = regs->sprg3; vcpu->arch.shared->sprg3 = regs->sprg3;
vcpu->arch.sprg5 = regs->sprg4; vcpu->arch.sprg4 = regs->sprg4;
vcpu->arch.sprg6 = regs->sprg5; vcpu->arch.sprg5 = regs->sprg5;
vcpu->arch.sprg7 = regs->sprg6; vcpu->arch.sprg6 = regs->sprg6;
vcpu->arch.sprg7 = regs->sprg7;
for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
kvmppc_set_gpr(vcpu, i, regs->gpr[i]); kvmppc_set_gpr(vcpu, i, regs->gpr[i]);

Просмотреть файл

@ -546,9 +546,10 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
regs->sprg1 = vcpu->arch.shared->sprg1; regs->sprg1 = vcpu->arch.shared->sprg1;
regs->sprg2 = vcpu->arch.shared->sprg2; regs->sprg2 = vcpu->arch.shared->sprg2;
regs->sprg3 = vcpu->arch.shared->sprg3; regs->sprg3 = vcpu->arch.shared->sprg3;
regs->sprg5 = vcpu->arch.sprg4; regs->sprg4 = vcpu->arch.sprg4;
regs->sprg6 = vcpu->arch.sprg5; regs->sprg5 = vcpu->arch.sprg5;
regs->sprg7 = vcpu->arch.sprg6; regs->sprg6 = vcpu->arch.sprg6;
regs->sprg7 = vcpu->arch.sprg7;
for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
regs->gpr[i] = kvmppc_get_gpr(vcpu, i); regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
@ -572,9 +573,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
vcpu->arch.shared->sprg1 = regs->sprg1; vcpu->arch.shared->sprg1 = regs->sprg1;
vcpu->arch.shared->sprg2 = regs->sprg2; vcpu->arch.shared->sprg2 = regs->sprg2;
vcpu->arch.shared->sprg3 = regs->sprg3; vcpu->arch.shared->sprg3 = regs->sprg3;
vcpu->arch.sprg5 = regs->sprg4; vcpu->arch.sprg4 = regs->sprg4;
vcpu->arch.sprg6 = regs->sprg5; vcpu->arch.sprg5 = regs->sprg5;
vcpu->arch.sprg7 = regs->sprg6; vcpu->arch.sprg6 = regs->sprg6;
vcpu->arch.sprg7 = regs->sprg7;
for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
kvmppc_set_gpr(vcpu, i, regs->gpr[i]); kvmppc_set_gpr(vcpu, i, regs->gpr[i]);

Просмотреть файл

@ -112,4 +112,6 @@
#define ERFKILL 134 /* Operation not possible due to RF-kill */ #define ERFKILL 134 /* Operation not possible due to RF-kill */
#define EHWPOISON 135 /* Memory page has hardware error */
#endif #endif

Просмотреть файл

@ -142,9 +142,9 @@ struct x86_emulate_ops {
int (*pio_out_emulated)(int size, unsigned short port, const void *val, int (*pio_out_emulated)(int size, unsigned short port, const void *val,
unsigned int count, struct kvm_vcpu *vcpu); unsigned int count, struct kvm_vcpu *vcpu);
bool (*get_cached_descriptor)(struct desc_struct *desc, bool (*get_cached_descriptor)(struct desc_struct *desc, u32 *base3,
int seg, struct kvm_vcpu *vcpu); int seg, struct kvm_vcpu *vcpu);
void (*set_cached_descriptor)(struct desc_struct *desc, void (*set_cached_descriptor)(struct desc_struct *desc, u32 base3,
int seg, struct kvm_vcpu *vcpu); int seg, struct kvm_vcpu *vcpu);
u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu); u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu);
void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu);
@ -239,6 +239,7 @@ struct x86_emulate_ctxt {
int interruptibility; int interruptibility;
bool perm_ok; /* do not check permissions if true */ bool perm_ok; /* do not check permissions if true */
bool only_vendor_specific_insn;
bool have_exception; bool have_exception;
struct x86_exception exception; struct x86_exception exception;

Просмотреть файл

@ -85,7 +85,7 @@
#define ASYNC_PF_PER_VCPU 64 #define ASYNC_PF_PER_VCPU 64
extern spinlock_t kvm_lock; extern raw_spinlock_t kvm_lock;
extern struct list_head vm_list; extern struct list_head vm_list;
struct kvm_vcpu; struct kvm_vcpu;
@ -255,6 +255,8 @@ struct kvm_mmu {
int (*sync_page)(struct kvm_vcpu *vcpu, int (*sync_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp); struct kvm_mmu_page *sp);
void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
u64 *spte, const void *pte, unsigned long mmu_seq);
hpa_t root_hpa; hpa_t root_hpa;
int root_level; int root_level;
int shadow_root_level; int shadow_root_level;
@ -335,12 +337,6 @@ struct kvm_vcpu_arch {
u64 *last_pte_updated; u64 *last_pte_updated;
gfn_t last_pte_gfn; gfn_t last_pte_gfn;
struct {
gfn_t gfn; /* presumed gfn during guest pte update */
pfn_t pfn; /* pfn corresponding to that gfn */
unsigned long mmu_seq;
} update_pte;
struct fpu guest_fpu; struct fpu guest_fpu;
u64 xcr0; u64 xcr0;
@ -448,7 +444,7 @@ struct kvm_arch {
unsigned long irq_sources_bitmap; unsigned long irq_sources_bitmap;
s64 kvmclock_offset; s64 kvmclock_offset;
spinlock_t tsc_write_lock; raw_spinlock_t tsc_write_lock;
u64 last_tsc_nsec; u64 last_tsc_nsec;
u64 last_tsc_offset; u64 last_tsc_offset;
u64 last_tsc_write; u64 last_tsc_write;

Просмотреть файл

@ -43,6 +43,7 @@
#define MSR_MTRRcap 0x000000fe #define MSR_MTRRcap 0x000000fe
#define MSR_IA32_BBL_CR_CTL 0x00000119 #define MSR_IA32_BBL_CR_CTL 0x00000119
#define MSR_IA32_BBL_CR_CTL3 0x0000011e
#define MSR_IA32_SYSENTER_CS 0x00000174 #define MSR_IA32_SYSENTER_CS 0x00000174
#define MSR_IA32_SYSENTER_ESP 0x00000175 #define MSR_IA32_SYSENTER_ESP 0x00000175

Просмотреть файл

@ -493,7 +493,7 @@ static void __init kvm_smp_prepare_boot_cpu(void)
native_smp_prepare_boot_cpu(); native_smp_prepare_boot_cpu();
} }
static void kvm_guest_cpu_online(void *dummy) static void __cpuinit kvm_guest_cpu_online(void *dummy)
{ {
kvm_guest_cpu_init(); kvm_guest_cpu_init();
} }

Просмотреть файл

@ -76,6 +76,7 @@
#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
/* Misc flags */ /* Misc flags */
#define VendorSpecific (1<<22) /* Vendor specific instruction */
#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */ #define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */
#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */ #define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */
#define Undefined (1<<25) /* No Such Instruction */ #define Undefined (1<<25) /* No Such Instruction */
@ -877,7 +878,8 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
if (selector & 1 << 2) { if (selector & 1 << 2) {
struct desc_struct desc; struct desc_struct desc;
memset (dt, 0, sizeof *dt); memset (dt, 0, sizeof *dt);
if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu)) if (!ops->get_cached_descriptor(&desc, NULL, VCPU_SREG_LDTR,
ctxt->vcpu))
return; return;
dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */ dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
@ -929,6 +931,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
return ret; return ret;
} }
/* Does not support long mode */
static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
struct x86_emulate_ops *ops, struct x86_emulate_ops *ops,
u16 selector, int seg) u16 selector, int seg)
@ -1040,7 +1043,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
} }
load: load:
ops->set_segment_selector(selector, seg, ctxt->vcpu); ops->set_segment_selector(selector, seg, ctxt->vcpu);
ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu); ops->set_cached_descriptor(&seg_desc, 0, seg, ctxt->vcpu);
return X86EMUL_CONTINUE; return X86EMUL_CONTINUE;
exception: exception:
emulate_exception(ctxt, err_vec, err_code, true); emulate_exception(ctxt, err_vec, err_code, true);
@ -1560,7 +1563,7 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
struct desc_struct *ss) struct desc_struct *ss)
{ {
memset(cs, 0, sizeof(struct desc_struct)); memset(cs, 0, sizeof(struct desc_struct));
ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu); ops->get_cached_descriptor(cs, NULL, VCPU_SREG_CS, ctxt->vcpu);
memset(ss, 0, sizeof(struct desc_struct)); memset(ss, 0, sizeof(struct desc_struct));
cs->l = 0; /* will be adjusted later */ cs->l = 0; /* will be adjusted later */
@ -1607,9 +1610,9 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
cs.d = 0; cs.d = 0;
cs.l = 1; cs.l = 1;
} }
ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu);
ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu);
ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
c->regs[VCPU_REGS_RCX] = c->eip; c->regs[VCPU_REGS_RCX] = c->eip;
@ -1679,9 +1682,9 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
cs.l = 1; cs.l = 1;
} }
ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu);
ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu);
ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
@ -1736,9 +1739,9 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
cs_sel |= SELECTOR_RPL_MASK; cs_sel |= SELECTOR_RPL_MASK;
ss_sel |= SELECTOR_RPL_MASK; ss_sel |= SELECTOR_RPL_MASK;
ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu);
ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu);
ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
c->eip = c->regs[VCPU_REGS_RDX]; c->eip = c->regs[VCPU_REGS_RDX];
@ -1764,24 +1767,28 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
u16 port, u16 len) u16 port, u16 len)
{ {
struct desc_struct tr_seg; struct desc_struct tr_seg;
u32 base3;
int r; int r;
u16 io_bitmap_ptr; u16 io_bitmap_ptr, perm, bit_idx = port & 0x7;
u8 perm, bit_idx = port & 0x7;
unsigned mask = (1 << len) - 1; unsigned mask = (1 << len) - 1;
unsigned long base;
ops->get_cached_descriptor(&tr_seg, VCPU_SREG_TR, ctxt->vcpu); ops->get_cached_descriptor(&tr_seg, &base3, VCPU_SREG_TR, ctxt->vcpu);
if (!tr_seg.p) if (!tr_seg.p)
return false; return false;
if (desc_limit_scaled(&tr_seg) < 103) if (desc_limit_scaled(&tr_seg) < 103)
return false; return false;
r = ops->read_std(get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2, base = get_desc_base(&tr_seg);
ctxt->vcpu, NULL); #ifdef CONFIG_X86_64
base |= ((u64)base3) << 32;
#endif
r = ops->read_std(base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, NULL);
if (r != X86EMUL_CONTINUE) if (r != X86EMUL_CONTINUE)
return false; return false;
if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg)) if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg))
return false; return false;
r = ops->read_std(get_desc_base(&tr_seg) + io_bitmap_ptr + port/8, r = ops->read_std(base + io_bitmap_ptr + port/8, &perm, 2, ctxt->vcpu,
&perm, 1, ctxt->vcpu, NULL); NULL);
if (r != X86EMUL_CONTINUE) if (r != X86EMUL_CONTINUE)
return false; return false;
if ((perm >> bit_idx) & mask) if ((perm >> bit_idx) & mask)
@ -2126,7 +2133,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
} }
ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu); ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu);
ops->set_cached_descriptor(&next_tss_desc, VCPU_SREG_TR, ctxt->vcpu); ops->set_cached_descriptor(&next_tss_desc, 0, VCPU_SREG_TR, ctxt->vcpu);
ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu); ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu);
if (has_error_code) { if (has_error_code) {
@ -2365,7 +2372,8 @@ static struct group_dual group7 = { {
D(SrcMem16 | ModRM | Mov | Priv), D(SrcMem16 | ModRM | Mov | Priv),
D(SrcMem | ModRM | ByteOp | Priv | NoAccess), D(SrcMem | ModRM | ByteOp | Priv | NoAccess),
}, { }, {
D(SrcNone | ModRM | Priv), N, N, D(SrcNone | ModRM | Priv), D(SrcNone | ModRM | Priv | VendorSpecific), N,
N, D(SrcNone | ModRM | Priv | VendorSpecific),
D(SrcNone | ModRM | DstMem | Mov), N, D(SrcNone | ModRM | DstMem | Mov), N,
D(SrcMem16 | ModRM | Mov | Priv), N, D(SrcMem16 | ModRM | Mov | Priv), N,
} }; } };
@ -2489,7 +2497,7 @@ static struct opcode opcode_table[256] = {
static struct opcode twobyte_table[256] = { static struct opcode twobyte_table[256] = {
/* 0x00 - 0x0F */ /* 0x00 - 0x0F */
N, GD(0, &group7), N, N, N, GD(0, &group7), N, N,
N, D(ImplicitOps), D(ImplicitOps | Priv), N, N, D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv), N,
D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N, D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N,
N, D(ImplicitOps | ModRM), N, N, N, D(ImplicitOps | ModRM), N, N,
/* 0x10 - 0x1F */ /* 0x10 - 0x1F */
@ -2502,7 +2510,8 @@ static struct opcode twobyte_table[256] = {
/* 0x30 - 0x3F */ /* 0x30 - 0x3F */
D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc), D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc),
D(ImplicitOps | Priv), N, D(ImplicitOps | Priv), N,
D(ImplicitOps), D(ImplicitOps | Priv), N, N, D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv | VendorSpecific),
N, N,
N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
/* 0x40 - 0x4F */ /* 0x40 - 0x4F */
X16(D(DstReg | SrcMem | ModRM | Mov)), X16(D(DstReg | SrcMem | ModRM | Mov)),
@ -2741,6 +2750,9 @@ done_prefixes:
if (c->d == 0 || (c->d & Undefined)) if (c->d == 0 || (c->d & Undefined))
return -1; return -1;
if (!(c->d & VendorSpecific) && ctxt->only_vendor_specific_insn)
return -1;
if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
c->op_bytes = 8; c->op_bytes = 8;

Просмотреть файл

@ -61,9 +61,6 @@ static void pic_unlock(struct kvm_pic *s)
} }
} }
if (!found)
found = s->kvm->bsp_vcpu;
if (!found) if (!found)
return; return;
@ -75,7 +72,6 @@ static void pic_unlock(struct kvm_pic *s)
static void pic_clear_isr(struct kvm_kpic_state *s, int irq) static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
{ {
s->isr &= ~(1 << irq); s->isr &= ~(1 << irq);
s->isr_ack |= (1 << irq);
if (s != &s->pics_state->pics[0]) if (s != &s->pics_state->pics[0])
irq += 8; irq += 8;
/* /*
@ -89,16 +85,6 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
pic_lock(s->pics_state); pic_lock(s->pics_state);
} }
void kvm_pic_clear_isr_ack(struct kvm *kvm)
{
struct kvm_pic *s = pic_irqchip(kvm);
pic_lock(s);
s->pics[0].isr_ack = 0xff;
s->pics[1].isr_ack = 0xff;
pic_unlock(s);
}
/* /*
* set irq level. If an edge is detected, then the IRR is set to 1 * set irq level. If an edge is detected, then the IRR is set to 1
*/ */
@ -281,7 +267,6 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
s->irr = 0; s->irr = 0;
s->imr = 0; s->imr = 0;
s->isr = 0; s->isr = 0;
s->isr_ack = 0xff;
s->priority_add = 0; s->priority_add = 0;
s->irq_base = 0; s->irq_base = 0;
s->read_reg_select = 0; s->read_reg_select = 0;
@ -545,15 +530,11 @@ static int picdev_read(struct kvm_io_device *this,
*/ */
static void pic_irq_request(struct kvm *kvm, int level) static void pic_irq_request(struct kvm *kvm, int level)
{ {
struct kvm_vcpu *vcpu = kvm->bsp_vcpu;
struct kvm_pic *s = pic_irqchip(kvm); struct kvm_pic *s = pic_irqchip(kvm);
int irq = pic_get_irq(&s->pics[0]);
s->output = level; if (!s->output)
if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
s->pics[0].isr_ack &= ~(1 << irq);
s->wakeup_needed = true; s->wakeup_needed = true;
} s->output = level;
} }
static const struct kvm_io_device_ops picdev_ops = { static const struct kvm_io_device_ops picdev_ops = {
@ -575,8 +556,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
s->pics[1].elcr_mask = 0xde; s->pics[1].elcr_mask = 0xde;
s->pics[0].pics_state = s; s->pics[0].pics_state = s;
s->pics[1].pics_state = s; s->pics[1].pics_state = s;
s->pics[0].isr_ack = 0xff;
s->pics[1].isr_ack = 0xff;
/* /*
* Initialize PIO device * Initialize PIO device

Просмотреть файл

@ -417,10 +417,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
case APIC_DM_INIT: case APIC_DM_INIT:
if (level) { if (level) {
result = 1; result = 1;
if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
printk(KERN_DEBUG
"INIT on a runnable vcpu %d\n",
vcpu->vcpu_id);
vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu); kvm_vcpu_kick(vcpu);
@ -875,8 +871,8 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer); hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer);
if (vcpu->arch.apic->regs_page) if (vcpu->arch.apic->regs)
__free_page(vcpu->arch.apic->regs_page); free_page((unsigned long)vcpu->arch.apic->regs);
kfree(vcpu->arch.apic); kfree(vcpu->arch.apic);
} }
@ -1065,13 +1061,12 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
vcpu->arch.apic = apic; vcpu->arch.apic = apic;
apic->regs_page = alloc_page(GFP_KERNEL|__GFP_ZERO); apic->regs = (void *)get_zeroed_page(GFP_KERNEL);
if (apic->regs_page == NULL) { if (!apic->regs) {
printk(KERN_ERR "malloc apic regs error for vcpu %x\n", printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
vcpu->vcpu_id); vcpu->vcpu_id);
goto nomem_free_apic; goto nomem_free_apic;
} }
apic->regs = page_address(apic->regs_page);
apic->vcpu = vcpu; apic->vcpu = vcpu;
hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,

Просмотреть файл

@ -13,7 +13,6 @@ struct kvm_lapic {
u32 divide_count; u32 divide_count;
struct kvm_vcpu *vcpu; struct kvm_vcpu *vcpu;
bool irr_pending; bool irr_pending;
struct page *regs_page;
void *regs; void *regs;
gpa_t vapic_addr; gpa_t vapic_addr;
struct page *vapic_page; struct page *vapic_page;

Просмотреть файл

@ -111,9 +111,6 @@ module_param(oos_shadow, bool, 0644);
#define PT64_LEVEL_SHIFT(level) \ #define PT64_LEVEL_SHIFT(level) \
(PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
#define PT64_LEVEL_MASK(level) \
(((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
#define PT64_INDEX(address, level)\ #define PT64_INDEX(address, level)\
(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
@ -123,8 +120,6 @@ module_param(oos_shadow, bool, 0644);
#define PT32_LEVEL_SHIFT(level) \ #define PT32_LEVEL_SHIFT(level) \
(PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS) (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
#define PT32_LEVEL_MASK(level) \
(((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
#define PT32_LVL_OFFSET_MASK(level) \ #define PT32_LVL_OFFSET_MASK(level) \
(PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
* PT32_LEVEL_BITS))) - 1)) * PT32_LEVEL_BITS))) - 1))
@ -379,15 +374,15 @@ static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
int min) int min)
{ {
struct page *page; void *page;
if (cache->nobjs >= min) if (cache->nobjs >= min)
return 0; return 0;
while (cache->nobjs < ARRAY_SIZE(cache->objects)) { while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
page = alloc_page(GFP_KERNEL); page = (void *)__get_free_page(GFP_KERNEL);
if (!page) if (!page)
return -ENOMEM; return -ENOMEM;
cache->objects[cache->nobjs++] = page_address(page); cache->objects[cache->nobjs++] = page;
} }
return 0; return 0;
} }
@ -554,13 +549,23 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
return ret; return ret;
} }
static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) static struct kvm_memory_slot *
gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
bool no_dirty_log)
{ {
struct kvm_memory_slot *slot; struct kvm_memory_slot *slot;
slot = gfn_to_memslot(vcpu->kvm, large_gfn);
if (slot && slot->dirty_bitmap) slot = gfn_to_memslot(vcpu->kvm, gfn);
return true; if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
return false; (no_dirty_log && slot->dirty_bitmap))
slot = NULL;
return slot;
}
static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
{
return gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true);
} }
static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
@ -1032,9 +1037,9 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
ASSERT(is_empty_shadow_page(sp->spt)); ASSERT(is_empty_shadow_page(sp->spt));
hlist_del(&sp->hash_link); hlist_del(&sp->hash_link);
list_del(&sp->link); list_del(&sp->link);
__free_page(virt_to_page(sp->spt)); free_page((unsigned long)sp->spt);
if (!sp->role.direct) if (!sp->role.direct)
__free_page(virt_to_page(sp->gfns)); free_page((unsigned long)sp->gfns);
kmem_cache_free(mmu_page_header_cache, sp); kmem_cache_free(mmu_page_header_cache, sp);
kvm_mod_used_mmu_pages(kvm, -1); kvm_mod_used_mmu_pages(kvm, -1);
} }
@ -1199,6 +1204,13 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{ {
} }
/*
 * update_pte stub: ignores every argument and unconditionally triggers
 * WARN_ON(1), so each call leaves a kernel warning and does nothing else.
 *
 * NOTE(review): presumably this hook is never supposed to be reached for
 * the MMU contexts that install it -- confirm against the callers.
 */
static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp, u64 *spte,
const void *pte, unsigned long mmu_seq)
{
WARN_ON(1);
}
#define KVM_PAGE_ARRAY_NR 16 #define KVM_PAGE_ARRAY_NR 16
struct kvm_mmu_pages { struct kvm_mmu_pages {
@ -2150,26 +2162,13 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{ {
} }
static struct kvm_memory_slot *
pte_prefetch_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log)
{
struct kvm_memory_slot *slot;
slot = gfn_to_memslot(vcpu->kvm, gfn);
if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
(no_dirty_log && slot->dirty_bitmap))
slot = NULL;
return slot;
}
static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
bool no_dirty_log) bool no_dirty_log)
{ {
struct kvm_memory_slot *slot; struct kvm_memory_slot *slot;
unsigned long hva; unsigned long hva;
slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, no_dirty_log); slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
if (!slot) { if (!slot) {
get_page(bad_page); get_page(bad_page);
return page_to_pfn(bad_page); return page_to_pfn(bad_page);
@ -2190,7 +2189,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
gfn_t gfn; gfn_t gfn;
gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
if (!pte_prefetch_gfn_to_memslot(vcpu, gfn, access & ACC_WRITE_MASK)) if (!gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK))
return -1; return -1;
ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start); ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start);
@ -2804,6 +2803,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu,
context->prefetch_page = nonpaging_prefetch_page; context->prefetch_page = nonpaging_prefetch_page;
context->sync_page = nonpaging_sync_page; context->sync_page = nonpaging_sync_page;
context->invlpg = nonpaging_invlpg; context->invlpg = nonpaging_invlpg;
context->update_pte = nonpaging_update_pte;
context->root_level = 0; context->root_level = 0;
context->shadow_root_level = PT32E_ROOT_LEVEL; context->shadow_root_level = PT32E_ROOT_LEVEL;
context->root_hpa = INVALID_PAGE; context->root_hpa = INVALID_PAGE;
@ -2933,6 +2933,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
context->prefetch_page = paging64_prefetch_page; context->prefetch_page = paging64_prefetch_page;
context->sync_page = paging64_sync_page; context->sync_page = paging64_sync_page;
context->invlpg = paging64_invlpg; context->invlpg = paging64_invlpg;
context->update_pte = paging64_update_pte;
context->free = paging_free; context->free = paging_free;
context->root_level = level; context->root_level = level;
context->shadow_root_level = level; context->shadow_root_level = level;
@ -2961,6 +2962,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
context->prefetch_page = paging32_prefetch_page; context->prefetch_page = paging32_prefetch_page;
context->sync_page = paging32_sync_page; context->sync_page = paging32_sync_page;
context->invlpg = paging32_invlpg; context->invlpg = paging32_invlpg;
context->update_pte = paging32_update_pte;
context->root_level = PT32_ROOT_LEVEL; context->root_level = PT32_ROOT_LEVEL;
context->shadow_root_level = PT32E_ROOT_LEVEL; context->shadow_root_level = PT32E_ROOT_LEVEL;
context->root_hpa = INVALID_PAGE; context->root_hpa = INVALID_PAGE;
@ -2985,6 +2987,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->prefetch_page = nonpaging_prefetch_page; context->prefetch_page = nonpaging_prefetch_page;
context->sync_page = nonpaging_sync_page; context->sync_page = nonpaging_sync_page;
context->invlpg = nonpaging_invlpg; context->invlpg = nonpaging_invlpg;
context->update_pte = nonpaging_update_pte;
context->shadow_root_level = kvm_x86_ops->get_tdp_level(); context->shadow_root_level = kvm_x86_ops->get_tdp_level();
context->root_hpa = INVALID_PAGE; context->root_hpa = INVALID_PAGE;
context->direct_map = true; context->direct_map = true;
@ -3089,8 +3092,6 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
static int init_kvm_mmu(struct kvm_vcpu *vcpu) static int init_kvm_mmu(struct kvm_vcpu *vcpu)
{ {
vcpu->arch.update_pte.pfn = bad_pfn;
if (mmu_is_nested(vcpu)) if (mmu_is_nested(vcpu))
return init_kvm_nested_mmu(vcpu); return init_kvm_nested_mmu(vcpu);
else if (tdp_enabled) else if (tdp_enabled)
@ -3164,7 +3165,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp, struct kvm_mmu_page *sp,
u64 *spte, u64 *spte,
const void *new) const void *new, unsigned long mmu_seq)
{ {
if (sp->role.level != PT_PAGE_TABLE_LEVEL) { if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
++vcpu->kvm->stat.mmu_pde_zapped; ++vcpu->kvm->stat.mmu_pde_zapped;
@ -3172,10 +3173,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
} }
++vcpu->kvm->stat.mmu_pte_updated; ++vcpu->kvm->stat.mmu_pte_updated;
if (!sp->role.cr4_pae) vcpu->arch.mmu.update_pte(vcpu, sp, spte, new, mmu_seq);
paging32_update_pte(vcpu, sp, spte, new);
else
paging64_update_pte(vcpu, sp, spte, new);
} }
static bool need_remote_flush(u64 old, u64 new) static bool need_remote_flush(u64 old, u64 new)
@ -3210,28 +3208,6 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
return !!(spte && (*spte & shadow_accessed_mask)); return !!(spte && (*spte & shadow_accessed_mask));
} }
/*
 * Pre-resolve the host page that a freshly written guest pte points at and
 * stash it in vcpu->arch.update_pte.
 *
 * Nothing is recorded when the gpte is not present or when the gfn does not
 * resolve to a usable pfn.  On success the gfn/pfn pair is left in
 * vcpu->arch.update_pte together with the mmu notifier sequence number that
 * was sampled before the lookup.
 *
 * @gpa is accepted but not used by this function.
 */
static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
					  u64 gpte)
{
	gfn_t target_gfn;
	pfn_t target_pfn;

	if (!is_present_gpte(gpte))
		return;

	target_gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;

	/*
	 * Sample the notifier sequence first; the read barrier keeps that
	 * load ordered ahead of the pfn lookup below.
	 */
	vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
	smp_rmb();
	target_pfn = gfn_to_pfn(vcpu->kvm, target_gfn);

	if (!is_error_pfn(target_pfn)) {
		vcpu->arch.update_pte.gfn = target_gfn;
		vcpu->arch.update_pte.pfn = target_pfn;
		return;
	}

	/* Lookup failed: drop the pfn and leave update_pte untouched. */
	kvm_release_pfn_clean(target_pfn);
}
static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn) static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
{ {
u64 *spte = vcpu->arch.last_pte_updated; u64 *spte = vcpu->arch.last_pte_updated;
@ -3253,21 +3229,14 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
struct kvm_mmu_page *sp; struct kvm_mmu_page *sp;
struct hlist_node *node; struct hlist_node *node;
LIST_HEAD(invalid_list); LIST_HEAD(invalid_list);
u64 entry, gentry; unsigned long mmu_seq;
u64 *spte; u64 entry, gentry, *spte;
unsigned offset = offset_in_page(gpa); unsigned pte_size, page_offset, misaligned, quadrant, offset;
unsigned pte_size; int level, npte, invlpg_counter, r, flooded = 0;
unsigned page_offset;
unsigned misaligned;
unsigned quadrant;
int level;
int flooded = 0;
int npte;
int r;
int invlpg_counter;
bool remote_flush, local_flush, zap_page; bool remote_flush, local_flush, zap_page;
zap_page = remote_flush = local_flush = false; zap_page = remote_flush = local_flush = false;
offset = offset_in_page(gpa);
pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
@ -3275,9 +3244,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
/* /*
* Assume that the pte write on a page table of the same type * Assume that the pte write on a page table of the same type
* as the current vcpu paging mode. This is nearly always true * as the current vcpu paging mode since we update the sptes only
* (might be false while changing modes). Note it is verified later * when they have the same mode.
* by update_pte().
*/ */
if ((is_pae(vcpu) && bytes == 4) || !new) { if ((is_pae(vcpu) && bytes == 4) || !new) {
/* Handle a 32-bit guest writing two halves of a 64-bit gpte */ /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
@ -3303,15 +3271,17 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
break; break;
} }
mmu_guess_page_from_pte_write(vcpu, gpa, gentry); mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
spin_lock(&vcpu->kvm->mmu_lock); spin_lock(&vcpu->kvm->mmu_lock);
if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter) if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
gentry = 0; gentry = 0;
kvm_mmu_access_page(vcpu, gfn);
kvm_mmu_free_some_pages(vcpu); kvm_mmu_free_some_pages(vcpu);
++vcpu->kvm->stat.mmu_pte_write; ++vcpu->kvm->stat.mmu_pte_write;
trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE); trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
if (guest_initiated) { if (guest_initiated) {
kvm_mmu_access_page(vcpu, gfn);
if (gfn == vcpu->arch.last_pt_write_gfn if (gfn == vcpu->arch.last_pt_write_gfn
&& !last_updated_pte_accessed(vcpu)) { && !last_updated_pte_accessed(vcpu)) {
++vcpu->arch.last_pt_write_count; ++vcpu->arch.last_pt_write_count;
@ -3375,7 +3345,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
if (gentry && if (gentry &&
!((sp->role.word ^ vcpu->arch.mmu.base_role.word) !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
& mask.word)) & mask.word))
mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); mmu_pte_write_new_pte(vcpu, sp, spte, &gentry,
mmu_seq);
if (!remote_flush && need_remote_flush(entry, *spte)) if (!remote_flush && need_remote_flush(entry, *spte))
remote_flush = true; remote_flush = true;
++spte; ++spte;
@ -3385,10 +3356,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE); trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
spin_unlock(&vcpu->kvm->mmu_lock); spin_unlock(&vcpu->kvm->mmu_lock);
if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
vcpu->arch.update_pte.pfn = bad_pfn;
}
} }
int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
@ -3538,14 +3505,23 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
if (!test_bit(slot, sp->slot_bitmap)) if (!test_bit(slot, sp->slot_bitmap))
continue; continue;
if (sp->role.level != PT_PAGE_TABLE_LEVEL)
continue;
pt = sp->spt; pt = sp->spt;
for (i = 0; i < PT64_ENT_PER_PAGE; ++i) for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
if (!is_shadow_present_pte(pt[i]) ||
!is_last_spte(pt[i], sp->role.level))
continue;
if (is_large_pte(pt[i])) {
drop_spte(kvm, &pt[i],
shadow_trap_nonpresent_pte);
--kvm->stat.lpages;
continue;
}
/* avoid RMW */ /* avoid RMW */
if (is_writable_pte(pt[i])) if (is_writable_pte(pt[i]))
update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK); update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK);
}
} }
kvm_flush_remote_tlbs(kvm); kvm_flush_remote_tlbs(kvm);
} }
@ -3583,7 +3559,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
if (nr_to_scan == 0) if (nr_to_scan == 0)
goto out; goto out;
spin_lock(&kvm_lock); raw_spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) { list_for_each_entry(kvm, &vm_list, vm_list) {
int idx, freed_pages; int idx, freed_pages;
@ -3606,7 +3582,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
if (kvm_freed) if (kvm_freed)
list_move_tail(&kvm_freed->vm_list, &vm_list); list_move_tail(&kvm_freed->vm_list, &vm_list);
spin_unlock(&kvm_lock); raw_spin_unlock(&kvm_lock);
out: out:
return percpu_counter_read_positive(&kvm_total_used_mmu_pages); return percpu_counter_read_positive(&kvm_total_used_mmu_pages);

Просмотреть файл

@ -31,7 +31,6 @@
#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
#define PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
#define PT_LEVEL_BITS PT64_LEVEL_BITS #define PT_LEVEL_BITS PT64_LEVEL_BITS
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
#define PT_MAX_FULL_LEVELS 4 #define PT_MAX_FULL_LEVELS 4
@ -48,7 +47,6 @@
#define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl) #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl)
#define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl) #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl)
#define PT_INDEX(addr, level) PT32_INDEX(addr, level) #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
#define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
#define PT_LEVEL_BITS PT32_LEVEL_BITS #define PT_LEVEL_BITS PT32_LEVEL_BITS
#define PT_MAX_FULL_LEVELS 2 #define PT_MAX_FULL_LEVELS 2
#define CMPXCHG cmpxchg #define CMPXCHG cmpxchg
@ -327,7 +325,7 @@ no_present:
} }
static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
u64 *spte, const void *pte) u64 *spte, const void *pte, unsigned long mmu_seq)
{ {
pt_element_t gpte; pt_element_t gpte;
unsigned pte_access; unsigned pte_access;
@ -339,14 +337,14 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
if (is_error_pfn(pfn)) {
kvm_release_pfn_clean(pfn);
return; return;
pfn = vcpu->arch.update_pte.pfn; }
if (is_error_pfn(pfn)) if (mmu_notifier_retry(vcpu, mmu_seq))
return; return;
if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
return;
kvm_get_pfn(pfn);
/* /*
* we call mmu_set_spte() with host_writable = true beacuse that * we call mmu_set_spte() with host_writable = true beacuse that
* vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
@ -829,7 +827,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
#undef FNAME #undef FNAME
#undef PT_BASE_ADDR_MASK #undef PT_BASE_ADDR_MASK
#undef PT_INDEX #undef PT_INDEX
#undef PT_LEVEL_MASK
#undef PT_LVL_ADDR_MASK #undef PT_LVL_ADDR_MASK
#undef PT_LVL_OFFSET_MASK #undef PT_LVL_OFFSET_MASK
#undef PT_LEVEL_BITS #undef PT_LEVEL_BITS

Просмотреть файл

@ -135,6 +135,8 @@ struct vcpu_svm {
u32 *msrpm; u32 *msrpm;
ulong nmi_iret_rip;
struct nested_state nested; struct nested_state nested;
bool nmi_singlestep; bool nmi_singlestep;
@ -1153,7 +1155,9 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
load_gs_index(svm->host.gs); load_gs_index(svm->host.gs);
#else #else
#ifdef CONFIG_X86_32_LAZY_GS
loadsegment(gs, svm->host.gs); loadsegment(gs, svm->host.gs);
#endif
#endif #endif
for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
@ -2653,6 +2657,7 @@ static int iret_interception(struct vcpu_svm *svm)
++svm->vcpu.stat.nmi_window_exits; ++svm->vcpu.stat.nmi_window_exits;
clr_intercept(svm, INTERCEPT_IRET); clr_intercept(svm, INTERCEPT_IRET);
svm->vcpu.arch.hflags |= HF_IRET_MASK; svm->vcpu.arch.hflags |= HF_IRET_MASK;
svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
return 1; return 1;
} }
@ -3474,7 +3479,12 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
svm->int3_injected = 0; svm->int3_injected = 0;
if (svm->vcpu.arch.hflags & HF_IRET_MASK) { /*
* If we've made progress since setting HF_IRET_MASK, we've
* executed an IRET and can allow NMI injection.
*/
if ((svm->vcpu.arch.hflags & HF_IRET_MASK)
&& kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) {
svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
kvm_make_request(KVM_REQ_EVENT, &svm->vcpu); kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
} }
@ -3641,19 +3651,30 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
wrmsrl(MSR_GS_BASE, svm->host.gs_base); wrmsrl(MSR_GS_BASE, svm->host.gs_base);
#else #else
loadsegment(fs, svm->host.fs); loadsegment(fs, svm->host.fs);
#ifndef CONFIG_X86_32_LAZY_GS
loadsegment(gs, svm->host.gs);
#endif
#endif #endif
reload_tss(vcpu); reload_tss(vcpu);
local_irq_disable(); local_irq_disable();
stgi();
vcpu->arch.cr2 = svm->vmcb->save.cr2; vcpu->arch.cr2 = svm->vmcb->save.cr2;
vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
kvm_before_handle_nmi(&svm->vcpu);
stgi();
/* Any pending NMI will happen here */
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
kvm_after_handle_nmi(&svm->vcpu);
sync_cr8_to_lapic(vcpu); sync_cr8_to_lapic(vcpu);
svm->next_rip = 0; svm->next_rip = 0;

Просмотреть файл

@ -93,14 +93,14 @@ module_param(yield_on_hlt, bool, S_IRUGO);
* These 2 parameters are used to config the controls for Pause-Loop Exiting: * These 2 parameters are used to config the controls for Pause-Loop Exiting:
* ple_gap: upper bound on the amount of time between two successive * ple_gap: upper bound on the amount of time between two successive
* executions of PAUSE in a loop. Also indicate if ple enabled. * executions of PAUSE in a loop. Also indicate if ple enabled.
* According to test, this time is usually small than 41 cycles. * According to test, this time is usually smaller than 128 cycles.
* ple_window: upper bound on the amount of time a guest is allowed to execute * ple_window: upper bound on the amount of time a guest is allowed to execute
* in a PAUSE loop. Tests indicate that most spinlocks are held for * in a PAUSE loop. Tests indicate that most spinlocks are held for
* less than 2^12 cycles * less than 2^12 cycles
* Time is measured based on a counter that runs at the same rate as the TSC, * Time is measured based on a counter that runs at the same rate as the TSC,
* refer SDM volume 3b section 21.6.13 & 22.1.3. * refer SDM volume 3b section 21.6.13 & 22.1.3.
*/ */
#define KVM_VMX_DEFAULT_PLE_GAP 41 #define KVM_VMX_DEFAULT_PLE_GAP 128
#define KVM_VMX_DEFAULT_PLE_WINDOW 4096 #define KVM_VMX_DEFAULT_PLE_WINDOW 4096
static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP; static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
module_param(ple_gap, int, S_IRUGO); module_param(ple_gap, int, S_IRUGO);
@ -176,11 +176,11 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
return container_of(vcpu, struct vcpu_vmx, vcpu); return container_of(vcpu, struct vcpu_vmx, vcpu);
} }
static int init_rmode(struct kvm *kvm);
static u64 construct_eptp(unsigned long root_hpa); static u64 construct_eptp(unsigned long root_hpa);
static void kvm_cpu_vmxon(u64 addr); static void kvm_cpu_vmxon(u64 addr);
static void kvm_cpu_vmxoff(void); static void kvm_cpu_vmxoff(void);
static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs); static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@ -1333,19 +1333,25 @@ static __init int vmx_disabled_by_bios(void)
rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
if (msr & FEATURE_CONTROL_LOCKED) { if (msr & FEATURE_CONTROL_LOCKED) {
/* launched w/ TXT and VMX disabled */
if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
&& tboot_enabled()) && tboot_enabled())
return 1; return 1;
/* launched w/o TXT and VMX only enabled w/ TXT */
if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
&& (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
&& !tboot_enabled()) { && !tboot_enabled()) {
printk(KERN_WARNING "kvm: disable TXT in the BIOS or " printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
" activate TXT before enabling KVM\n"); "activate TXT before enabling KVM\n");
return 1; return 1;
} }
/* launched w/o TXT and VMX disabled */
if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
&& !tboot_enabled())
return 1;
} }
return 0; return 0;
/* locked but not enabled */
} }
static void kvm_cpu_vmxon(u64 addr) static void kvm_cpu_vmxon(u64 addr)
@ -1683,6 +1689,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
vmx->emulation_required = 1; vmx->emulation_required = 1;
vmx->rmode.vm86_active = 0; vmx->rmode.vm86_active = 0;
vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector);
vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base); vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base);
vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit); vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit);
vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar);
@ -1756,6 +1763,19 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
vmx->emulation_required = 1; vmx->emulation_required = 1;
vmx->rmode.vm86_active = 1; vmx->rmode.vm86_active = 1;
/*
* Very old userspace does not call KVM_SET_TSS_ADDR before entering
* vcpu. Call it here with phys address pointing 16M below 4G.
*/
if (!vcpu->kvm->arch.tss_addr) {
printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
"called before entering vcpu\n");
srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
vmx_set_tss_addr(vcpu->kvm, 0xfeffd000);
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
}
vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR);
vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
@ -1794,7 +1814,6 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
continue_rmode: continue_rmode:
kvm_mmu_reset_context(vcpu); kvm_mmu_reset_context(vcpu);
init_rmode(vcpu->kvm);
} }
static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
@ -2030,23 +2049,40 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
vmcs_writel(GUEST_CR4, hw_cr4); vmcs_writel(GUEST_CR4, hw_cr4);
} }
static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
return vmcs_readl(sf->base);
}
static void vmx_get_segment(struct kvm_vcpu *vcpu, static void vmx_get_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg) struct kvm_segment *var, int seg)
{ {
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
struct kvm_save_segment *save;
u32 ar; u32 ar;
if (vmx->rmode.vm86_active
&& (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES
|| seg == VCPU_SREG_DS || seg == VCPU_SREG_FS
|| seg == VCPU_SREG_GS)
&& !emulate_invalid_guest_state) {
switch (seg) {
case VCPU_SREG_TR: save = &vmx->rmode.tr; break;
case VCPU_SREG_ES: save = &vmx->rmode.es; break;
case VCPU_SREG_DS: save = &vmx->rmode.ds; break;
case VCPU_SREG_FS: save = &vmx->rmode.fs; break;
case VCPU_SREG_GS: save = &vmx->rmode.gs; break;
default: BUG();
}
var->selector = save->selector;
var->base = save->base;
var->limit = save->limit;
ar = save->ar;
if (seg == VCPU_SREG_TR
|| var->selector == vmcs_read16(sf->selector))
goto use_saved_rmode_seg;
}
var->base = vmcs_readl(sf->base); var->base = vmcs_readl(sf->base);
var->limit = vmcs_read32(sf->limit); var->limit = vmcs_read32(sf->limit);
var->selector = vmcs_read16(sf->selector); var->selector = vmcs_read16(sf->selector);
ar = vmcs_read32(sf->ar_bytes); ar = vmcs_read32(sf->ar_bytes);
use_saved_rmode_seg:
if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state) if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
ar = 0; ar = 0;
var->type = ar & 15; var->type = ar & 15;
@ -2060,6 +2096,18 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
var->unusable = (ar >> 16) & 1; var->unusable = (ar >> 16) & 1;
} }
static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
struct kvm_segment s;
if (to_vmx(vcpu)->rmode.vm86_active) {
vmx_get_segment(vcpu, &s, seg);
return s.base;
}
return vmcs_readl(sf->base);
}
static int vmx_get_cpl(struct kvm_vcpu *vcpu) static int vmx_get_cpl(struct kvm_vcpu *vcpu)
{ {
if (!is_protmode(vcpu)) if (!is_protmode(vcpu))
@ -2101,6 +2149,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
u32 ar; u32 ar;
if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) {
vmcs_write16(sf->selector, var->selector);
vmx->rmode.tr.selector = var->selector; vmx->rmode.tr.selector = var->selector;
vmx->rmode.tr.base = var->base; vmx->rmode.tr.base = var->base;
vmx->rmode.tr.limit = var->limit; vmx->rmode.tr.limit = var->limit;
@ -2361,11 +2410,12 @@ static bool guest_state_valid(struct kvm_vcpu *vcpu)
static int init_rmode_tss(struct kvm *kvm) static int init_rmode_tss(struct kvm *kvm)
{ {
gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; gfn_t fn;
u16 data = 0; u16 data = 0;
int ret = 0; int r, idx, ret = 0;
int r;
idx = srcu_read_lock(&kvm->srcu);
fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
if (r < 0) if (r < 0)
goto out; goto out;
@ -2389,12 +2439,13 @@ static int init_rmode_tss(struct kvm *kvm)
ret = 1; ret = 1;
out: out:
srcu_read_unlock(&kvm->srcu, idx);
return ret; return ret;
} }
static int init_rmode_identity_map(struct kvm *kvm) static int init_rmode_identity_map(struct kvm *kvm)
{ {
int i, r, ret; int i, idx, r, ret;
pfn_t identity_map_pfn; pfn_t identity_map_pfn;
u32 tmp; u32 tmp;
@ -2409,6 +2460,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
return 1; return 1;
ret = 0; ret = 0;
identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
idx = srcu_read_lock(&kvm->srcu);
r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
if (r < 0) if (r < 0)
goto out; goto out;
@ -2424,6 +2476,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
kvm->arch.ept_identity_pagetable_done = true; kvm->arch.ept_identity_pagetable_done = true;
ret = 1; ret = 1;
out: out:
srcu_read_unlock(&kvm->srcu, idx);
return ret; return ret;
} }
@ -2699,22 +2752,6 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
return 0; return 0;
} }
/*
 * Run init_rmode_tss() and then init_rmode_identity_map() for @kvm inside a
 * single kvm->srcu read-side section.
 *
 * Returns 1 when both helpers report success (non-zero), 0 otherwise.  The
 * identity map step is skipped if the TSS step fails.
 */
static int init_rmode(struct kvm *kvm)
{
	int srcu_idx, ok;

	srcu_idx = srcu_read_lock(&kvm->srcu);
	ok = init_rmode_tss(kvm) && init_rmode_identity_map(kvm);
	srcu_read_unlock(&kvm->srcu, srcu_idx);

	return ok;
}
static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
{ {
struct vcpu_vmx *vmx = to_vmx(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu);
@ -2722,10 +2759,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
int ret; int ret;
vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
if (!init_rmode(vmx->vcpu.kvm)) {
ret = -ENOMEM;
goto out;
}
vmx->rmode.vm86_active = 0; vmx->rmode.vm86_active = 0;
@ -2805,7 +2838,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
if (vm_need_tpr_shadow(vmx->vcpu.kvm)) if (vm_need_tpr_shadow(vmx->vcpu.kvm))
vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
page_to_phys(vmx->vcpu.arch.apic->regs_page)); __pa(vmx->vcpu.arch.apic->regs));
vmcs_write32(TPR_THRESHOLD, 0); vmcs_write32(TPR_THRESHOLD, 0);
} }
@ -2971,6 +3004,9 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
if (ret) if (ret)
return ret; return ret;
kvm->arch.tss_addr = addr; kvm->arch.tss_addr = addr;
if (!init_rmode_tss(kvm))
return -ENOMEM;
return 0; return 0;
} }
@ -3962,7 +3998,7 @@ static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
#define Q "l" #define Q "l"
#endif #endif
static void vmx_vcpu_run(struct kvm_vcpu *vcpu) static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
{ {
struct vcpu_vmx *vmx = to_vmx(vcpu); struct vcpu_vmx *vmx = to_vmx(vcpu);
@ -3991,6 +4027,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
asm( asm(
/* Store host registers */ /* Store host registers */
"push %%"R"dx; push %%"R"bp;" "push %%"R"dx; push %%"R"bp;"
"push %%"R"cx \n\t" /* placeholder for guest rcx */
"push %%"R"cx \n\t" "push %%"R"cx \n\t"
"cmp %%"R"sp, %c[host_rsp](%0) \n\t" "cmp %%"R"sp, %c[host_rsp](%0) \n\t"
"je 1f \n\t" "je 1f \n\t"
@ -4032,10 +4069,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
".Lkvm_vmx_return: " ".Lkvm_vmx_return: "
/* Save guest registers, load host registers, keep flags */ /* Save guest registers, load host registers, keep flags */
"xchg %0, (%%"R"sp) \n\t" "mov %0, %c[wordsize](%%"R"sp) \n\t"
"pop %0 \n\t"
"mov %%"R"ax, %c[rax](%0) \n\t" "mov %%"R"ax, %c[rax](%0) \n\t"
"mov %%"R"bx, %c[rbx](%0) \n\t" "mov %%"R"bx, %c[rbx](%0) \n\t"
"push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t" "pop"Q" %c[rcx](%0) \n\t"
"mov %%"R"dx, %c[rdx](%0) \n\t" "mov %%"R"dx, %c[rdx](%0) \n\t"
"mov %%"R"si, %c[rsi](%0) \n\t" "mov %%"R"si, %c[rsi](%0) \n\t"
"mov %%"R"di, %c[rdi](%0) \n\t" "mov %%"R"di, %c[rdi](%0) \n\t"
@ -4053,7 +4091,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
"mov %%cr2, %%"R"ax \n\t" "mov %%cr2, %%"R"ax \n\t"
"mov %%"R"ax, %c[cr2](%0) \n\t" "mov %%"R"ax, %c[cr2](%0) \n\t"
"pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t" "pop %%"R"bp; pop %%"R"dx \n\t"
"setbe %c[fail](%0) \n\t" "setbe %c[fail](%0) \n\t"
: : "c"(vmx), "d"((unsigned long)HOST_RSP), : : "c"(vmx), "d"((unsigned long)HOST_RSP),
[launched]"i"(offsetof(struct vcpu_vmx, launched)), [launched]"i"(offsetof(struct vcpu_vmx, launched)),
@ -4076,7 +4114,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
[r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
[r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
#endif #endif
[cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
[wordsize]"i"(sizeof(ulong))
: "cc", "memory" : "cc", "memory"
, R"ax", R"bx", R"di", R"si" , R"ax", R"bx", R"di", R"si"
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
@ -4183,8 +4222,11 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
if (!kvm->arch.ept_identity_map_addr) if (!kvm->arch.ept_identity_map_addr)
kvm->arch.ept_identity_map_addr = kvm->arch.ept_identity_map_addr =
VMX_EPT_IDENTITY_PAGETABLE_ADDR; VMX_EPT_IDENTITY_PAGETABLE_ADDR;
err = -ENOMEM;
if (alloc_identity_pagetable(kvm) != 0) if (alloc_identity_pagetable(kvm) != 0)
goto free_vmcs; goto free_vmcs;
if (!init_rmode_identity_map(kvm))
goto free_vmcs;
} }
return &vmx->vcpu; return &vmx->vcpu;

Просмотреть файл

@ -81,9 +81,10 @@
* - enable LME and LMA per default on 64 bit KVM * - enable LME and LMA per default on 64 bit KVM
*/ */
#ifdef CONFIG_X86_64 #ifdef CONFIG_X86_64
static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL; static
u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
#else #else
static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL; static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
#endif #endif
#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM #define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
@ -360,8 +361,8 @@ void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
void kvm_inject_nmi(struct kvm_vcpu *vcpu) void kvm_inject_nmi(struct kvm_vcpu *vcpu)
{ {
kvm_make_request(KVM_REQ_NMI, vcpu);
kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_make_request(KVM_REQ_EVENT, vcpu);
vcpu->arch.nmi_pending = 1;
} }
EXPORT_SYMBOL_GPL(kvm_inject_nmi); EXPORT_SYMBOL_GPL(kvm_inject_nmi);
@ -525,8 +526,10 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
kvm_x86_ops->set_cr0(vcpu, cr0); kvm_x86_ops->set_cr0(vcpu, cr0);
if ((cr0 ^ old_cr0) & X86_CR0_PG) if ((cr0 ^ old_cr0) & X86_CR0_PG) {
kvm_clear_async_pf_completion_queue(vcpu); kvm_clear_async_pf_completion_queue(vcpu);
kvm_async_pf_hash_reset(vcpu);
}
if ((cr0 ^ old_cr0) & update_bits) if ((cr0 ^ old_cr0) & update_bits)
kvm_mmu_reset_context(vcpu); kvm_mmu_reset_context(vcpu);
@ -1017,7 +1020,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
unsigned long flags; unsigned long flags;
s64 sdiff; s64 sdiff;
spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
offset = data - native_read_tsc(); offset = data - native_read_tsc();
ns = get_kernel_ns(); ns = get_kernel_ns();
elapsed = ns - kvm->arch.last_tsc_nsec; elapsed = ns - kvm->arch.last_tsc_nsec;
@ -1050,7 +1053,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
kvm->arch.last_tsc_write = data; kvm->arch.last_tsc_write = data;
kvm->arch.last_tsc_offset = offset; kvm->arch.last_tsc_offset = offset;
kvm_x86_ops->write_tsc_offset(vcpu, offset); kvm_x86_ops->write_tsc_offset(vcpu, offset);
spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
/* Reset of TSC must disable overshoot protection below */ /* Reset of TSC must disable overshoot protection below */
vcpu->arch.hv_clock.tsc_timestamp = 0; vcpu->arch.hv_clock.tsc_timestamp = 0;
@ -1453,6 +1456,14 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
return 0; return 0;
} }
/*
 * Drop the vcpu's time page, if one is held: release it with
 * kvm_release_page_dirty() and clear vcpu->arch.time_page.  Does nothing
 * when no page is held.
 */
static void kvmclock_reset(struct kvm_vcpu *vcpu)
{
	if (!vcpu->arch.time_page)
		return;

	kvm_release_page_dirty(vcpu->arch.time_page);
	vcpu->arch.time_page = NULL;
}
int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{ {
switch (msr) { switch (msr) {
@ -1510,10 +1521,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
break; break;
case MSR_KVM_SYSTEM_TIME_NEW: case MSR_KVM_SYSTEM_TIME_NEW:
case MSR_KVM_SYSTEM_TIME: { case MSR_KVM_SYSTEM_TIME: {
if (vcpu->arch.time_page) { kvmclock_reset(vcpu);
kvm_release_page_dirty(vcpu->arch.time_page);
vcpu->arch.time_page = NULL;
}
vcpu->arch.time = data; vcpu->arch.time = data;
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@ -1592,6 +1600,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
} else } else
return set_msr_hyperv(vcpu, msr, data); return set_msr_hyperv(vcpu, msr, data);
break; break;
case MSR_IA32_BBL_CR_CTL3:
/* Drop writes to this legacy MSR -- see rdmsr
* counterpart for further detail.
*/
pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
break;
default: default:
if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
return xen_hvm_config(vcpu, data); return xen_hvm_config(vcpu, data);
@ -1846,6 +1860,19 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
} else } else
return get_msr_hyperv(vcpu, msr, pdata); return get_msr_hyperv(vcpu, msr, pdata);
break; break;
case MSR_IA32_BBL_CR_CTL3:
/* This legacy MSR exists but isn't fully documented in current
* silicon. It is however accessed by winxp in very narrow
* scenarios where it sets bit #19, itself documented as
* a "reserved" bit. Best effort attempt to source coherent
* read data here should the balance of the register be
* interpreted by the guest:
*
* L2 cache control register 3: 64GB range, 256KB size,
* enabled, latency 0x1, configured
*/
data = 0xbe702111;
break;
default: default:
if (!ignore_msrs) { if (!ignore_msrs) {
pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
@ -2100,8 +2127,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (check_tsc_unstable()) { if (check_tsc_unstable()) {
kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta); kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
vcpu->arch.tsc_catchup = 1; vcpu->arch.tsc_catchup = 1;
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
} }
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
if (vcpu->cpu != cpu) if (vcpu->cpu != cpu)
kvm_migrate_timers(vcpu); kvm_migrate_timers(vcpu);
vcpu->cpu = cpu; vcpu->cpu = cpu;
@ -2575,9 +2602,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
if (mce->status & MCI_STATUS_UC) { if (mce->status & MCI_STATUS_UC) {
if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
!kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
printk(KERN_DEBUG "kvm: set_mce: "
"injects mce exception while "
"previous one is in progress!\n");
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
return 0; return 0;
} }
@ -2648,8 +2672,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.interrupt.pending = events->interrupt.injected; vcpu->arch.interrupt.pending = events->interrupt.injected;
vcpu->arch.interrupt.nr = events->interrupt.nr; vcpu->arch.interrupt.nr = events->interrupt.nr;
vcpu->arch.interrupt.soft = events->interrupt.soft; vcpu->arch.interrupt.soft = events->interrupt.soft;
if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
kvm_pic_clear_isr_ack(vcpu->kvm);
if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
kvm_x86_ops->set_interrupt_shadow(vcpu, kvm_x86_ops->set_interrupt_shadow(vcpu,
events->interrupt.shadow); events->interrupt.shadow);
@ -4140,8 +4162,8 @@ static unsigned long emulator_get_cached_segment_base(int seg,
return get_segment_base(vcpu, seg); return get_segment_base(vcpu, seg);
} }
static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, static bool emulator_get_cached_descriptor(struct desc_struct *desc, u32 *base3,
struct kvm_vcpu *vcpu) int seg, struct kvm_vcpu *vcpu)
{ {
struct kvm_segment var; struct kvm_segment var;
@ -4154,6 +4176,10 @@ static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
var.limit >>= 12; var.limit >>= 12;
set_desc_limit(desc, var.limit); set_desc_limit(desc, var.limit);
set_desc_base(desc, (unsigned long)var.base); set_desc_base(desc, (unsigned long)var.base);
#ifdef CONFIG_X86_64
if (base3)
*base3 = var.base >> 32;
#endif
desc->type = var.type; desc->type = var.type;
desc->s = var.s; desc->s = var.s;
desc->dpl = var.dpl; desc->dpl = var.dpl;
@ -4166,8 +4192,8 @@ static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
return true; return true;
} }
static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg, static void emulator_set_cached_descriptor(struct desc_struct *desc, u32 base3,
struct kvm_vcpu *vcpu) int seg, struct kvm_vcpu *vcpu)
{ {
struct kvm_segment var; struct kvm_segment var;
@ -4175,6 +4201,9 @@ static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg,
kvm_get_segment(vcpu, &var, seg); kvm_get_segment(vcpu, &var, seg);
var.base = get_desc_base(desc); var.base = get_desc_base(desc);
#ifdef CONFIG_X86_64
var.base |= ((u64)base3) << 32;
#endif
var.limit = get_desc_limit(desc); var.limit = get_desc_limit(desc);
if (desc->g) if (desc->g)
var.limit = (var.limit << 12) | 0xfff; var.limit = (var.limit << 12) | 0xfff;
@ -4390,41 +4419,16 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
vcpu->arch.emulate_ctxt.have_exception = false; vcpu->arch.emulate_ctxt.have_exception = false;
vcpu->arch.emulate_ctxt.perm_ok = false; vcpu->arch.emulate_ctxt.perm_ok = false;
vcpu->arch.emulate_ctxt.only_vendor_specific_insn
= emulation_type & EMULTYPE_TRAP_UD;
r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len); r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len);
if (r == X86EMUL_PROPAGATE_FAULT)
goto done;
trace_kvm_emulate_insn_start(vcpu); trace_kvm_emulate_insn_start(vcpu);
/* Only allow emulation of specific instructions on #UD
* (namely VMMCALL, sysenter, sysexit, syscall)*/
if (emulation_type & EMULTYPE_TRAP_UD) {
if (!c->twobyte)
return EMULATE_FAIL;
switch (c->b) {
case 0x01: /* VMMCALL */
if (c->modrm_mod != 3 || c->modrm_rm != 1)
return EMULATE_FAIL;
break;
case 0x34: /* sysenter */
case 0x35: /* sysexit */
if (c->modrm_mod != 0 || c->modrm_rm != 0)
return EMULATE_FAIL;
break;
case 0x05: /* syscall */
if (c->modrm_mod != 0 || c->modrm_rm != 0)
return EMULATE_FAIL;
break;
default:
return EMULATE_FAIL;
}
if (!(c->modrm_reg == 0 || c->modrm_reg == 3))
return EMULATE_FAIL;
}
++vcpu->stat.insn_emulation; ++vcpu->stat.insn_emulation;
if (r) { if (r) {
if (emulation_type & EMULTYPE_TRAP_UD)
return EMULATE_FAIL;
if (reexecute_instruction(vcpu, cr2)) if (reexecute_instruction(vcpu, cr2))
return EMULATE_DONE; return EMULATE_DONE;
if (emulation_type & EMULTYPE_SKIP) if (emulation_type & EMULTYPE_SKIP)
@ -4452,7 +4456,6 @@ restart:
return handle_emulation_failure(vcpu); return handle_emulation_failure(vcpu);
} }
done:
if (vcpu->arch.emulate_ctxt.have_exception) { if (vcpu->arch.emulate_ctxt.have_exception) {
inject_emulated_exception(vcpu); inject_emulated_exception(vcpu);
r = EMULATE_DONE; r = EMULATE_DONE;
@ -4562,7 +4565,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
spin_lock(&kvm_lock); raw_spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) { list_for_each_entry(kvm, &vm_list, vm_list) {
kvm_for_each_vcpu(i, vcpu, kvm) { kvm_for_each_vcpu(i, vcpu, kvm) {
if (vcpu->cpu != freq->cpu) if (vcpu->cpu != freq->cpu)
@ -4572,7 +4575,7 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
send_ipi = 1; send_ipi = 1;
} }
} }
spin_unlock(&kvm_lock); raw_spin_unlock(&kvm_lock);
if (freq->old < freq->new && send_ipi) { if (freq->old < freq->new && send_ipi) {
/* /*
@ -5185,6 +5188,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
r = 1; r = 1;
goto out; goto out;
} }
if (kvm_check_request(KVM_REQ_NMI, vcpu))
vcpu->arch.nmi_pending = true;
} }
r = kvm_mmu_reload(vcpu); r = kvm_mmu_reload(vcpu);
@ -5213,14 +5218,18 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_load_guest_fpu(vcpu); kvm_load_guest_fpu(vcpu);
kvm_load_guest_xcr0(vcpu); kvm_load_guest_xcr0(vcpu);
atomic_set(&vcpu->guest_mode, 1); vcpu->mode = IN_GUEST_MODE;
smp_wmb();
/* We should set ->mode before checking ->requests;
* see the comment in make_all_cpus_request.
*/
smp_mb();
local_irq_disable(); local_irq_disable();
if (!atomic_read(&vcpu->guest_mode) || vcpu->requests if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
|| need_resched() || signal_pending(current)) { || need_resched() || signal_pending(current)) {
atomic_set(&vcpu->guest_mode, 0); vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb(); smp_wmb();
local_irq_enable(); local_irq_enable();
preempt_enable(); preempt_enable();
@ -5256,7 +5265,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
atomic_set(&vcpu->guest_mode, 0); vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb(); smp_wmb();
local_irq_enable(); local_irq_enable();
@ -5574,7 +5583,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
struct kvm_sregs *sregs) struct kvm_sregs *sregs)
{ {
int mmu_reset_needed = 0; int mmu_reset_needed = 0;
int pending_vec, max_bits; int pending_vec, max_bits, idx;
struct desc_ptr dt; struct desc_ptr dt;
dt.size = sregs->idt.limit; dt.size = sregs->idt.limit;
@ -5603,10 +5612,13 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
kvm_x86_ops->set_cr4(vcpu, sregs->cr4); kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
if (sregs->cr4 & X86_CR4_OSXSAVE) if (sregs->cr4 & X86_CR4_OSXSAVE)
update_cpuid(vcpu); update_cpuid(vcpu);
idx = srcu_read_lock(&vcpu->kvm->srcu);
if (!is_long_mode(vcpu) && is_pae(vcpu)) { if (!is_long_mode(vcpu) && is_pae(vcpu)) {
load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
mmu_reset_needed = 1; mmu_reset_needed = 1;
} }
srcu_read_unlock(&vcpu->kvm->srcu, idx);
if (mmu_reset_needed) if (mmu_reset_needed)
kvm_mmu_reset_context(vcpu); kvm_mmu_reset_context(vcpu);
@ -5617,8 +5629,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
if (pending_vec < max_bits) { if (pending_vec < max_bits) {
kvm_queue_interrupt(vcpu, pending_vec, false); kvm_queue_interrupt(vcpu, pending_vec, false);
pr_debug("Set back pending irq %d\n", pending_vec); pr_debug("Set back pending irq %d\n", pending_vec);
if (irqchip_in_kernel(vcpu->kvm))
kvm_pic_clear_isr_ack(vcpu->kvm);
} }
kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
@ -5814,10 +5824,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
{ {
if (vcpu->arch.time_page) { kvmclock_reset(vcpu);
kvm_release_page_dirty(vcpu->arch.time_page);
vcpu->arch.time_page = NULL;
}
free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
fx_free(vcpu); fx_free(vcpu);
@ -5878,6 +5885,8 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_make_request(KVM_REQ_EVENT, vcpu);
vcpu->arch.apf.msr_val = 0; vcpu->arch.apf.msr_val = 0;
kvmclock_reset(vcpu);
kvm_clear_async_pf_completion_queue(vcpu); kvm_clear_async_pf_completion_queue(vcpu);
kvm_async_pf_hash_reset(vcpu); kvm_async_pf_hash_reset(vcpu);
vcpu->arch.apf.halted = false; vcpu->arch.apf.halted = false;
@ -6005,7 +6014,7 @@ int kvm_arch_init_vm(struct kvm *kvm)
/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
spin_lock_init(&kvm->arch.tsc_write_lock); raw_spin_lock_init(&kvm->arch.tsc_write_lock);
return 0; return 0;
} }
@ -6103,7 +6112,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
int user_alloc) int user_alloc)
{ {
int npages = mem->memory_size >> PAGE_SHIFT; int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
if (!user_alloc && !old.user_alloc && old.rmap && !npages) { if (!user_alloc && !old.user_alloc && old.rmap && !npages) {
int ret; int ret;
@ -6118,12 +6127,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
"failed to munmap memory\n"); "failed to munmap memory\n");
} }
spin_lock(&kvm->mmu_lock); if (!kvm->arch.n_requested_mmu_pages)
if (!kvm->arch.n_requested_mmu_pages) { nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
}
spin_lock(&kvm->mmu_lock);
if (nr_mmu_pages)
kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
kvm_mmu_slot_remove_write_access(kvm, mem->slot); kvm_mmu_slot_remove_write_access(kvm, mem->slot);
spin_unlock(&kvm->mmu_lock); spin_unlock(&kvm->mmu_lock);
} }
@ -6157,7 +6166,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
me = get_cpu(); me = get_cpu();
if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
if (atomic_xchg(&vcpu->guest_mode, 0)) if (kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE)
smp_send_reschedule(cpu); smp_send_reschedule(cpu);
put_cpu(); put_cpu();
} }

Просмотреть файл

@ -53,8 +53,8 @@ static void __ipath_release_user_pages(struct page **p, size_t num_pages,
} }
/* call with current->mm->mmap_sem held */ /* call with current->mm->mmap_sem held */
static int __get_user_pages(unsigned long start_page, size_t num_pages, static int __ipath_get_user_pages(unsigned long start_page, size_t num_pages,
struct page **p, struct vm_area_struct **vma) struct page **p, struct vm_area_struct **vma)
{ {
unsigned long lock_limit; unsigned long lock_limit;
size_t got; size_t got;
@ -165,7 +165,7 @@ int ipath_get_user_pages(unsigned long start_page, size_t num_pages,
down_write(&current->mm->mmap_sem); down_write(&current->mm->mmap_sem);
ret = __get_user_pages(start_page, num_pages, p, NULL); ret = __ipath_get_user_pages(start_page, num_pages, p, NULL);
up_write(&current->mm->mmap_sem); up_write(&current->mm->mmap_sem);

Просмотреть файл

@ -51,8 +51,8 @@ static void __qib_release_user_pages(struct page **p, size_t num_pages,
/* /*
* Call with current->mm->mmap_sem held. * Call with current->mm->mmap_sem held.
*/ */
static int __get_user_pages(unsigned long start_page, size_t num_pages, static int __qib_get_user_pages(unsigned long start_page, size_t num_pages,
struct page **p, struct vm_area_struct **vma) struct page **p, struct vm_area_struct **vma)
{ {
unsigned long lock_limit; unsigned long lock_limit;
size_t got; size_t got;
@ -136,7 +136,7 @@ int qib_get_user_pages(unsigned long start_page, size_t num_pages,
down_write(&current->mm->mmap_sem); down_write(&current->mm->mmap_sem);
ret = __get_user_pages(start_page, num_pages, p, NULL); ret = __qib_get_user_pages(start_page, num_pages, p, NULL);
up_write(&current->mm->mmap_sem); up_write(&current->mm->mmap_sem);

Просмотреть файл

@ -108,4 +108,6 @@
#define ERFKILL 132 /* Operation not possible due to RF-kill */ #define ERFKILL 132 /* Operation not possible due to RF-kill */
#define EHWPOISON 133 /* Memory page has hardware error */
#endif #endif

Просмотреть файл

@ -43,6 +43,7 @@
#define KVM_REQ_DEACTIVATE_FPU 10 #define KVM_REQ_DEACTIVATE_FPU 10
#define KVM_REQ_EVENT 11 #define KVM_REQ_EVENT 11
#define KVM_REQ_APF_HALT 12 #define KVM_REQ_APF_HALT 12
#define KVM_REQ_NMI 13
#define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_USERSPACE_IRQ_SOURCE_ID 0
@ -98,23 +99,31 @@ int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu); int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu);
#endif #endif
/*
 * Execution states stored in kvm_vcpu::mode.
 *
 * OUTSIDE_GUEST_MODE: the vcpu is not running guest code.
 * IN_GUEST_MODE:      set by the vcpu thread right before it enters the guest.
 * EXITING_GUEST_MODE: set by kvm_vcpu_exiting_guest_mode() via cmpxchg on a
 *                     vcpu that was IN_GUEST_MODE.
 */
enum {
	OUTSIDE_GUEST_MODE = 0,
	IN_GUEST_MODE = 1,
	EXITING_GUEST_MODE = 2,
};
struct kvm_vcpu { struct kvm_vcpu {
struct kvm *kvm; struct kvm *kvm;
#ifdef CONFIG_PREEMPT_NOTIFIERS #ifdef CONFIG_PREEMPT_NOTIFIERS
struct preempt_notifier preempt_notifier; struct preempt_notifier preempt_notifier;
#endif #endif
int cpu;
int vcpu_id; int vcpu_id;
struct mutex mutex; int srcu_idx;
int cpu; int mode;
atomic_t guest_mode;
struct kvm_run *run;
unsigned long requests; unsigned long requests;
unsigned long guest_debug; unsigned long guest_debug;
int srcu_idx;
struct mutex mutex;
struct kvm_run *run;
int fpu_active; int fpu_active;
int guest_fpu_loaded, guest_xcr0_loaded; int guest_fpu_loaded, guest_xcr0_loaded;
wait_queue_head_t wq; wait_queue_head_t wq;
struct pid *pid;
int sigset_active; int sigset_active;
sigset_t sigset; sigset_t sigset;
struct kvm_vcpu_stat stat; struct kvm_vcpu_stat stat;
@ -140,6 +149,11 @@ struct kvm_vcpu {
struct kvm_vcpu_arch arch; struct kvm_vcpu_arch arch;
}; };
/*
 * Try to switch @vcpu from IN_GUEST_MODE to EXITING_GUEST_MODE with a single
 * cmpxchg on vcpu->mode.
 *
 * Returns whatever cmpxchg() reports for vcpu->mode; callers compare that
 * value against IN_GUEST_MODE / OUTSIDE_GUEST_MODE to decide whether the
 * target cpu needs to be interrupted.
 */
static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu)
{
	int observed_mode;

	observed_mode = cmpxchg(&vcpu->mode, IN_GUEST_MODE, EXITING_GUEST_MODE);

	return observed_mode;
}
/* /*
* Some of the bitops functions do not support too long bitmaps. * Some of the bitops functions do not support too long bitmaps.
* This number must be determined not to exceed such limits. * This number must be determined not to exceed such limits.
@ -212,7 +226,6 @@ struct kvm_memslots {
struct kvm { struct kvm {
spinlock_t mmu_lock; spinlock_t mmu_lock;
raw_spinlock_t requests_lock;
struct mutex slots_lock; struct mutex slots_lock;
struct mm_struct *mm; /* userspace tied to this vm */ struct mm_struct *mm; /* userspace tied to this vm */
struct kvm_memslots *memslots; struct kvm_memslots *memslots;
@ -223,6 +236,7 @@ struct kvm {
#endif #endif
struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
atomic_t online_vcpus; atomic_t online_vcpus;
int last_boosted_vcpu;
struct list_head vm_list; struct list_head vm_list;
struct mutex lock; struct mutex lock;
struct kvm_io_bus *buses[KVM_NR_BUSES]; struct kvm_io_bus *buses[KVM_NR_BUSES];
@ -719,11 +733,6 @@ static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
set_bit(req, &vcpu->requests); set_bit(req, &vcpu->requests);
} }
static inline bool kvm_make_check_request(int req, struct kvm_vcpu *vcpu)
{
return test_and_set_bit(req, &vcpu->requests);
}
static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
{ {
if (test_bit(req, &vcpu->requests)) { if (test_bit(req, &vcpu->requests)) {

Просмотреть файл

@ -972,6 +972,10 @@ static inline int handle_mm_fault(struct mm_struct *mm,
extern int make_pages_present(unsigned long addr, unsigned long end); extern int make_pages_present(unsigned long addr, unsigned long end);
extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, unsigned int foll_flags,
struct page **pages, struct vm_area_struct **vmas,
int *nonblocking);
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int nr_pages, int write, int force, unsigned long start, int nr_pages, int write, int force,
struct page **pages, struct vm_area_struct **vmas); struct page **pages, struct vm_area_struct **vmas);
@ -1535,6 +1539,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address,
#define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */
#define FOLL_MLOCK 0x40 /* mark page as mlocked */ #define FOLL_MLOCK 0x40 /* mark page as mlocked */
#define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ #define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */
#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */
typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
void *data); void *data);
@ -1627,14 +1632,6 @@ extern int sysctl_memory_failure_recovery;
extern void shake_page(struct page *p, int access); extern void shake_page(struct page *p, int access);
extern atomic_long_t mce_bad_pages; extern atomic_long_t mce_bad_pages;
extern int soft_offline_page(struct page *page, int flags); extern int soft_offline_page(struct page *page, int flags);
#ifdef CONFIG_MEMORY_FAILURE
int is_hwpoison_address(unsigned long addr);
#else
static inline int is_hwpoison_address(unsigned long addr)
{
return 0;
}
#endif
extern void dump_page(struct page *page); extern void dump_page(struct page *page);

Просмотреть файл

@ -193,6 +193,7 @@ void __put_task_struct(struct task_struct *tsk)
if (!profile_handoff_task(tsk)) if (!profile_handoff_task(tsk))
free_task(tsk); free_task(tsk);
} }
EXPORT_SYMBOL_GPL(__put_task_struct);
/* /*
* macro override instead of weak attribute alias, to workaround * macro override instead of weak attribute alias, to workaround

Просмотреть файл

@ -435,6 +435,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
rcu_read_unlock(); rcu_read_unlock();
return pid; return pid;
} }
EXPORT_SYMBOL_GPL(get_task_pid);
struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
{ {
@ -446,6 +447,7 @@ struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
rcu_read_unlock(); rcu_read_unlock();
return result; return result;
} }
EXPORT_SYMBOL_GPL(get_pid_task);
struct pid *find_get_pid(pid_t nr) struct pid *find_get_pid(pid_t nr)
{ {

Просмотреть файл

@ -245,11 +245,6 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
} }
#endif /* CONFIG_SPARSEMEM */ #endif /* CONFIG_SPARSEMEM */
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, unsigned int foll_flags,
struct page **pages, struct vm_area_struct **vmas,
int *nonblocking);
#define ZONE_RECLAIM_NOSCAN -2 #define ZONE_RECLAIM_NOSCAN -2
#define ZONE_RECLAIM_FULL -1 #define ZONE_RECLAIM_FULL -1
#define ZONE_RECLAIM_SOME 0 #define ZONE_RECLAIM_SOME 0

Просмотреть файл

@ -1487,35 +1487,3 @@ done:
/* keep elevated page count for bad page */ /* keep elevated page count for bad page */
return ret; return ret;
} }
/*
 * is_hwpoison_address - test whether @addr maps to a hwpoison swap entry
 * @addr: user virtual address looked up in current->mm
 *
 * Walks the page tables of current->mm one level at a time (pgd, pud, pmd,
 * pte).  The walk stops and 0 is returned when a level is not present, or
 * when the pud/pmd entry is a large mapping.  At the last level the pte is
 * copied, unmapped again, and checked: if it is a swap pte, the result is
 * is_hwpoison_entry() of its swap entry; otherwise 0.
 *
 * The caller must hold current->mm->mmap_sem in read mode.
 */
int is_hwpoison_address(unsigned long addr)
{
	pgd_t *pgd_slot;
	pud_t *pud_slot, pud_val;
	pmd_t *pmd_slot, pmd_val;
	pte_t *pte_slot, pte_val;

	pgd_slot = pgd_offset(current->mm, addr);
	if (!pgd_present(*pgd_slot))
		return 0;

	/* Snapshot each upper-level entry once and test only the snapshot. */
	pud_slot = pud_offset(pgd_slot, addr);
	pud_val = *pud_slot;
	if (!pud_present(pud_val) || pud_large(pud_val))
		return 0;

	pmd_slot = pmd_offset(pud_slot, addr);
	pmd_val = *pmd_slot;
	if (!pmd_present(pmd_val) || pmd_large(pmd_val))
		return 0;

	/* Copy the pte out before dropping the temporary mapping. */
	pte_slot = pte_offset_map(pmd_slot, addr);
	pte_val = *pte_slot;
	pte_unmap(pte_slot);

	return is_swap_pte(pte_val) ?
		is_hwpoison_entry(pte_to_swp_entry(pte_val)) : 0;
}
EXPORT_SYMBOL_GPL(is_hwpoison_address);

Просмотреть файл

@ -1410,6 +1410,55 @@ no_page_table:
return page; return page;
} }
/**
* __get_user_pages() - pin user pages in memory
* @tsk: task_struct of target task
* @mm: mm_struct of target mm
* @start: starting user address
* @nr_pages: number of pages from start to pin
* @gup_flags: flags modifying pin behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long. Or NULL, if caller
* only intends to ensure the pages are faulted in.
* @vmas: array of pointers to vmas corresponding to each page.
* Or NULL if the caller does not require them.
* @nonblocking: if non-NULL, do not wait for disk IO or mmap_sem contention;
*		*@nonblocking is set to 0 if waiting would have been needed
*
* Returns number of pages pinned. This may be fewer than the number
* requested. If nr_pages is 0 or negative, returns 0. If no pages
* were pinned, returns -errno. Each page returned must be released
* with a put_page() call when it is finished with. vmas will only
* remain valid while mmap_sem is held.
*
* Must be called with mmap_sem held for read or write.
*
* __get_user_pages walks a process's page tables and takes a reference to
* each struct page that each user address corresponds to at a given
* instant. That is, it takes the page that would be accessed if a user
* thread accesses the given user virtual address at that instant.
*
* This does not guarantee that the page exists in the user mappings when
* __get_user_pages returns, and there may even be a completely different
* page there in some cases (eg. if mmapped pagecache has been invalidated
* and subsequently re faulted). However it does guarantee that the page
* won't be freed completely. And mostly callers simply care that the page
* contains data that was valid *at some point in time*. Typically, an IO
* or similar operation cannot guarantee anything stronger anyway because
* locks can't be held over the syscall boundary.
*
* If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
* the page is written to, set_page_dirty (or set_page_dirty_lock, as
* appropriate) must be called after the page is finished with, and
* before put_page is called.
*
* If @nonblocking != NULL, __get_user_pages will not wait for disk IO
* or mmap_sem contention, and if waiting is needed to pin all pages,
* *@nonblocking will be set to 0.
*
* In most cases, get_user_pages or get_user_pages_fast should be used
* instead of __get_user_pages. __get_user_pages should be used only if
* you need some special @gup_flags.
*/
int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int nr_pages, unsigned int gup_flags, unsigned long start, int nr_pages, unsigned int gup_flags,
struct page **pages, struct vm_area_struct **vmas, struct page **pages, struct vm_area_struct **vmas,
@ -1527,9 +1576,16 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_ERROR) {
if (ret & VM_FAULT_OOM) if (ret & VM_FAULT_OOM)
return i ? i : -ENOMEM; return i ? i : -ENOMEM;
if (ret & if (ret & (VM_FAULT_HWPOISON |
(VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE| VM_FAULT_HWPOISON_LARGE)) {
VM_FAULT_SIGBUS)) if (i)
return i;
else if (gup_flags & FOLL_HWPOISON)
return -EHWPOISON;
else
return -EFAULT;
}
if (ret & VM_FAULT_SIGBUS)
return i ? i : -EFAULT; return i ? i : -EFAULT;
BUG(); BUG();
} }
@ -1578,6 +1634,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
} while (nr_pages); } while (nr_pages);
return i; return i;
} }
EXPORT_SYMBOL(__get_user_pages);
/** /**
* get_user_pages() - pin user pages in memory * get_user_pages() - pin user pages in memory

Просмотреть файл

@ -313,8 +313,9 @@ kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi)
if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) { if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) {
/* /*
* This rcu_assign_pointer is needed for when * This rcu_assign_pointer is needed for when
* another thread calls kvm_irqfd_update before * another thread calls kvm_irq_routing_update before
* we flush workqueue below. * we flush workqueue below (we synchronize with
* kvm_irq_routing_update using irqfds.lock).
* It is paired with synchronize_rcu done by caller * It is paired with synchronize_rcu done by caller
* of that function. * of that function.
*/ */

Просмотреть файл

@ -69,7 +69,7 @@ MODULE_LICENSE("GPL");
* kvm->lock --> kvm->slots_lock --> kvm->irq_lock * kvm->lock --> kvm->slots_lock --> kvm->irq_lock
*/ */
DEFINE_SPINLOCK(kvm_lock); DEFINE_RAW_SPINLOCK(kvm_lock);
LIST_HEAD(vm_list); LIST_HEAD(vm_list);
static cpumask_var_t cpus_hardware_enabled; static cpumask_var_t cpus_hardware_enabled;
@ -137,6 +137,14 @@ void vcpu_load(struct kvm_vcpu *vcpu)
int cpu; int cpu;
mutex_lock(&vcpu->mutex); mutex_lock(&vcpu->mutex);
if (unlikely(vcpu->pid != current->pids[PIDTYPE_PID].pid)) {
/* The thread running this VCPU changed. */
struct pid *oldpid = vcpu->pid;
struct pid *newpid = get_task_pid(current, PIDTYPE_PID);
rcu_assign_pointer(vcpu->pid, newpid);
synchronize_rcu();
put_pid(oldpid);
}
cpu = get_cpu(); cpu = get_cpu();
preempt_notifier_register(&vcpu->preempt_notifier); preempt_notifier_register(&vcpu->preempt_notifier);
kvm_arch_vcpu_load(vcpu, cpu); kvm_arch_vcpu_load(vcpu, cpu);
@ -165,13 +173,16 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
zalloc_cpumask_var(&cpus, GFP_ATOMIC); zalloc_cpumask_var(&cpus, GFP_ATOMIC);
raw_spin_lock(&kvm->requests_lock); me = get_cpu();
me = smp_processor_id();
kvm_for_each_vcpu(i, vcpu, kvm) { kvm_for_each_vcpu(i, vcpu, kvm) {
if (kvm_make_check_request(req, vcpu)) kvm_make_request(req, vcpu);
continue;
cpu = vcpu->cpu; cpu = vcpu->cpu;
if (cpus != NULL && cpu != -1 && cpu != me)
/* Set ->requests bit before we read ->mode */
smp_mb();
if (cpus != NULL && cpu != -1 && cpu != me &&
kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE)
cpumask_set_cpu(cpu, cpus); cpumask_set_cpu(cpu, cpus);
} }
if (unlikely(cpus == NULL)) if (unlikely(cpus == NULL))
@ -180,7 +191,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
smp_call_function_many(cpus, ack_flush, NULL, 1); smp_call_function_many(cpus, ack_flush, NULL, 1);
else else
called = false; called = false;
raw_spin_unlock(&kvm->requests_lock); put_cpu();
free_cpumask_var(cpus); free_cpumask_var(cpus);
return called; return called;
} }
@ -209,6 +220,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
vcpu->cpu = -1; vcpu->cpu = -1;
vcpu->kvm = kvm; vcpu->kvm = kvm;
vcpu->vcpu_id = id; vcpu->vcpu_id = id;
vcpu->pid = NULL;
init_waitqueue_head(&vcpu->wq); init_waitqueue_head(&vcpu->wq);
kvm_async_pf_vcpu_init(vcpu); kvm_async_pf_vcpu_init(vcpu);
@ -233,6 +245,7 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_init);
void kvm_vcpu_uninit(struct kvm_vcpu *vcpu) void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
{ {
put_pid(vcpu->pid);
kvm_arch_vcpu_uninit(vcpu); kvm_arch_vcpu_uninit(vcpu);
free_page((unsigned long)vcpu->run); free_page((unsigned long)vcpu->run);
} }
@ -463,15 +476,14 @@ static struct kvm *kvm_create_vm(void)
kvm->mm = current->mm; kvm->mm = current->mm;
atomic_inc(&kvm->mm->mm_count); atomic_inc(&kvm->mm->mm_count);
spin_lock_init(&kvm->mmu_lock); spin_lock_init(&kvm->mmu_lock);
raw_spin_lock_init(&kvm->requests_lock);
kvm_eventfd_init(kvm); kvm_eventfd_init(kvm);
mutex_init(&kvm->lock); mutex_init(&kvm->lock);
mutex_init(&kvm->irq_lock); mutex_init(&kvm->irq_lock);
mutex_init(&kvm->slots_lock); mutex_init(&kvm->slots_lock);
atomic_set(&kvm->users_count, 1); atomic_set(&kvm->users_count, 1);
spin_lock(&kvm_lock); raw_spin_lock(&kvm_lock);
list_add(&kvm->vm_list, &vm_list); list_add(&kvm->vm_list, &vm_list);
spin_unlock(&kvm_lock); raw_spin_unlock(&kvm_lock);
return kvm; return kvm;
@ -544,9 +556,9 @@ static void kvm_destroy_vm(struct kvm *kvm)
struct mm_struct *mm = kvm->mm; struct mm_struct *mm = kvm->mm;
kvm_arch_sync_events(kvm); kvm_arch_sync_events(kvm);
spin_lock(&kvm_lock); raw_spin_lock(&kvm_lock);
list_del(&kvm->vm_list); list_del(&kvm->vm_list);
spin_unlock(&kvm_lock); raw_spin_unlock(&kvm_lock);
kvm_free_irq_routing(kvm); kvm_free_irq_routing(kvm);
for (i = 0; i < KVM_NR_BUSES; i++) for (i = 0; i < KVM_NR_BUSES; i++)
kvm_io_bus_destroy(kvm->buses[i]); kvm_io_bus_destroy(kvm->buses[i]);
@ -588,6 +600,7 @@ static int kvm_vm_release(struct inode *inode, struct file *filp)
return 0; return 0;
} }
#ifndef CONFIG_S390
/* /*
* Allocation size is twice as large as the actual dirty bitmap size. * Allocation size is twice as large as the actual dirty bitmap size.
* This makes it possible to do double buffering: see x86's * This makes it possible to do double buffering: see x86's
@ -608,6 +621,7 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
memslot->dirty_bitmap_head = memslot->dirty_bitmap; memslot->dirty_bitmap_head = memslot->dirty_bitmap;
return 0; return 0;
} }
#endif /* !CONFIG_S390 */
/* /*
* Allocate some memory and give it an address in the guest physical address * Allocate some memory and give it an address in the guest physical address
@ -621,7 +635,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem, struct kvm_userspace_memory_region *mem,
int user_alloc) int user_alloc)
{ {
int r, flush_shadow = 0; int r;
gfn_t base_gfn; gfn_t base_gfn;
unsigned long npages; unsigned long npages;
unsigned long i; unsigned long i;
@ -741,8 +755,6 @@ skip_lpage:
if (kvm_create_dirty_bitmap(&new) < 0) if (kvm_create_dirty_bitmap(&new) < 0)
goto out_free; goto out_free;
/* destroy any largepage mappings for dirty tracking */ /* destroy any largepage mappings for dirty tracking */
if (old.npages)
flush_shadow = 1;
} }
#else /* not defined CONFIG_S390 */ #else /* not defined CONFIG_S390 */
new.user_alloc = user_alloc; new.user_alloc = user_alloc;
@ -813,9 +825,6 @@ skip_lpage:
kvm_free_physmem_slot(&old, &new); kvm_free_physmem_slot(&old, &new);
kfree(old_memslots); kfree(old_memslots);
if (flush_shadow)
kvm_arch_flush_shadow(kvm);
return 0; return 0;
out_free: out_free:
@ -1029,6 +1038,15 @@ static pfn_t get_fault_pfn(void)
return fault_pfn; return fault_pfn;
} }
/*
 * Probe whether the user page at @addr in current->mm is hardware-poisoned.
 *
 * Issues a one-page __get_user_pages() request with FOLL_HWPOISON set and no
 * pages/vmas arrays, so the only thing consumed is the return code.
 *
 * Returns 1 if __get_user_pages() reported -EHWPOISON, 0 for any other
 * result (including other errors).
 */
static inline int check_user_page_hwpoison(unsigned long addr)
{
	int gup_flags = FOLL_TOUCH | FOLL_HWPOISON | FOLL_WRITE;
	int ret;

	ret = __get_user_pages(current, current->mm, addr, 1,
			       gup_flags, NULL, NULL, NULL);

	return ret == -EHWPOISON;
}
static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic, static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
bool *async, bool write_fault, bool *writable) bool *async, bool write_fault, bool *writable)
{ {
@ -1076,7 +1094,7 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic,
return get_fault_pfn(); return get_fault_pfn();
down_read(&current->mm->mmap_sem); down_read(&current->mm->mmap_sem);
if (is_hwpoison_address(addr)) { if (check_user_page_hwpoison(addr)) {
up_read(&current->mm->mmap_sem); up_read(&current->mm->mmap_sem);
get_page(hwpoison_page); get_page(hwpoison_page);
return page_to_pfn(hwpoison_page); return page_to_pfn(hwpoison_page);
@ -1466,18 +1484,55 @@ void kvm_resched(struct kvm_vcpu *vcpu)
} }
EXPORT_SYMBOL_GPL(kvm_resched); EXPORT_SYMBOL_GPL(kvm_resched);
void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu) void kvm_vcpu_on_spin(struct kvm_vcpu *me)
{ {
ktime_t expires; struct kvm *kvm = me->kvm;
DEFINE_WAIT(wait); struct kvm_vcpu *vcpu;
int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
int yielded = 0;
int pass;
int i;
prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); /*
* We boost the priority of a VCPU that is runnable but not
/* Sleep for 100 us, and hope lock-holder got scheduled */ * currently running, because it got preempted by something
expires = ktime_add_ns(ktime_get(), 100000UL); * else and called schedule in __vcpu_run. Hopefully that
schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); * VCPU is holding the lock that we need and will release it.
* We approximate round-robin by starting at the last boosted VCPU.
finish_wait(&vcpu->wq, &wait); */
for (pass = 0; pass < 2 && !yielded; pass++) {
kvm_for_each_vcpu(i, vcpu, kvm) {
struct task_struct *task = NULL;
struct pid *pid;
if (!pass && i < last_boosted_vcpu) {
i = last_boosted_vcpu;
continue;
} else if (pass && i > last_boosted_vcpu)
break;
if (vcpu == me)
continue;
if (waitqueue_active(&vcpu->wq))
continue;
rcu_read_lock();
pid = rcu_dereference(vcpu->pid);
if (pid)
task = get_pid_task(vcpu->pid, PIDTYPE_PID);
rcu_read_unlock();
if (!task)
continue;
if (task->flags & PF_VCPU) {
put_task_struct(task);
continue;
}
if (yield_to(task, 1)) {
put_task_struct(task);
kvm->last_boosted_vcpu = i;
yielded = 1;
break;
}
put_task_struct(task);
}
}
} }
EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin); EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
@ -2122,9 +2177,9 @@ static void hardware_enable_nolock(void *junk)
static void hardware_enable(void *junk) static void hardware_enable(void *junk)
{ {
spin_lock(&kvm_lock); raw_spin_lock(&kvm_lock);
hardware_enable_nolock(junk); hardware_enable_nolock(junk);
spin_unlock(&kvm_lock); raw_spin_unlock(&kvm_lock);
} }
static void hardware_disable_nolock(void *junk) static void hardware_disable_nolock(void *junk)
@ -2139,9 +2194,9 @@ static void hardware_disable_nolock(void *junk)
static void hardware_disable(void *junk) static void hardware_disable(void *junk)
{ {
spin_lock(&kvm_lock); raw_spin_lock(&kvm_lock);
hardware_disable_nolock(junk); hardware_disable_nolock(junk);
spin_unlock(&kvm_lock); raw_spin_unlock(&kvm_lock);
} }
static void hardware_disable_all_nolock(void) static void hardware_disable_all_nolock(void)
@ -2155,16 +2210,16 @@ static void hardware_disable_all_nolock(void)
static void hardware_disable_all(void) static void hardware_disable_all(void)
{ {
spin_lock(&kvm_lock); raw_spin_lock(&kvm_lock);
hardware_disable_all_nolock(); hardware_disable_all_nolock();
spin_unlock(&kvm_lock); raw_spin_unlock(&kvm_lock);
} }
static int hardware_enable_all(void) static int hardware_enable_all(void)
{ {
int r = 0; int r = 0;
spin_lock(&kvm_lock); raw_spin_lock(&kvm_lock);
kvm_usage_count++; kvm_usage_count++;
if (kvm_usage_count == 1) { if (kvm_usage_count == 1) {
@ -2177,7 +2232,7 @@ static int hardware_enable_all(void)
} }
} }
spin_unlock(&kvm_lock); raw_spin_unlock(&kvm_lock);
return r; return r;
} }
@ -2339,10 +2394,10 @@ static int vm_stat_get(void *_offset, u64 *val)
struct kvm *kvm; struct kvm *kvm;
*val = 0; *val = 0;
spin_lock(&kvm_lock); raw_spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) list_for_each_entry(kvm, &vm_list, vm_list)
*val += *(u32 *)((void *)kvm + offset); *val += *(u32 *)((void *)kvm + offset);
spin_unlock(&kvm_lock); raw_spin_unlock(&kvm_lock);
return 0; return 0;
} }
@ -2356,12 +2411,12 @@ static int vcpu_stat_get(void *_offset, u64 *val)
int i; int i;
*val = 0; *val = 0;
spin_lock(&kvm_lock); raw_spin_lock(&kvm_lock);
list_for_each_entry(kvm, &vm_list, vm_list) list_for_each_entry(kvm, &vm_list, vm_list)
kvm_for_each_vcpu(i, vcpu, kvm) kvm_for_each_vcpu(i, vcpu, kvm)
*val += *(u32 *)((void *)vcpu + offset); *val += *(u32 *)((void *)vcpu + offset);
spin_unlock(&kvm_lock); raw_spin_unlock(&kvm_lock);
return 0; return 0;
} }
@ -2402,7 +2457,7 @@ static int kvm_suspend(struct sys_device *dev, pm_message_t state)
static int kvm_resume(struct sys_device *dev) static int kvm_resume(struct sys_device *dev)
{ {
if (kvm_usage_count) { if (kvm_usage_count) {
WARN_ON(spin_is_locked(&kvm_lock)); WARN_ON(raw_spin_is_locked(&kvm_lock));
hardware_enable_nolock(NULL); hardware_enable_nolock(NULL);
} }
return 0; return 0;