diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index af2d288c881..07ae280e8fe 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -198,21 +198,15 @@ static void vcpu_put(struct kvm_vcpu *vcpu)
 
 static void ack_flush(void *_completed)
 {
-	atomic_t *completed = _completed;
-
-	atomic_inc(completed);
 }
 
 void kvm_flush_remote_tlbs(struct kvm *kvm)
 {
-	int i, cpu, needed;
+	int i, cpu;
 	cpumask_t cpus;
 	struct kvm_vcpu *vcpu;
-	atomic_t completed;
 
-	atomic_set(&completed, 0);
 	cpus_clear(cpus);
-	needed = 0;
 	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
 		vcpu = kvm->vcpus[i];
 		if (!vcpu)
@@ -221,23 +215,9 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
 			continue;
 		cpu = vcpu->cpu;
 		if (cpu != -1 && cpu != raw_smp_processor_id())
-			if (!cpu_isset(cpu, cpus)) {
-				cpu_set(cpu, cpus);
-				++needed;
-			}
-	}
-
-	/*
-	 * We really want smp_call_function_mask() here. But that's not
-	 * available, so ipi all cpus in parallel and wait for them
-	 * to complete.
-	 */
-	for (cpu = first_cpu(cpus); cpu != NR_CPUS; cpu = next_cpu(cpu, cpus))
-		smp_call_function_single(cpu, ack_flush, &completed, 1, 0);
-	while (atomic_read(&completed) != needed) {
-		cpu_relax();
-		barrier();
+			cpu_set(cpu, cpus);
 	}
+	smp_call_function_mask(cpus, ack_flush, NULL, 1);
 }
 
 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
@@ -2054,12 +2034,21 @@ again:
 
 	kvm_x86_ops->run(vcpu, kvm_run);
 
-	kvm_guest_exit();
 	vcpu->guest_mode = 0;
 	local_irq_enable();
 
 	++vcpu->stat.exits;
 
+	/*
+	 * We must have an instruction between local_irq_enable() and
+	 * kvm_guest_exit(), so the timer interrupt isn't delayed by
+	 * the interrupt shadow. The stat.exits increment will do nicely.
+	 * But we need to prevent reordering, hence this barrier():
+	 */
+	barrier();
+
+	kvm_guest_exit();
+
 	preempt_enable();
 
 	/*
diff --git a/drivers/kvm/lapic.c b/drivers/kvm/lapic.c
index a190587cf6a..238fcad3cec 100644
--- a/drivers/kvm/lapic.c
+++ b/drivers/kvm/lapic.c
@@ -494,12 +494,19 @@ static void apic_send_ipi(struct kvm_lapic *apic)
 
 static u32 apic_get_tmcct(struct kvm_lapic *apic)
 {
-	u32 counter_passed;
-	ktime_t passed, now = apic->timer.dev.base->get_time();
-	u32 tmcct = apic_get_reg(apic, APIC_TMICT);
+	u64 counter_passed;
+	ktime_t passed, now;
+	u32 tmcct;
 
 	ASSERT(apic != NULL);
 
+	now = apic->timer.dev.base->get_time();
+	tmcct = apic_get_reg(apic, APIC_TMICT);
+
+	/* if initial count is 0, current count should also be 0 */
+	if (tmcct == 0)
+		return 0;
+
 	if (unlikely(ktime_to_ns(now) <=
 		ktime_to_ns(apic->timer.last_update))) {
 		/* Wrap around */
@@ -514,15 +521,24 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
 
 	counter_passed = div64_64(ktime_to_ns(passed),
 				(APIC_BUS_CYCLE_NS * apic->timer.divide_count));
-	tmcct -= counter_passed;
 
-	if (tmcct <= 0) {
-		if (unlikely(!apic_lvtt_period(apic)))
+	if (counter_passed > tmcct) {
+		if (unlikely(!apic_lvtt_period(apic))) {
+			/* one-shot timers stick at 0 until reset */
 			tmcct = 0;
-		else
-			do {
-				tmcct += apic_get_reg(apic, APIC_TMICT);
-			} while (tmcct <= 0);
+		} else {
+			/*
+			 * periodic timers reset to APIC_TMICT when they
+			 * hit 0. The while loop simulates this happening N
+			 * times. (counter_passed %= tmcct) would also work,
+			 * but a plain 64-bit modulo needs do_div() on 32-bit hosts.
+			 */
+			while (counter_passed > tmcct)
+				counter_passed -= tmcct;
+			tmcct -= counter_passed;
+		}
+	} else {
+		tmcct -= counter_passed;
 	}
 
 	return tmcct;
@@ -853,7 +869,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
 		apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
 		apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
 	}
-	apic->timer.divide_count = 0;
+	update_divide_count(apic);
 	atomic_set(&apic->timer.pending, 0);
 	if (vcpu->vcpu_id == 0)
 		vcpu->apic_base |= MSR_IA32_APICBASE_BSP;
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c
index 6d84d30f5ed..feb5ac986c5 100644
--- a/drivers/kvm/mmu.c
+++ b/drivers/kvm/mmu.c
@@ -1049,6 +1049,7 @@ int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
 	destroy_kvm_mmu(vcpu);
 	return init_kvm_mmu(vcpu);
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
 
 int kvm_mmu_load(struct kvm_vcpu *vcpu)
 {
@@ -1088,7 +1089,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
 			mmu_page_remove_parent_pte(child, spte);
 		}
 	}
-	*spte = 0;
+	set_shadow_pte(spte, 0);
 	kvm_flush_remote_tlbs(vcpu->kvm);
 }
 
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index 4f115a8e45e..bb56ae3f89b 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -523,6 +523,8 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
 
 static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 {
+	if (vcpu->rmode.active)
+		rflags |= IOPL_MASK | X86_EFLAGS_VM;
 	vmcs_writel(GUEST_RFLAGS, rflags);
 }
 
@@ -1128,6 +1130,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
 	fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs);
 	fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs);
 
+	kvm_mmu_reset_context(vcpu);
 	init_rmode_tss(vcpu->kvm);
 }
 
@@ -1760,10 +1763,8 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
 	}
 
-	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */
-		asm ("int $2");
-		return 1;
-	}
+	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
+		return 1; /* already handled by vmx_vcpu_run() */
 
 	if (is_no_device(intr_info)) {
 		vmx_fpu_activate(vcpu);
@@ -2196,6 +2197,7 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u32 intr_info;
 
 	/*
 	 * Loading guest fpu may have cleared host cr0.ts
@@ -2322,6 +2324,12 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 	asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
 	vmx->launched = 1;
+
+	intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+	/* We need to handle NMIs before interrupts are enabled */
+	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
+		asm("int $2");
 }
 
 static void vmx_inject_page_fault(struct kvm_vcpu *vcpu,
diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c
index 9737c3b2f48..a6ace302e0c 100644
--- a/drivers/kvm/x86_emulate.c
+++ b/drivers/kvm/x86_emulate.c
@@ -212,7 +212,8 @@ static u16 twobyte_table[256] = {
 	    0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
 	    DstReg | SrcMem16 | ModRM | Mov,
 	/* 0xC0 - 0xCF */
-	0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM,
+	0, 0, 0, 0, 0, 0, 0, 0,
 	/* 0xD0 - 0xDF */
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	/* 0xE0 - 0xEF */
@@ -596,11 +597,10 @@ x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 		case 0xf0:	/* LOCK */
 			lock_prefix = 1;
 			break;
+		case 0xf2:	/* REPNE/REPNZ */
 		case 0xf3:	/* REP/REPE/REPZ */
 			rep_prefix = 1;
 			break;
-		case 0xf2:	/* REPNE/REPNZ */
-			break;
 		default:
 			goto done_prefixes;
 		}
@@ -825,6 +825,14 @@ done_prefixes:
 		if (twobyte && b == 0x01 && modrm_reg == 7)
 			break;
 	srcmem_common:
+		/*
+		 * For instructions with a ModR/M byte, switch to register
+		 * access if Mod = 3.
+		 */
+		if ((d & ModRM) && modrm_mod == 3) {
+			src.type = OP_REG;
+			break;
+		}
 		src.type = OP_MEM;
 		src.ptr = (unsigned long *)cr2;
 		src.val = 0;
@@ -893,6 +901,14 @@ done_prefixes:
 		dst.ptr = (unsigned long *)cr2;
 		dst.bytes = (d & ByteOp) ? 1 : op_bytes;
 		dst.val = 0;
+		/*
+		 * For instructions with a ModR/M byte, switch to register
+		 * access if Mod = 3.
+		 */
+		if ((d & ModRM) && modrm_mod == 3) {
+			dst.type = OP_REG;
+			break;
+		}
 		if (d & BitOp) {
 			unsigned long mask = ~(dst.bytes * 8 - 1);
 
@@ -1083,31 +1099,6 @@ push:
 	case 0xd2 ... 0xd3:	/* Grp2 */
 		src.val = _regs[VCPU_REGS_RCX];
 		goto grp2;
-	case 0xe8: /* call (near) */ {
-		long int rel;
-		switch (op_bytes) {
-		case 2:
-			rel = insn_fetch(s16, 2, _eip);
-			break;
-		case 4:
-			rel = insn_fetch(s32, 4, _eip);
-			break;
-		case 8:
-			rel = insn_fetch(s64, 8, _eip);
-			break;
-		default:
-			DPRINTF("Call: Invalid op_bytes\n");
-			goto cannot_emulate;
-		}
-		src.val = (unsigned long) _eip;
-		JMP_REL(rel);
-		goto push;
-	}
-	case 0xe9: /* jmp rel */
-	case 0xeb: /* jmp rel short */
-		JMP_REL(src.val);
-		no_wb = 1; /* Disable writeback. */
-		break;
 	case 0xf6 ... 0xf7:	/* Grp3 */
 		switch (modrm_reg) {
 		case 0 ... 1:	/* test */
@@ -1350,6 +1341,32 @@ special_insn:
 	case 0xae ... 0xaf:	/* scas */
 		DPRINTF("Urk! I don't handle SCAS.\n");
 		goto cannot_emulate;
+	case 0xe8: /* call (near) */ {
+		long int rel;
+		switch (op_bytes) {
+		case 2:
+			rel = insn_fetch(s16, 2, _eip);
+			break;
+		case 4:
+			rel = insn_fetch(s32, 4, _eip);
+			break;
+		case 8:
+			rel = insn_fetch(s64, 8, _eip);
+			break;
+		default:
+			DPRINTF("Call: Invalid op_bytes\n");
+			goto cannot_emulate;
+		}
+		src.val = (unsigned long) _eip;
+		JMP_REL(rel);
+		goto push;
+	}
+	case 0xe9: /* jmp rel */
+	case 0xeb: /* jmp rel short */
+		JMP_REL(src.val);
+		no_wb = 1; /* Disable writeback. */
+		break;
+
 	}
 	goto writeback;
 
@@ -1501,6 +1518,10 @@ twobyte_insn:
 		dst.bytes = op_bytes;
 		dst.val = (d & ByteOp) ? (s8) src.val : (s16) src.val;
 		break;
+	case 0xc3:		/* movnti */
+		dst.bytes = op_bytes;
+		dst.val = (op_bytes == 4) ? (u32) src.val : (u64) src.val;
+		break;
 	}
 	goto writeback;
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 7581e331b13..2810e562a99 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3375,7 +3375,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 
 	if (p->flags & PF_VCPU) {
 		account_guest_time(p, cputime);
-		p->flags &= ~PF_VCPU;
 		return;
 	}
 
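
For reference, the arithmetic behind the new apic_get_tmcct() branches can be exercised with a stand-alone user-space sketch. This is illustrative only, not part of the patch: the types are simplified, the tmcct_after() helper is hypothetical, and the real code works on ktime_t values via div64_64() and the APIC register accessors.

#include <stdio.h>
#include <stdint.h>

#define APIC_BUS_CYCLE_NS 1ULL	/* pretend one bus cycle per nanosecond */

/*
 * Derive the current-count register (TMCCT) from the initial count (TMICT),
 * the time elapsed since the timer was loaded, and the divide count,
 * mirroring the branches the patch adds to apic_get_tmcct().
 */
static uint32_t tmcct_after(uint32_t tmict, uint64_t elapsed_ns,
			    uint32_t divide_count, int periodic)
{
	uint64_t counter_passed;
	uint32_t tmcct = tmict;

	/* if initial count is 0, current count should also be 0 */
	if (tmcct == 0)
		return 0;

	counter_passed = elapsed_ns / (APIC_BUS_CYCLE_NS * divide_count);

	if (counter_passed > tmcct) {
		if (!periodic)
			return 0;	/* one-shot timers stick at 0 until reset */
		/* periodic timers reload TMICT every time they hit 0 */
		while (counter_passed > tmcct)
			counter_passed -= tmcct;
	}
	return tmcct - (uint32_t)counter_passed;
}

int main(void)
{
	/* one-shot: more cycles have passed than the initial count -> 0 */
	printf("%u\n", tmcct_after(1000, 2500, 1, 0));
	/* periodic: 2500 cycles into a 1000-cycle period -> 500 left */
	printf("%u\n", tmcct_after(1000, 2500, 1, 1));
	return 0;
}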