diff --git a/drivers/kvm/kvm_main.c b/drivers/kvm/kvm_main.c
index af2d288c881..07ae280e8fe 100644
--- a/drivers/kvm/kvm_main.c
+++ b/drivers/kvm/kvm_main.c
@@ -198,21 +198,15 @@ static void vcpu_put(struct kvm_vcpu *vcpu)
 
 static void ack_flush(void *_completed)
 {
-	atomic_t *completed = _completed;
-
-	atomic_inc(completed);
 }
 
 void kvm_flush_remote_tlbs(struct kvm *kvm)
 {
-	int i, cpu, needed;
+	int i, cpu;
 	cpumask_t cpus;
 	struct kvm_vcpu *vcpu;
-	atomic_t completed;
 
-	atomic_set(&completed, 0);
 	cpus_clear(cpus);
-	needed = 0;
 	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
 		vcpu = kvm->vcpus[i];
 		if (!vcpu)
@@ -221,23 +215,9 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
 			continue;
 		cpu = vcpu->cpu;
 		if (cpu != -1 && cpu != raw_smp_processor_id())
-			if (!cpu_isset(cpu, cpus)) {
-				cpu_set(cpu, cpus);
-				++needed;
-			}
-	}
-
-	/*
-	 * We really want smp_call_function_mask() here. But that's not
-	 * available, so ipi all cpus in parallel and wait for them
-	 * to complete.
-	 */
-	for (cpu = first_cpu(cpus); cpu != NR_CPUS; cpu = next_cpu(cpu, cpus))
-		smp_call_function_single(cpu, ack_flush, &completed, 1, 0);
-	while (atomic_read(&completed) != needed) {
-		cpu_relax();
-		barrier();
+			cpu_set(cpu, cpus);
 	}
+	smp_call_function_mask(cpus, ack_flush, NULL, 1);
 }
 
 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
@@ -2054,12 +2034,21 @@ again:
 
 	kvm_x86_ops->run(vcpu, kvm_run);
 
-	kvm_guest_exit();
 	vcpu->guest_mode = 0;
 	local_irq_enable();
 
 	++vcpu->stat.exits;
 
+	/*
+	 * We must have an instruction between local_irq_enable() and
+	 * kvm_guest_exit(), so the timer interrupt isn't delayed by
+	 * the interrupt shadow. The stat.exits increment will do nicely.
+	 * But we need to prevent reordering, hence this barrier():
+	 */
+	barrier();
+
+	kvm_guest_exit();
+
 	preempt_enable();
 
 	/*
diff --git a/drivers/kvm/lapic.c b/drivers/kvm/lapic.c
index a190587cf6a..238fcad3cec 100644
--- a/drivers/kvm/lapic.c
+++ b/drivers/kvm/lapic.c
@@ -494,12 +494,19 @@ static void apic_send_ipi(struct kvm_lapic *apic)
 
 static u32 apic_get_tmcct(struct kvm_lapic *apic)
 {
-	u32 counter_passed;
-	ktime_t passed, now = apic->timer.dev.base->get_time();
-	u32 tmcct = apic_get_reg(apic, APIC_TMICT);
+	u64 counter_passed;
+	ktime_t passed, now;
+	u32 tmcct;
 
 	ASSERT(apic != NULL);
 
+	now = apic->timer.dev.base->get_time();
+	tmcct = apic_get_reg(apic, APIC_TMICT);
+
+	/* if initial count is 0, current count should also be 0 */
+	if (tmcct == 0)
+		return 0;
+
 	if (unlikely(ktime_to_ns(now) <=
 		ktime_to_ns(apic->timer.last_update))) {
 		/* Wrap around */
@@ -514,15 +521,24 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
 
 	counter_passed = div64_64(ktime_to_ns(passed),
 				(APIC_BUS_CYCLE_NS * apic->timer.divide_count));
-	tmcct -= counter_passed;
 
-	if (tmcct <= 0) {
-		if (unlikely(!apic_lvtt_period(apic)))
+	if (counter_passed > tmcct) {
+		if (unlikely(!apic_lvtt_period(apic))) {
+			/* one-shot timers stick at 0 until reset */
 			tmcct = 0;
-		else
-			do {
-				tmcct += apic_get_reg(apic, APIC_TMICT);
-			} while (tmcct <= 0);
+		} else {
+			/*
+			 * periodic timers reset to APIC_TMICT when they
+			 * hit 0. The while loop simulates this happening N
+			 * times. (counter_passed %= tmcct) would also work,
+			 * but a plain 64-bit modulo needs do_div() on 32-bit hosts.
+			 */
+			while (counter_passed > tmcct)
+				counter_passed -= tmcct;
+			tmcct -= counter_passed;
+		}
+	} else {
+		tmcct -= counter_passed;
 	}
 
 	return tmcct;
@@ -853,7 +869,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
 		apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
 		apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
 	}
-	apic->timer.divide_count = 0;
+	update_divide_count(apic);
 	atomic_set(&apic->timer.pending, 0);
 	if (vcpu->vcpu_id == 0)
 		vcpu->apic_base |= MSR_IA32_APICBASE_BSP;
diff --git a/drivers/kvm/mmu.c b/drivers/kvm/mmu.c
index 6d84d30f5ed..feb5ac986c5 100644
--- a/drivers/kvm/mmu.c
+++ b/drivers/kvm/mmu.c
@@ -1049,6 +1049,7 @@ int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
 	destroy_kvm_mmu(vcpu);
 	return init_kvm_mmu(vcpu);
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
 
 int kvm_mmu_load(struct kvm_vcpu *vcpu)
 {
@@ -1088,7 +1089,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
 			mmu_page_remove_parent_pte(child, spte);
 		}
 	}
-	*spte = 0;
+	set_shadow_pte(spte, 0);
 	kvm_flush_remote_tlbs(vcpu->kvm);
 }
 
diff --git a/drivers/kvm/vmx.c b/drivers/kvm/vmx.c
index 4f115a8e45e..bb56ae3f89b 100644
--- a/drivers/kvm/vmx.c
+++ b/drivers/kvm/vmx.c
@@ -523,6 +523,8 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
 
 static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 {
+	if (vcpu->rmode.active)
+		rflags |= IOPL_MASK | X86_EFLAGS_VM;
 	vmcs_writel(GUEST_RFLAGS, rflags);
 }
 
@@ -1128,6 +1130,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
 	fix_rmode_seg(VCPU_SREG_GS, &vcpu->rmode.gs);
 	fix_rmode_seg(VCPU_SREG_FS, &vcpu->rmode.fs);
 
+	kvm_mmu_reset_context(vcpu);
 	init_rmode_tss(vcpu->kvm);
 }
 
@@ -1760,10 +1763,8 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		set_bit(irq / BITS_PER_LONG, &vcpu->irq_summary);
 	}
 
-	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */
-		asm ("int $2");
-		return 1;
-	}
+	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
+		return 1; /* already handled by vmx_vcpu_run() */
 
 	if (is_no_device(intr_info)) {
 		vmx_fpu_activate(vcpu);
@@ -2196,6 +2197,7 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u32 intr_info;
 
 	/*
 	 * Loading guest fpu may have cleared host cr0.ts
@@ -2322,6 +2324,12 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 	asm ("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
 	vmx->launched = 1;
+
+	intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+	/* We need to handle NMIs before interrupts are enabled */
+	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) /* nmi */
+		asm("int $2");
 }
 
 static void vmx_inject_page_fault(struct kvm_vcpu *vcpu,
diff --git a/drivers/kvm/x86_emulate.c b/drivers/kvm/x86_emulate.c
index 9737c3b2f48..a6ace302e0c 100644
--- a/drivers/kvm/x86_emulate.c
+++ b/drivers/kvm/x86_emulate.c
@@ -212,7 +212,8 @@ static u16 twobyte_table[256] = {
 	    0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
 	    DstReg | SrcMem16 | ModRM | Mov,
 	/* 0xC0 - 0xCF */
-	0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM,
+	0, 0, 0, 0, 0, 0, 0, 0,
 	/* 0xD0 - 0xDF */
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	/* 0xE0 - 0xEF */
@@ -596,11 +597,10 @@ x86_emulate_memop(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 		case 0xf0:	/* LOCK */
 			lock_prefix = 1;
 			break;
+		case 0xf2:	/* REPNE/REPNZ */
 		case 0xf3:	/* REP/REPE/REPZ */
 			rep_prefix = 1;
 			break;
-		case 0xf2:	/* REPNE/REPNZ */
-			break;
 		default:
 			goto done_prefixes;
 		}
@@ -825,6 +825,14 @@ done_prefixes:
 		if (twobyte && b == 0x01 && modrm_reg == 7)
 			break;
 	srcmem_common:
+		/*
+		 * For instructions with a ModR/M byte, switch to register
+		 * access if Mod = 3.
+		 */
+		if ((d & ModRM) && modrm_mod == 3) {
+			src.type = OP_REG;
+			break;
+		}
 		src.type = OP_MEM;
 		src.ptr = (unsigned long *)cr2;
 		src.val = 0;
@@ -893,6 +901,14 @@ done_prefixes:
 		dst.ptr = (unsigned long *)cr2;
 		dst.bytes = (d & ByteOp) ? 1 : op_bytes;
 		dst.val = 0;
+		/*
+		 * For instructions with a ModR/M byte, switch to register
+		 * access if Mod = 3.
+		 */
+		if ((d & ModRM) && modrm_mod == 3) {
+			dst.type = OP_REG;
+			break;
+		}
 		if (d & BitOp) {
 			unsigned long mask = ~(dst.bytes * 8 - 1);
 
@@ -1083,31 +1099,6 @@ push:
 	case 0xd2 ... 0xd3:	/* Grp2 */
 		src.val = _regs[VCPU_REGS_RCX];
 		goto grp2;
-	case 0xe8: /* call (near) */ {
-		long int rel;
-		switch (op_bytes) {
-		case 2:
-			rel = insn_fetch(s16, 2, _eip);
-			break;
-		case 4:
-			rel = insn_fetch(s32, 4, _eip);
-			break;
-		case 8:
-			rel = insn_fetch(s64, 8, _eip);
-			break;
-		default:
-			DPRINTF("Call: Invalid op_bytes\n");
-			goto cannot_emulate;
-		}
-		src.val = (unsigned long) _eip;
-		JMP_REL(rel);
-		goto push;
-	}
-	case 0xe9: /* jmp rel */
-	case 0xeb: /* jmp rel short */
-		JMP_REL(src.val);
-		no_wb = 1; /* Disable writeback. */
-		break;
 	case 0xf6 ... 0xf7:	/* Grp3 */
 		switch (modrm_reg) {
 		case 0 ... 1:	/* test */
@@ -1350,6 +1341,32 @@ special_insn:
 	case 0xae ... 0xaf:	/* scas */
 		DPRINTF("Urk! I don't handle SCAS.\n");
 		goto cannot_emulate;
+	case 0xe8: /* call (near) */ {
+		long int rel;
+		switch (op_bytes) {
+		case 2:
+			rel = insn_fetch(s16, 2, _eip);
+			break;
+		case 4:
+			rel = insn_fetch(s32, 4, _eip);
+			break;
+		case 8:
+			rel = insn_fetch(s64, 8, _eip);
+			break;
+		default:
+			DPRINTF("Call: Invalid op_bytes\n");
+			goto cannot_emulate;
+		}
+		src.val = (unsigned long) _eip;
+		JMP_REL(rel);
+		goto push;
+	}
+	case 0xe9: /* jmp rel */
+	case 0xeb: /* jmp rel short */
+		JMP_REL(src.val);
+		no_wb = 1; /* Disable writeback. */
+		break;
+
 	}
 	goto writeback;
 
@@ -1501,6 +1518,10 @@ twobyte_insn:
 		dst.bytes = op_bytes;
 		dst.val = (d & ByteOp) ? (s8) src.val : (s16) src.val;
 		break;
+	case 0xc3:		/* movnti */
+		dst.bytes = op_bytes;
+		dst.val = (op_bytes == 4) ? (u32) src.val : (u64) src.val;
+		break;
 	}
 	goto writeback;
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 7581e331b13..2810e562a99 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3375,7 +3375,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 
 	if (p->flags & PF_VCPU) {
 		account_guest_time(p, cputime);
-		p->flags &= ~PF_VCPU;
 		return;
 	}
 
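
For reference, the arithmetic behind the new apic_get_tmcct() branches can be exercised with a stand-alone user-space sketch. This is illustrative only, not part of the patch: the types are simplified, the tmcct_after() helper is hypothetical, and the real code works on ktime_t values via div64_64() and the APIC register accessors.

#include <stdio.h>
#include <stdint.h>

#define APIC_BUS_CYCLE_NS 1ULL	/* pretend one bus cycle per nanosecond */

/*
 * Derive the current-count register (TMCCT) from the initial count (TMICT),
 * the time elapsed since the timer was loaded, and the divide count,
 * mirroring the branches the patch adds to apic_get_tmcct().
 */
static uint32_t tmcct_after(uint32_t tmict, uint64_t elapsed_ns,
			    uint32_t divide_count, int periodic)
{
	uint64_t counter_passed;
	uint32_t tmcct = tmict;

	/* if initial count is 0, current count should also be 0 */
	if (tmcct == 0)
		return 0;

	counter_passed = elapsed_ns / (APIC_BUS_CYCLE_NS * divide_count);

	if (counter_passed > tmcct) {
		if (!periodic)
			return 0;	/* one-shot timers stick at 0 until reset */
		/* periodic timers reload TMICT every time they hit 0 */
		while (counter_passed > tmcct)
			counter_passed -= tmcct;
	}
	return tmcct - (uint32_t)counter_passed;
}

int main(void)
{
	/* one-shot: more cycles have passed than the initial count -> 0 */
	printf("%u\n", tmcct_after(1000, 2500, 1, 0));
	/* periodic: 2500 cycles into a 1000-cycle period -> 500 left */
	printf("%u\n", tmcct_after(1000, 2500, 1, 1));
	return 0;
}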