diff --git a/.gitignore b/.gitignore index c72955aad..81b151029 100644 --- a/.gitignore +++ b/.gitignore @@ -77,6 +77,7 @@ pc-bios/vgabios-pq/status pc-bios/optionrom/linuxboot.bin pc-bios/optionrom/multiboot.bin pc-bios/optionrom/multiboot.raw +pc-bios/optionrom/kvmvapic.bin .stgit-* cscope.* tags diff --git a/Makefile b/Makefile index d49b84e0e..49c775b82 100644 --- a/Makefile +++ b/Makefile @@ -256,7 +256,7 @@ pxe-e1000.rom pxe-eepro100.rom pxe-ne2k_pci.rom \ pxe-pcnet.rom pxe-rtl8139.rom pxe-virtio.rom \ bamboo.dtb petalogix-s3adsp1800.dtb petalogix-ml605.dtb \ mpc8544ds.dtb \ -multiboot.bin linuxboot.bin \ +multiboot.bin linuxboot.bin kvmvapic.bin \ s390-zipl.rom \ spapr-rtas.bin slof.bin \ palcode-clipper diff --git a/Makefile.target b/Makefile.target index 68a564118..692c9d78a 100644 --- a/Makefile.target +++ b/Makefile.target @@ -237,7 +237,8 @@ obj-y += device-hotplug.o # Hardware support obj-i386-y += mc146818rtc.o pc.o -obj-i386-y += sga.o apic_common.o apic.o ioapic_common.o ioapic.o piix_pci.o +obj-i386-y += apic_common.o apic.o kvmvapic.o +obj-i386-y += sga.o ioapic_common.o ioapic.o piix_pci.o obj-i386-y += vmport.o obj-i386-y += pci-hotplug.o smbios.o wdt_ib700.o obj-i386-y += debugcon.o multiboot.o diff --git a/cpu-all.h b/cpu-all.h index e2c3c4928..80e6d4234 100644 --- a/cpu-all.h +++ b/cpu-all.h @@ -375,8 +375,9 @@ DECLARE_TLS(CPUState *,cpu_single_env); #define CPU_INTERRUPT_TGT_INT_0 0x0100 #define CPU_INTERRUPT_TGT_INT_1 0x0400 #define CPU_INTERRUPT_TGT_INT_2 0x0800 +#define CPU_INTERRUPT_TGT_INT_3 0x2000 -/* First unused bit: 0x2000. */ +/* First unused bit: 0x4000. */ /* The set of all bits that should be masked when single-stepping. 
*/ #define CPU_INTERRUPT_SSTEP_MASK \ diff --git a/cpus.c b/cpus.c index f45a438b2..17b055fba 100644 --- a/cpus.c +++ b/cpus.c @@ -714,6 +714,7 @@ static void *qemu_kvm_cpu_thread_fn(void *arg) qemu_mutex_lock(&qemu_global_mutex); qemu_thread_get_self(env->thread); env->thread_id = qemu_get_thread_id(); + cpu_single_env = env; r = kvm_init_vcpu(env); if (r < 0) { @@ -760,6 +761,11 @@ static void *qemu_tcg_cpu_thread_fn(void *arg) /* wait for initial kick-off after machine start */ while (first_cpu->stopped) { qemu_cond_wait(tcg_halt_cond, &qemu_global_mutex); + + /* process any pending work */ + for (env = first_cpu; env != NULL; env = env->next_cpu) { + qemu_wait_io_event_common(env); + } } while (1) { @@ -852,7 +858,7 @@ static int all_vcpus_paused(void) if (!penv->stopped) { return 0; } - penv = (CPUState *)penv->next_cpu; + penv = penv->next_cpu; } return 1; @@ -866,7 +872,21 @@ void pause_all_vcpus(void) while (penv) { penv->stop = 1; qemu_cpu_kick(penv); - penv = (CPUState *)penv->next_cpu; + penv = penv->next_cpu; + } + + if (!qemu_thread_is_self(&io_thread)) { + cpu_stop_current(); + if (!kvm_enabled()) { + /* The kick loop above left penv == NULL; restart at the head. */ + penv = first_cpu; + while (penv) { + penv->stop = 0; + penv->stopped = 1; + penv = penv->next_cpu; + } + return; + } } while (!all_vcpus_paused()) { @@ -874,7 +894,7 @@ void pause_all_vcpus(void) penv = first_cpu; while (penv) { qemu_cpu_kick(penv); - penv = (CPUState *)penv->next_cpu; + penv = penv->next_cpu; } } } @@ -888,7 +908,7 @@ void resume_all_vcpus(void) penv->stop = 0; penv->stopped = 0; qemu_cpu_kick(penv); - penv = (CPUState *)penv->next_cpu; + penv = penv->next_cpu; } } diff --git a/hw/apic.c b/hw/apic.c index ff9d24e91..4eeaf8801 100644 --- a/hw/apic.c +++ b/hw/apic.c @@ -35,6 +35,10 @@ #define MSI_ADDR_DEST_ID_SHIFT 12 #define MSI_ADDR_DEST_ID_MASK 0x00ffff0 +#define SYNC_FROM_VAPIC 0x1 +#define SYNC_TO_VAPIC 0x2 +#define SYNC_ISR_IRR_TO_VAPIC 0x4 + static APICCommonState *local_apics[MAX_APICS + 1]; static void apic_set_irq(APICCommonState *s, int 
vector_num, int trigger_mode); @@ -78,6 +82,70 @@ static inline int get_bit(uint32_t *tab, int index) return !!(tab[i] & mask); } +/* return -1 if no bit is set */ +static int get_highest_priority_int(uint32_t *tab) +{ + int i; + for (i = 7; i >= 0; i--) { + if (tab[i] != 0) { + return i * 32 + fls_bit(tab[i]); + } + } + return -1; +} + +static void apic_sync_vapic(APICCommonState *s, int sync_type) +{ + VAPICState vapic_state; + size_t length; + off_t start; + int vector; + + if (!s->vapic_paddr) { + return; + } + if (sync_type & SYNC_FROM_VAPIC) { + cpu_physical_memory_rw(s->vapic_paddr, (void *)&vapic_state, + sizeof(vapic_state), 0); + s->tpr = vapic_state.tpr; + } + if (sync_type & (SYNC_TO_VAPIC | SYNC_ISR_IRR_TO_VAPIC)) { + start = offsetof(VAPICState, isr); + length = offsetof(VAPICState, enabled) - offsetof(VAPICState, isr); + + if (sync_type & SYNC_TO_VAPIC) { + assert(qemu_cpu_is_self(s->cpu_env)); + + vapic_state.tpr = s->tpr; + vapic_state.enabled = 1; + start = 0; + length = sizeof(VAPICState); + } + + vector = get_highest_priority_int(s->isr); + if (vector < 0) { + vector = 0; + } + vapic_state.isr = vector & 0xf0; + + vapic_state.zero = 0; + + vector = get_highest_priority_int(s->irr); + if (vector < 0) { + vector = 0; + } + vapic_state.irr = vector & 0xff; + + cpu_physical_memory_write_rom(s->vapic_paddr + start, + ((void *)&vapic_state) + start, length); + } +} + +static void apic_vapic_base_update(APICCommonState *s) +{ + apic_sync_vapic(s, SYNC_TO_VAPIC); +} + static void apic_local_deliver(APICCommonState *s, int vector) { uint32_t lvt = s->lvt[vector]; @@ -239,20 +307,17 @@ static void apic_set_base(APICCommonState *s, uint64_t val) static void apic_set_tpr(APICCommonState *s, uint8_t val) { - s->tpr = (val & 0x0f) << 4; - apic_update_irq(s); + /* Updates from cr8 are ignored while the VAPIC is active */ + if (!s->vapic_paddr) { + s->tpr = val << 4; + apic_update_irq(s); + } } -/* return -1 if no bit is set */ -static int 
get_highest_priority_int(uint32_t *tab) +static uint8_t apic_get_tpr(APICCommonState *s) { - int i; - for(i = 7; i >= 0; i--) { - if (tab[i] != 0) { - return i * 32 + fls_bit(tab[i]); - } - } - return -1; + apic_sync_vapic(s, SYNC_FROM_VAPIC); + return s->tpr >> 4; } static int apic_get_ppr(APICCommonState *s) @@ -312,6 +377,14 @@ static void apic_update_irq(APICCommonState *s) } } +void apic_poll_irq(DeviceState *d) +{ + APICCommonState *s = APIC_COMMON(d); + + apic_sync_vapic(s, SYNC_FROM_VAPIC); + apic_update_irq(s); +} + static void apic_set_irq(APICCommonState *s, int vector_num, int trigger_mode) { apic_report_irq_delivered(!get_bit(s->irr, vector_num)); @@ -321,6 +394,16 @@ static void apic_set_irq(APICCommonState *s, int vector_num, int trigger_mode) set_bit(s->tmr, vector_num); else reset_bit(s->tmr, vector_num); + if (s->vapic_paddr) { + apic_sync_vapic(s, SYNC_ISR_IRR_TO_VAPIC); + /* + * The vcpu thread needs to see the new IRR before we pull its current + * TPR value. That way, if we miss a lowering of the TPR, the guest + * has the chance to notice the new IRR and poll for IRQs on its own. 
+ */ + smp_wmb(); + apic_sync_vapic(s, SYNC_FROM_VAPIC); + } apic_update_irq(s); } @@ -334,6 +417,7 @@ static void apic_eoi(APICCommonState *s) if (!(s->spurious_vec & APIC_SV_DIRECTED_IO) && get_bit(s->tmr, isrv)) { ioapic_eoi_broadcast(isrv); } + apic_sync_vapic(s, SYNC_FROM_VAPIC | SYNC_TO_VAPIC); apic_update_irq(s); } @@ -471,15 +555,19 @@ int apic_get_interrupt(DeviceState *d) if (!(s->spurious_vec & APIC_SV_ENABLE)) return -1; + apic_sync_vapic(s, SYNC_FROM_VAPIC); intno = apic_irq_pending(s); if (intno == 0) { + apic_sync_vapic(s, SYNC_TO_VAPIC); return -1; } else if (intno < 0) { + apic_sync_vapic(s, SYNC_TO_VAPIC); return s->spurious_vec & 0xff; } reset_bit(s->irr, intno); set_bit(s->isr, intno); + apic_sync_vapic(s, SYNC_TO_VAPIC); apic_update_irq(s); return intno; } @@ -576,6 +664,10 @@ static uint32_t apic_mem_readl(void *opaque, target_phys_addr_t addr) val = 0x11 | ((APIC_LVT_NB - 1) << 16); /* version 0x11 */ break; case 0x08: + apic_sync_vapic(s, SYNC_FROM_VAPIC); + if (apic_report_tpr_access) { + cpu_report_tpr_access(s->cpu_env, TPR_ACCESS_READ); + } val = s->tpr; break; case 0x09: @@ -675,7 +767,11 @@ static void apic_mem_writel(void *opaque, target_phys_addr_t addr, uint32_t val) case 0x03: break; case 0x08: + if (apic_report_tpr_access) { + cpu_report_tpr_access(s->cpu_env, TPR_ACCESS_WRITE); + } s->tpr = val; + apic_sync_vapic(s, SYNC_TO_VAPIC); apic_update_irq(s); break; case 0x09: @@ -737,6 +833,11 @@ static void apic_mem_writel(void *opaque, target_phys_addr_t addr, uint32_t val) } } +static void apic_pre_save(APICCommonState *s) +{ + apic_sync_vapic(s, SYNC_FROM_VAPIC); +} + static void apic_post_load(APICCommonState *s) { if (s->timer_expiry != -1) { @@ -770,7 +871,10 @@ static void apic_class_init(ObjectClass *klass, void *data) k->init = apic_init; k->set_base = apic_set_base; k->set_tpr = apic_set_tpr; + k->get_tpr = apic_get_tpr; + k->vapic_base_update = apic_vapic_base_update; k->external_nmi = apic_external_nmi; + k->pre_save = 
apic_pre_save; k->post_load = apic_post_load; } diff --git a/hw/apic.h b/hw/apic.h index a62d83ba9..d6d6d440e 100644 --- a/hw/apic.h +++ b/hw/apic.h @@ -18,6 +18,8 @@ void cpu_set_apic_tpr(DeviceState *s, uint8_t val); uint8_t cpu_get_apic_tpr(DeviceState *s); void apic_init_reset(DeviceState *s); void apic_sipi(DeviceState *s); +void apic_handle_tpr_access_report(DeviceState *d, target_ulong ip, + TPRAccess access); /* pc.c */ int cpu_is_bsp(CPUState *env); diff --git a/hw/apic_common.c b/hw/apic_common.c index c91f7d539..60b82596e 100644 --- a/hw/apic_common.c +++ b/hw/apic_common.c @@ -20,8 +20,10 @@ #include "apic.h" #include "apic_internal.h" #include "trace.h" +#include "kvm.h" static int apic_irq_delivered; +bool apic_report_tpr_access; void cpu_set_apic_base(DeviceState *d, uint64_t val) { @@ -62,10 +64,46 @@ void cpu_set_apic_tpr(DeviceState *d, uint8_t val) } uint8_t cpu_get_apic_tpr(DeviceState *d) +{ + APICCommonState *s; + APICCommonClass *info; + + if (!d) { + return 0; + } + + s = APIC_COMMON(d); + info = APIC_COMMON_GET_CLASS(s); + + return info->get_tpr(s); +} + +void apic_enable_tpr_access_reporting(DeviceState *d, bool enable) +{ + APICCommonState *s = DO_UPCAST(APICCommonState, busdev.qdev, d); + APICCommonClass *info = APIC_COMMON_GET_CLASS(s); + + apic_report_tpr_access = enable; + if (info->enable_tpr_reporting) { + info->enable_tpr_reporting(s, enable); + } +} + +void apic_enable_vapic(DeviceState *d, target_phys_addr_t paddr) +{ + APICCommonState *s = DO_UPCAST(APICCommonState, busdev.qdev, d); + APICCommonClass *info = APIC_COMMON_GET_CLASS(s); + + s->vapic_paddr = paddr; + info->vapic_base_update(s); +} + +void apic_handle_tpr_access_report(DeviceState *d, target_ulong ip, + TPRAccess access) { APICCommonState *s = DO_UPCAST(APICCommonState, busdev.qdev, d); - return s ? 
s->tpr >> 4 : 0; + vapic_report_tpr_access(s->vapic, s->cpu_env, ip, access); } void apic_report_irq_delivered(int delivered) @@ -166,12 +204,16 @@ void apic_init_reset(DeviceState *d) static void apic_reset_common(DeviceState *d) { APICCommonState *s = DO_UPCAST(APICCommonState, busdev.qdev, d); + APICCommonClass *info = APIC_COMMON_GET_CLASS(s); bool bsp; bsp = cpu_is_bsp(s->cpu_env); s->apicbase = 0xfee00000 | (bsp ? MSR_IA32_APICBASE_BSP : 0) | MSR_IA32_APICBASE_ENABLE; + s->vapic_paddr = 0; + info->vapic_base_update(s); + apic_init_reset(d); if (bsp) { @@ -234,6 +276,7 @@ static int apic_init_common(SysBusDevice *dev) { APICCommonState *s = APIC_COMMON(dev); APICCommonClass *info; + static DeviceState *vapic; static int apic_no; if (apic_no >= MAX_APICS) { @@ -244,10 +287,29 @@ static int apic_init_common(SysBusDevice *dev) info = APIC_COMMON_GET_CLASS(s); info->init(s); - sysbus_init_mmio(&s->busdev, &s->io_memory); + sysbus_init_mmio(dev, &s->io_memory); + + if (!vapic && s->vapic_control & VAPIC_ENABLE_MASK) { + vapic = sysbus_create_simple("kvmvapic", -1, NULL); + } + s->vapic = vapic; + if (apic_report_tpr_access && info->enable_tpr_reporting) { + info->enable_tpr_reporting(s, true); + } + return 0; } +static void apic_dispatch_pre_save(void *opaque) +{ + APICCommonState *s = APIC_COMMON(opaque); + APICCommonClass *info = APIC_COMMON_GET_CLASS(s); + + if (info->pre_save) { + info->pre_save(s); + } +} + static int apic_dispatch_post_load(void *opaque, int version_id) { APICCommonState *s = APIC_COMMON(opaque); @@ -265,6 +327,7 @@ static const VMStateDescription vmstate_apic_common = { .minimum_version_id = 3, .minimum_version_id_old = 1, .load_state_old = apic_load_old, + .pre_save = apic_dispatch_pre_save, .post_load = apic_dispatch_post_load, .fields = (VMStateField[]) { VMSTATE_UINT32(apicbase, APICCommonState), @@ -294,6 +357,8 @@ static const VMStateDescription vmstate_apic_common = { static Property apic_properties_common[] = { 
DEFINE_PROP_UINT8("id", APICCommonState, id, -1), DEFINE_PROP_PTR("cpu_env", APICCommonState, cpu_env), + DEFINE_PROP_BIT("vapic", APICCommonState, vapic_control, VAPIC_ENABLE_BIT, + true), DEFINE_PROP_END_OF_LIST(), }; diff --git a/hw/apic_internal.h b/hw/apic_internal.h index 0cab01071..60a6a8bda 100644 --- a/hw/apic_internal.h +++ b/hw/apic_internal.h @@ -61,6 +61,9 @@ #define APIC_SV_DIRECTED_IO (1<<12) #define APIC_SV_ENABLE (1<<8) +#define VAPIC_ENABLE_BIT 0 +#define VAPIC_ENABLE_MASK (1 << VAPIC_ENABLE_BIT) + #define MAX_APICS 255 #define MSI_SPACE_SIZE 0x100000 @@ -82,7 +85,11 @@ typedef struct APICCommonClass void (*init)(APICCommonState *s); void (*set_base)(APICCommonState *s, uint64_t val); void (*set_tpr)(APICCommonState *s, uint8_t val); + uint8_t (*get_tpr)(APICCommonState *s); + void (*enable_tpr_reporting)(APICCommonState *s, bool enable); + void (*vapic_base_update)(APICCommonState *s); void (*external_nmi)(APICCommonState *s); + void (*pre_save)(APICCommonState *s); void (*post_load)(APICCommonState *s); } APICCommonClass; @@ -114,9 +121,29 @@ struct APICCommonState { int64_t timer_expiry; int sipi_vector; int wait_for_sipi; + + uint32_t vapic_control; + DeviceState *vapic; + target_phys_addr_t vapic_paddr; /* note: persistence via kvmvapic */ }; +typedef struct VAPICState { + uint8_t tpr; + uint8_t isr; + uint8_t zero; + uint8_t irr; + uint8_t enabled; +} QEMU_PACKED VAPICState; + +extern bool apic_report_tpr_access; + void apic_report_irq_delivered(int delivered); bool apic_next_timer(APICCommonState *s, int64_t current_time); +void apic_enable_tpr_access_reporting(DeviceState *d, bool enable); +void apic_enable_vapic(DeviceState *d, target_phys_addr_t paddr); +void apic_poll_irq(DeviceState *d); + +void vapic_report_tpr_access(DeviceState *dev, void *cpu, target_ulong ip, + TPRAccess access); #endif /* !QEMU_APIC_INTERNAL_H */ diff --git a/hw/kvm/apic.c b/hw/kvm/apic.c index 5bb0a4b9f..9ca68f81a 100644 --- a/hw/kvm/apic.c +++ b/hw/kvm/apic.c 
@@ -92,6 +92,35 @@ static void kvm_apic_set_tpr(APICCommonState *s, uint8_t val) s->tpr = (val & 0x0f) << 4; } +static uint8_t kvm_apic_get_tpr(APICCommonState *s) +{ + return s->tpr >> 4; +} + +static void kvm_apic_enable_tpr_reporting(APICCommonState *s, bool enable) +{ + struct kvm_tpr_access_ctl ctl = { + .enabled = enable + }; + + kvm_vcpu_ioctl(s->cpu_env, KVM_TPR_ACCESS_REPORTING, &ctl); +} + +static void kvm_apic_vapic_base_update(APICCommonState *s) +{ + struct kvm_vapic_addr vapid_addr = { + .vapic_addr = s->vapic_paddr, + }; + int ret; + + ret = kvm_vcpu_ioctl(s->cpu_env, KVM_SET_VAPIC_ADDR, &vapid_addr); + if (ret < 0) { + fprintf(stderr, "KVM: setting VAPIC address failed (%s)\n", + strerror(-ret)); + abort(); + } +} + static void do_inject_external_nmi(void *data) { APICCommonState *s = data; @@ -129,6 +158,9 @@ static void kvm_apic_class_init(ObjectClass *klass, void *data) k->init = kvm_apic_init; k->set_base = kvm_apic_set_base; k->set_tpr = kvm_apic_set_tpr; + k->get_tpr = kvm_apic_get_tpr; + k->enable_tpr_reporting = kvm_apic_enable_tpr_reporting; + k->vapic_base_update = kvm_apic_vapic_base_update; k->external_nmi = kvm_apic_external_nmi; } diff --git a/hw/kvmvapic.c b/hw/kvmvapic.c new file mode 100644 index 000000000..36ccfbcdb --- /dev/null +++ b/hw/kvmvapic.c @@ -0,0 +1,805 @@ +/* + * TPR optimization for 32-bit Windows guests (XP and Server 2003) + * + * Copyright (C) 2007-2008 Qumranet Technologies + * Copyright (C) 2012 Jan Kiszka, Siemens AG + * + * This work is licensed under the terms of the GNU GPL version 2, or + * (at your option) any later version. See the COPYING file in the + * top-level directory. 
+ */ +#include "sysemu.h" +#include "cpus.h" +#include "kvm.h" +#include "apic_internal.h" + +#define APIC_DEFAULT_ADDRESS 0xfee00000 + +#define VAPIC_IO_PORT 0x7e + +#define VAPIC_CPU_SHIFT 7 + +#define ROM_BLOCK_SIZE 512 +#define ROM_BLOCK_MASK (~(ROM_BLOCK_SIZE - 1)) + +typedef enum VAPICMode { + VAPIC_INACTIVE = 0, + VAPIC_ACTIVE = 1, + VAPIC_STANDBY = 2, +} VAPICMode; + +typedef struct VAPICHandlers { + uint32_t set_tpr; + uint32_t set_tpr_eax; + uint32_t get_tpr[8]; + uint32_t get_tpr_stack; +} QEMU_PACKED VAPICHandlers; + +typedef struct GuestROMState { + char signature[8]; + uint32_t vaddr; + uint32_t fixup_start; + uint32_t fixup_end; + uint32_t vapic_vaddr; + uint32_t vapic_size; + uint32_t vcpu_shift; + uint32_t real_tpr_addr; + VAPICHandlers up; + VAPICHandlers mp; +} QEMU_PACKED GuestROMState; + +typedef struct VAPICROMState { + SysBusDevice busdev; + MemoryRegion io; + MemoryRegion rom; + uint32_t state; + uint32_t rom_state_paddr; + uint32_t rom_state_vaddr; + uint32_t vapic_paddr; + uint32_t real_tpr_addr; + GuestROMState rom_state; + size_t rom_size; + bool rom_mapped_writable; +} VAPICROMState; + +#define TPR_INSTR_ABS_MODRM 0x1 +#define TPR_INSTR_MATCH_MODRM_REG 0x2 + +typedef struct TPRInstruction { + uint8_t opcode; + uint8_t modrm_reg; + unsigned int flags; + TPRAccess access; + size_t length; + off_t addr_offset; +} TPRInstruction; + +/* must be sorted by length, shortest first */ +static const TPRInstruction tpr_instr[] = { + { /* mov abs to eax */ + .opcode = 0xa1, + .access = TPR_ACCESS_READ, + .length = 5, + .addr_offset = 1, + }, + { /* mov eax to abs */ + .opcode = 0xa3, + .access = TPR_ACCESS_WRITE, + .length = 5, + .addr_offset = 1, + }, + { /* mov r32 to r/m32 */ + .opcode = 0x89, + .flags = TPR_INSTR_ABS_MODRM, + .access = TPR_ACCESS_WRITE, + .length = 6, + .addr_offset = 2, + }, + { /* mov r/m32 to r32 */ + .opcode = 0x8b, + .flags = TPR_INSTR_ABS_MODRM, + .access = TPR_ACCESS_READ, + .length = 6, + .addr_offset = 2, + }, + { /* 
push r/m32 */ + .opcode = 0xff, + .modrm_reg = 6, + .flags = TPR_INSTR_ABS_MODRM | TPR_INSTR_MATCH_MODRM_REG, + .access = TPR_ACCESS_READ, + .length = 6, + .addr_offset = 2, + }, + { /* mov imm32, r/m32 (c7/0) */ + .opcode = 0xc7, + .modrm_reg = 0, + .flags = TPR_INSTR_ABS_MODRM | TPR_INSTR_MATCH_MODRM_REG, + .access = TPR_ACCESS_WRITE, + .length = 10, + .addr_offset = 2, + }, +}; + +static void read_guest_rom_state(VAPICROMState *s) +{ + cpu_physical_memory_rw(s->rom_state_paddr, (void *)&s->rom_state, + sizeof(GuestROMState), 0); +} + +static void write_guest_rom_state(VAPICROMState *s) +{ + cpu_physical_memory_rw(s->rom_state_paddr, (void *)&s->rom_state, + sizeof(GuestROMState), 1); +} + +static void update_guest_rom_state(VAPICROMState *s) +{ + read_guest_rom_state(s); + + s->rom_state.real_tpr_addr = cpu_to_le32(s->real_tpr_addr); + s->rom_state.vcpu_shift = cpu_to_le32(VAPIC_CPU_SHIFT); + + write_guest_rom_state(s); +} + +static int find_real_tpr_addr(VAPICROMState *s, CPUState *env) +{ + target_phys_addr_t paddr; + target_ulong addr; + + if (s->state == VAPIC_ACTIVE) { + return 0; + } + /* + * If there is no prior TPR access instruction we could analyze (which is + * the case after resume from hibernation), we need to scan the possible + * virtual address space for the APIC mapping. 
+ */ + for (addr = 0xfffff000; addr >= 0x80000000; addr -= TARGET_PAGE_SIZE) { + paddr = cpu_get_phys_page_debug(env, addr); + if (paddr != APIC_DEFAULT_ADDRESS) { + continue; + } + s->real_tpr_addr = addr + 0x80; + update_guest_rom_state(s); + return 0; + } + return -1; +} + +static uint8_t modrm_reg(uint8_t modrm) +{ + return (modrm >> 3) & 7; +} + +static bool is_abs_modrm(uint8_t modrm) +{ + return (modrm & 0xc7) == 0x05; +} + +static bool opcode_matches(uint8_t *opcode, const TPRInstruction *instr) +{ + return opcode[0] == instr->opcode && + (!(instr->flags & TPR_INSTR_ABS_MODRM) || is_abs_modrm(opcode[1])) && + (!(instr->flags & TPR_INSTR_MATCH_MODRM_REG) || + modrm_reg(opcode[1]) == instr->modrm_reg); +} + +static int evaluate_tpr_instruction(VAPICROMState *s, CPUState *env, + target_ulong *pip, TPRAccess access) +{ + const TPRInstruction *instr; + target_ulong ip = *pip; + uint8_t opcode[2]; + uint32_t real_tpr_addr; + int i; + + if ((ip & 0xf0000000ULL) != 0x80000000ULL && + (ip & 0xf0000000ULL) != 0xe0000000ULL) { + return -1; + } + + /* + * Early Windows 2003 SMP initialization contains a + * + * mov imm32, r/m32 + * + * instruction that is patched by TPR optimization. The problem is that + * RSP, used by the patched instruction, is zero, so the guest gets a + * double fault and dies. + */ + if (env->regs[R_ESP] == 0) { + return -1; + } + + if (kvm_enabled() && !kvm_irqchip_in_kernel()) { + /* + * KVM without kernel-based TPR access reporting will pass an IP that + * points after the accessing instruction. So we need to look backward + * to find the reason. 
+ */ + for (i = 0; i < ARRAY_SIZE(tpr_instr); i++) { + instr = &tpr_instr[i]; + if (instr->access != access) { + continue; + } + if (cpu_memory_rw_debug(env, ip - instr->length, opcode, + sizeof(opcode), 0) < 0) { + return -1; + } + if (opcode_matches(opcode, instr)) { + ip -= instr->length; + goto instruction_ok; + } + } + return -1; + } else { + if (cpu_memory_rw_debug(env, ip, opcode, sizeof(opcode), 0) < 0) { + return -1; + } + for (i = 0; i < ARRAY_SIZE(tpr_instr); i++) { + instr = &tpr_instr[i]; + if (opcode_matches(opcode, instr)) { + goto instruction_ok; + } + } + return -1; + } + +instruction_ok: + /* + * Grab the virtual TPR address from the instruction + * and update the cached values. + */ + if (cpu_memory_rw_debug(env, ip + instr->addr_offset, + (void *)&real_tpr_addr, + sizeof(real_tpr_addr), 0) < 0) { + return -1; + } + real_tpr_addr = le32_to_cpu(real_tpr_addr); + if ((real_tpr_addr & 0xfff) != 0x80) { + return -1; + } + s->real_tpr_addr = real_tpr_addr; + update_guest_rom_state(s); + + *pip = ip; + return 0; +} + +static int update_rom_mapping(VAPICROMState *s, CPUState *env, target_ulong ip) +{ + target_phys_addr_t paddr; + uint32_t rom_state_vaddr; + uint32_t pos, patch, offset; + + /* nothing to do if already activated */ + if (s->state == VAPIC_ACTIVE) { + return 0; + } + + /* bail out if ROM init code was not executed (missing ROM?) 
*/ + if (s->state == VAPIC_INACTIVE) { + return -1; + } + + /* find out virtual address of the ROM */ + rom_state_vaddr = s->rom_state_paddr + (ip & 0xf0000000); + paddr = cpu_get_phys_page_debug(env, rom_state_vaddr); + if (paddr == -1) { + return -1; + } + paddr += rom_state_vaddr & ~TARGET_PAGE_MASK; + if (paddr != s->rom_state_paddr) { + return -1; + } + read_guest_rom_state(s); + if (memcmp(s->rom_state.signature, "kvm aPiC", 8) != 0) { + return -1; + } + s->rom_state_vaddr = rom_state_vaddr; + + /* fixup addresses in ROM if needed */ + if (rom_state_vaddr == le32_to_cpu(s->rom_state.vaddr)) { + return 0; + } + for (pos = le32_to_cpu(s->rom_state.fixup_start); + pos < le32_to_cpu(s->rom_state.fixup_end); + pos += 4) { + cpu_physical_memory_rw(paddr + pos - le32_to_cpu(s->rom_state.vaddr), + (void *)&offset, sizeof(offset), 0); + offset = le32_to_cpu(offset); + cpu_physical_memory_rw(paddr + offset, (void *)&patch, + sizeof(patch), 0); + patch = le32_to_cpu(patch); + patch += rom_state_vaddr - le32_to_cpu(s->rom_state.vaddr); + patch = cpu_to_le32(patch); + cpu_physical_memory_rw(paddr + offset, (void *)&patch, + sizeof(patch), 1); + } + read_guest_rom_state(s); + s->vapic_paddr = paddr + le32_to_cpu(s->rom_state.vapic_vaddr) - + le32_to_cpu(s->rom_state.vaddr); + + return 0; +} + +/* + * Tries to read the unique processor number from the Kernel Processor Control + * Region (KPCR) of 32-bit Windows XP and Server 2003. Returns -1 if the KPCR + * cannot be accessed or is considered invalid. This also ensures that we are + * not patching the wrong guest. 
+ */ +static int get_kpcr_number(CPUState *env) +{ + struct kpcr { + uint8_t fill1[0x1c]; + uint32_t self; + uint8_t fill2[0x31]; + uint8_t number; + } QEMU_PACKED kpcr; + + if (cpu_memory_rw_debug(env, env->segs[R_FS].base, + (void *)&kpcr, sizeof(kpcr), 0) < 0 || + kpcr.self != env->segs[R_FS].base) { + return -1; + } + return kpcr.number; +} + +static int vapic_enable(VAPICROMState *s, CPUState *env) +{ + int cpu_number = get_kpcr_number(env); + target_phys_addr_t vapic_paddr; + static const uint8_t enabled = 1; + + if (cpu_number < 0) { + return -1; + } + vapic_paddr = s->vapic_paddr + + (((target_phys_addr_t)cpu_number) << VAPIC_CPU_SHIFT); + cpu_physical_memory_rw(vapic_paddr + offsetof(VAPICState, enabled), + (void *)&enabled, sizeof(enabled), 1); + apic_enable_vapic(env->apic_state, vapic_paddr); + + s->state = VAPIC_ACTIVE; + + return 0; +} + +static void patch_byte(CPUState *env, target_ulong addr, uint8_t byte) +{ + cpu_memory_rw_debug(env, addr, &byte, 1, 1); +} + +static void patch_call(VAPICROMState *s, CPUState *env, target_ulong ip, + uint32_t target) +{ + uint32_t offset; + + offset = cpu_to_le32(target - ip - 5); + patch_byte(env, ip, 0xe8); /* call near */ + cpu_memory_rw_debug(env, ip + 1, (void *)&offset, sizeof(offset), 1); +} + +static void patch_instruction(VAPICROMState *s, CPUState *env, target_ulong ip) +{ + target_phys_addr_t paddr; + VAPICHandlers *handlers; + uint8_t opcode[2]; + uint32_t imm32; + + if (smp_cpus == 1) { + handlers = &s->rom_state.up; + } else { + handlers = &s->rom_state.mp; + } + + pause_all_vcpus(); + + cpu_memory_rw_debug(env, ip, opcode, sizeof(opcode), 0); + + switch (opcode[0]) { + case 0x89: /* mov r32 to r/m32 */ + patch_byte(env, ip, 0x50 + modrm_reg(opcode[1])); /* push reg */ + patch_call(s, env, ip + 1, handlers->set_tpr); + break; + case 0x8b: /* mov r/m32 to r32 */ + patch_byte(env, ip, 0x90); + patch_call(s, env, ip + 1, handlers->get_tpr[modrm_reg(opcode[1])]); + break; + case 0xa1: /* mov abs to eax */ 
+ patch_call(s, env, ip, handlers->get_tpr[0]); + break; + case 0xa3: /* mov eax to abs */ + patch_call(s, env, ip, handlers->set_tpr_eax); + break; + case 0xc7: /* mov imm32, r/m32 (c7/0) */ + patch_byte(env, ip, 0x68); /* push imm32 */ + cpu_memory_rw_debug(env, ip + 6, (void *)&imm32, sizeof(imm32), 0); + cpu_memory_rw_debug(env, ip + 1, (void *)&imm32, sizeof(imm32), 1); + patch_call(s, env, ip + 5, handlers->set_tpr); + break; + case 0xff: /* push r/m32 */ + patch_byte(env, ip, 0x50); /* push eax */ + patch_call(s, env, ip + 1, handlers->get_tpr_stack); + break; + default: + abort(); + } + + resume_all_vcpus(); + + paddr = cpu_get_phys_page_debug(env, ip); + paddr += ip & ~TARGET_PAGE_MASK; + tb_invalidate_phys_page_range(paddr, paddr + 1, 1); +} + +void vapic_report_tpr_access(DeviceState *dev, void *cpu, target_ulong ip, + TPRAccess access) +{ + VAPICROMState *s = DO_UPCAST(VAPICROMState, busdev.qdev, dev); + CPUState *env = cpu; + + cpu_synchronize_state(env); + + if (evaluate_tpr_instruction(s, env, &ip, access) < 0) { + if (s->state == VAPIC_ACTIVE) { + vapic_enable(s, env); + } + return; + } + if (update_rom_mapping(s, env, ip) < 0) { + return; + } + if (vapic_enable(s, env) < 0) { + return; + } + patch_instruction(s, env, ip); +} + +typedef struct VAPICEnableTPRReporting { + DeviceState *apic; + bool enable; +} VAPICEnableTPRReporting; + +static void vapic_do_enable_tpr_reporting(void *data) +{ + VAPICEnableTPRReporting *info = data; + + apic_enable_tpr_access_reporting(info->apic, info->enable); +} + +static void vapic_enable_tpr_reporting(bool enable) +{ + VAPICEnableTPRReporting info = { + .enable = enable, + }; + CPUState *env; + + for (env = first_cpu; env != NULL; env = env->next_cpu) { + info.apic = env->apic_state; + run_on_cpu(env, vapic_do_enable_tpr_reporting, &info); + } +} + +static void vapic_reset(DeviceState *dev) +{ + VAPICROMState *s = DO_UPCAST(VAPICROMState, busdev.qdev, dev); + + if (s->state == VAPIC_ACTIVE) { + s->state = 
VAPIC_STANDBY; + } + vapic_enable_tpr_reporting(false); +} + +/* + * Set the IRQ polling hypercalls to the supported variant: + * - vmcall if using KVM in-kernel irqchip + * - 32-bit VAPIC port write otherwise + */ +static int patch_hypercalls(VAPICROMState *s) +{ + target_phys_addr_t rom_paddr = s->rom_state_paddr & ROM_BLOCK_MASK; + static const uint8_t vmcall_pattern[] = { /* vmcall */ + 0xb8, 0x1, 0, 0, 0, 0xf, 0x1, 0xc1 + }; + static const uint8_t outl_pattern[] = { /* nop; outl %eax,0x7e */ + 0xb8, 0x1, 0, 0, 0, 0x90, 0xe7, 0x7e + }; + uint8_t alternates[2]; + const uint8_t *pattern; + const uint8_t *patch; + int patches = 0; + off_t pos; + uint8_t *rom; + + rom = g_malloc(s->rom_size); + cpu_physical_memory_rw(rom_paddr, rom, s->rom_size, 0); + + for (pos = 0; pos + sizeof(vmcall_pattern) <= s->rom_size; pos++) { + if (kvm_irqchip_in_kernel()) { + pattern = outl_pattern; + alternates[0] = outl_pattern[7]; + alternates[1] = outl_pattern[7]; + patch = &vmcall_pattern[5]; + } else { + pattern = vmcall_pattern; + alternates[0] = vmcall_pattern[7]; + alternates[1] = 0xd9; /* AMD's VMMCALL */ + patch = &outl_pattern[5]; + } + if (memcmp(rom + pos, pattern, 7) == 0 && + (rom[pos + 7] == alternates[0] || rom[pos + 7] == alternates[1])) { + cpu_physical_memory_rw(rom_paddr + pos + 5, (uint8_t *)patch, + 3, 1); + /* + * Don't flush the tb here. Under ordinary conditions, the patched + * calls are miles away from the current IP. Under malicious + * conditions, the guest could trick us to crash. + */ + } + } + + g_free(rom); + + if (patches != 0 && patches != 2) { + return -1; + } + + return 0; +} + +/* + * For TCG mode or the time KVM honors read-only memory regions, we need to + * enable write access to the option ROM so that variables can be updated by + * the guest. 
+ */
+static void vapic_map_rom_writable(VAPICROMState *s)
+{
+    target_phys_addr_t rom_paddr = s->rom_state_paddr & ROM_BLOCK_MASK;
+    MemoryRegionSection section;
+    MemoryRegion *as;
+    size_t rom_size;
+    uint8_t *ram;
+
+    as = sysbus_address_space(&s->busdev);
+
+    if (s->rom_mapped_writable) {
+        memory_region_del_subregion(as, &s->rom);
+        memory_region_destroy(&s->rom);
+    }
+
+    /* grab RAM memory region (region @rom_paddr may still be pc.rom) */
+    section = memory_region_find(as, 0, 1);
+
+    /* read ROM size from RAM region */
+    ram = memory_region_get_ram_ptr(section.mr);
+    rom_size = ram[rom_paddr + 2] * ROM_BLOCK_SIZE;
+    s->rom_size = rom_size;
+
+    /* We need to round up to avoid creating subpages
+     * from which we cannot run code. */
+    rom_size = TARGET_PAGE_ALIGN(rom_size);
+
+    memory_region_init_alias(&s->rom, "kvmvapic-rom", section.mr, rom_paddr,
+                             rom_size);
+    memory_region_add_subregion_overlap(as, rom_paddr, &s->rom, 1000);
+    s->rom_mapped_writable = true;
+}
+
+static int vapic_prepare(VAPICROMState *s)
+{
+    vapic_map_rom_writable(s);
+
+    if (patch_hypercalls(s) < 0) {
+        return -1;
+    }
+
+    vapic_enable_tpr_reporting(true);
+
+    return 0;
+}
+
+static void vapic_write(void *opaque, target_phys_addr_t addr, uint64_t data,
+                        unsigned int size)
+{
+    CPUState *env = cpu_single_env;
+    target_phys_addr_t rom_paddr;
+    VAPICROMState *s = opaque;
+
+    cpu_synchronize_state(env);
+
+    /*
+     * The VAPIC supports three PIO-based hypercalls, all via port 0x7E.
+     *  o 16-bit write access:
+     *    Reports the option ROM initialization to the hypervisor. Written
+     *    value is the offset of the state structure in the ROM.
+     *  o 8-bit write access:
+     *    Reactivates the VAPIC after a guest hibernation, i.e. after the
+     *    option ROM content has been re-initialized by a guest power cycle.
+     *  o 32-bit write access:
+     *    Poll for pending IRQs, considering the current VAPIC state.
+     */
+    switch (size) {
+    case 2:
+        if (s->state == VAPIC_INACTIVE) {
+            rom_paddr = (env->segs[R_CS].base + env->eip) & ROM_BLOCK_MASK;
+            s->rom_state_paddr = rom_paddr + data;
+
+            s->state = VAPIC_STANDBY;
+        }
+        if (vapic_prepare(s) < 0) {
+            s->state = VAPIC_INACTIVE;
+            break;
+        }
+        break;
+    case 1:
+        if (kvm_enabled()) {
+            /*
+             * Disable triggering instruction in ROM by writing a NOP.
+             *
+             * We cannot do this in TCG mode as the reported IP is not
+             * accurate.
+             */
+            pause_all_vcpus();
+            patch_byte(env, env->eip - 2, 0x66);
+            patch_byte(env, env->eip - 1, 0x90);
+            resume_all_vcpus();
+        }
+
+        if (s->state == VAPIC_ACTIVE) {
+            break;
+        }
+        if (update_rom_mapping(s, env, env->eip) < 0) {
+            break;
+        }
+        if (find_real_tpr_addr(s, env) < 0) {
+            break;
+        }
+        vapic_enable(s, env);
+        break;
+    default:
+    case 4:
+        if (!kvm_irqchip_in_kernel()) {
+            apic_poll_irq(env->apic_state);
+        }
+        break;
+    }
+}
+
+static const MemoryRegionOps vapic_ops = {
+    .write = vapic_write,
+    .endianness = DEVICE_NATIVE_ENDIAN,
+};
+
+static int vapic_init(SysBusDevice *dev)
+{
+    VAPICROMState *s = FROM_SYSBUS(VAPICROMState, dev);
+
+    memory_region_init_io(&s->io, &vapic_ops, s, "kvmvapic", 2);
+    sysbus_add_io(dev, VAPIC_IO_PORT, &s->io);
+    sysbus_init_ioports(dev, VAPIC_IO_PORT, 2);
+
+    option_rom[nb_option_roms].name = "kvmvapic.bin";
+    option_rom[nb_option_roms].bootindex = -1;
+    nb_option_roms++;
+
+    return 0;
+}
+
+static void do_vapic_enable(void *data)
+{
+    VAPICROMState *s = data;
+
+    vapic_enable(s, first_cpu);
+}
+
+static int vapic_post_load(void *opaque, int version_id)
+{
+    VAPICROMState *s = opaque;
+    uint8_t *zero;
+
+    /*
+     * The old implementation of qemu-kvm did not provide the state
+     * VAPIC_STANDBY. Reconstruct it.
+     */
+    if (s->state == VAPIC_INACTIVE && s->rom_state_paddr != 0) {
+        s->state = VAPIC_STANDBY;
+    }
+
+    if (s->state != VAPIC_INACTIVE) {
+        if (vapic_prepare(s) < 0) {
+            return -1;
+        }
+    }
+    if (s->state == VAPIC_ACTIVE) {
+        if (smp_cpus == 1) {
+            run_on_cpu(first_cpu, do_vapic_enable, s);
+        } else {
+            zero = g_malloc0(s->rom_state.vapic_size);
+            cpu_physical_memory_rw(s->vapic_paddr, zero,
+                                   s->rom_state.vapic_size, 1);
+            g_free(zero);
+        }
+    }
+
+    return 0;
+}
+
+static const VMStateDescription vmstate_handlers = {
+    .name = "kvmvapic-handlers",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .minimum_version_id_old = 1,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT32(set_tpr, VAPICHandlers),
+        VMSTATE_UINT32(set_tpr_eax, VAPICHandlers),
+        VMSTATE_UINT32_ARRAY(get_tpr, VAPICHandlers, 8),
+        VMSTATE_UINT32(get_tpr_stack, VAPICHandlers),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static const VMStateDescription vmstate_guest_rom = {
+    .name = "kvmvapic-guest-rom",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .minimum_version_id_old = 1,
+    .fields = (VMStateField[]) {
+        VMSTATE_UNUSED(8),     /* signature */
+        VMSTATE_UINT32(vaddr, GuestROMState),
+        VMSTATE_UINT32(fixup_start, GuestROMState),
+        VMSTATE_UINT32(fixup_end, GuestROMState),
+        VMSTATE_UINT32(vapic_vaddr, GuestROMState),
+        VMSTATE_UINT32(vapic_size, GuestROMState),
+        VMSTATE_UINT32(vcpu_shift, GuestROMState),
+        VMSTATE_UINT32(real_tpr_addr, GuestROMState),
+        VMSTATE_STRUCT(up, GuestROMState, 0, vmstate_handlers, VAPICHandlers),
+        VMSTATE_STRUCT(mp, GuestROMState, 0, vmstate_handlers, VAPICHandlers),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static const VMStateDescription vmstate_vapic = {
+    .name = "kvm-tpr-opt",      /* compatible with qemu-kvm VAPIC */
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .minimum_version_id_old = 1,
+    .post_load = vapic_post_load,
+    .fields = (VMStateField[]) {
+        VMSTATE_STRUCT(rom_state, VAPICROMState, 0, vmstate_guest_rom,
+                       GuestROMState),
+        VMSTATE_UINT32(state, VAPICROMState),
+        VMSTATE_UINT32(real_tpr_addr, VAPICROMState),
+        VMSTATE_UINT32(rom_state_vaddr, VAPICROMState),
+        VMSTATE_UINT32(vapic_paddr, VAPICROMState),
+        VMSTATE_UINT32(rom_state_paddr, VAPICROMState),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static void vapic_class_init(ObjectClass *klass, void *data)
+{
+    SysBusDeviceClass *sc = SYS_BUS_DEVICE_CLASS(klass);
+    DeviceClass *dc = DEVICE_CLASS(klass);
+
+    dc->no_user = 1;
+    dc->reset   = vapic_reset;
+    dc->vmsd    = &vmstate_vapic;
+    sc->init    = vapic_init;
+}
+
+static TypeInfo vapic_type = {
+    .name          = "kvmvapic",
+    .parent        = TYPE_SYS_BUS_DEVICE,
+    .instance_size = sizeof(VAPICROMState),
+    .class_init    = vapic_class_init,
+};
+
+static void vapic_register(void)
+{
+    type_register_static(&vapic_type);
+}
+
+type_init(vapic_register);
diff --git a/hw/mc146818rtc.c b/hw/mc146818rtc.c
index a46fdfc48..8b5cf8c81 100644
--- a/hw/mc146818rtc.c
+++ b/hw/mc146818rtc.c
@@ -25,10 +25,13 @@
 #include "qemu-timer.h"
 #include "sysemu.h"
 #include "pc.h"
-#include "apic.h"
 #include "isa.h"
 #include "mc146818rtc.h"
 
+#ifdef TARGET_I386
+#include "apic.h"
+#endif
+
 //#define DEBUG_CMOS
 //#define DEBUG_COALESCED
 
diff --git a/kvm-all.c b/kvm-all.c
index c4babdac0..e2cbc0302 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -1118,8 +1118,6 @@ int kvm_cpu_exec(CPUState *env)
         return EXCP_HLT;
     }
 
-    cpu_single_env = env;
-
     do {
         if (env->kvm_vcpu_dirty) {
             kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);
@@ -1136,13 +1134,11 @@ int kvm_cpu_exec(CPUState *env)
              */
             qemu_cpu_kick_self();
         }
-        cpu_single_env = NULL;
         qemu_mutex_unlock_iothread();
 
         run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
 
         qemu_mutex_lock_iothread();
-        cpu_single_env = env;
         kvm_arch_post_run(env, run);
 
         kvm_flush_coalesced_mmio_buffer();
@@ -1206,7 +1202,6 @@ int kvm_cpu_exec(CPUState *env)
     }
 
     env->exit_request = 0;
-    cpu_single_env = NULL;
     return ret;
 }
 
diff --git a/pc-bios/kvmvapic.bin b/pc-bios/kvmvapic.bin
new file mode 100755
index 000000000..045f5c288
Binary files /dev/null and b/pc-bios/kvmvapic.bin differ
diff --git a/pc-bios/optionrom/Makefile b/pc-bios/optionrom/Makefile
index 2caf7e6b6..f6b402713 100644
--- a/pc-bios/optionrom/Makefile
+++ b/pc-bios/optionrom/Makefile
@@ -14,7 +14,7 @@ CFLAGS += -I$(SRC_PATH)
 CFLAGS += $(call cc-option, $(CFLAGS), -fno-stack-protector)
 QEMU_CFLAGS = $(CFLAGS)
 
-build-all: multiboot.bin linuxboot.bin
+build-all: multiboot.bin linuxboot.bin kvmvapic.bin
 
 # suppress auto-removal of intermediate files
 .SECONDARY:
diff --git a/pc-bios/optionrom/kvmvapic.S b/pc-bios/optionrom/kvmvapic.S
new file mode 100644
index 000000000..aa17a402d
--- /dev/null
+++ b/pc-bios/optionrom/kvmvapic.S
@@ -0,0 +1,340 @@
+#
+# Local APIC acceleration for Windows XP and related guests
+#
+# Copyright 2011 Red Hat, Inc. and/or its affiliates
+#
+# Author: Avi Kivity
+#
+# This work is licensed under the terms of the GNU GPL, version 2, or (at your
+# option) any later version. See the COPYING file in the top-level directory.
+#
+
+#include "optionrom.h"
+
+OPTION_ROM_START
+
+	# clear vapic area: firmware load using rep insb may cause
+	# stale tpr/isr/irr data to corrupt the vapic area.
+	push %es
+	push %cs
+	pop %es
+	xor %ax, %ax
+	mov $vapic_size/2, %cx
+	lea vapic, %di
+	cld
+	rep stosw
+	pop %es
+
+	# announce presence to the hypervisor
+	mov $vapic_base, %ax
+	out %ax, $0x7e
+
+	lret
+
+	.code32
+vapic_size = 2*4096
+
+.macro fixup delta=-4
+777:
+	.text 1
+	.long 777b + \delta - vapic_base
+	.text 0
+.endm
+
+.macro reenable_vtpr
+	out %al, $0x7e
+.endm
+
+.text 1
+	fixup_start = .
+.text 0
+
+.align 16
+
+vapic_base:
+	.ascii "kvm aPiC"
+
+	/* relocation data */
+	.long vapic_base	; fixup
+	.long fixup_start	; fixup
+	.long fixup_end		; fixup
+
+	.long vapic		; fixup
+	.long vapic_size
+vcpu_shift:
+	.long 0
+real_tpr:
+	.long 0
+	.long up_set_tpr	; fixup
+	.long up_set_tpr_eax	; fixup
+	.long up_get_tpr_eax	; fixup
+	.long up_get_tpr_ecx	; fixup
+	.long up_get_tpr_edx	; fixup
+	.long up_get_tpr_ebx	; fixup
+	.long 0 /* esp. won't work. */
+	.long up_get_tpr_ebp	; fixup
+	.long up_get_tpr_esi	; fixup
+	.long up_get_tpr_edi	; fixup
+	.long up_get_tpr_stack	; fixup
+	.long mp_set_tpr	; fixup
+	.long mp_set_tpr_eax	; fixup
+	.long mp_get_tpr_eax	; fixup
+	.long mp_get_tpr_ecx	; fixup
+	.long mp_get_tpr_edx	; fixup
+	.long mp_get_tpr_ebx	; fixup
+	.long 0 /* esp. won't work. */
+	.long mp_get_tpr_ebp	; fixup
+	.long mp_get_tpr_esi	; fixup
+	.long mp_get_tpr_edi	; fixup
+	.long mp_get_tpr_stack	; fixup
+
+.macro kvm_hypercall
+	.byte 0x0f, 0x01, 0xc1
+.endm
+
+kvm_hypercall_vapic_poll_irq = 1
+
+pcr_cpu = 0x51
+
+.align 64
+
+mp_get_tpr_eax:
+	pushf
+	cli
+	reenable_vtpr
+	push %ecx
+
+	fs/movzbl pcr_cpu, %eax
+
+	mov vcpu_shift, %ecx	; fixup
+	shl %cl, %eax
+	testb $1, vapic+4(%eax)	; fixup delta=-5
+	jz mp_get_tpr_bad
+	movzbl vapic(%eax), %eax	; fixup
+
+mp_get_tpr_out:
+	pop %ecx
+	popf
+	ret
+
+mp_get_tpr_bad:
+	mov real_tpr, %eax	; fixup
+	mov (%eax), %eax
+	jmp mp_get_tpr_out
+
+/*
+ * mp_get_tpr_<reg>: stash %eax in <reg>, let mp_get_tpr_eax load the TPR
+ * into %eax, then xchg so <reg> holds the result and %eax is restored.
+ * Every wrapper must call mp_get_tpr_eax, never itself.
+ */
+mp_get_tpr_ebx:
+	mov %eax, %ebx
+	call mp_get_tpr_eax
+	xchg %eax, %ebx
+	ret
+
+mp_get_tpr_ecx:
+	mov %eax, %ecx
+	call mp_get_tpr_eax
+	xchg %eax, %ecx
+	ret
+
+mp_get_tpr_edx:
+	mov %eax, %edx
+	call mp_get_tpr_eax
+	xchg %eax, %edx
+	ret
+
+mp_get_tpr_esi:
+	mov %eax, %esi
+	call mp_get_tpr_eax
+	xchg %eax, %esi
+	ret
+
+mp_get_tpr_edi:
+	mov %eax, %edi
+	call mp_get_tpr_eax
+	xchg %eax, %edi
+	ret
+
+mp_get_tpr_ebp:
+	mov %eax, %ebp
+	call mp_get_tpr_eax
+	xchg %eax, %ebp
+	ret
+
+mp_get_tpr_stack:
+	call mp_get_tpr_eax
+	xchg %eax, 4(%esp)
+	ret
+
+mp_set_tpr_eax:
+	push %eax
+	call mp_set_tpr
+	ret
+
+mp_set_tpr:
+	pushf
+	push %eax
+	push %ecx
+	push %edx
+	push %ebx
+	cli
+	reenable_vtpr
+
+mp_set_tpr_failed:
+	fs/movzbl pcr_cpu, %edx
+
+	mov vcpu_shift, %ecx	; fixup
+	shl %cl, %edx
+
+	testb $1, vapic+4(%edx)	; fixup delta=-5
+	jz mp_set_tpr_bad
+
+	mov vapic(%edx), %eax	; fixup
+
+	mov %eax, %ebx
+	mov 24(%esp), %bl
+
+	/* %ebx = new vapic (%bl = tpr, %bh = isr, %b3 = irr) */
+
+	lock cmpxchg %ebx, vapic(%edx)	; fixup
+	jnz mp_set_tpr_failed
+
+	/* compute ppr */
+	cmp %bh, %bl
+	jae mp_tpr_is_bigger
+mp_isr_is_bigger:
+	mov %bh, %bl
+mp_tpr_is_bigger:
+	/* %bl = ppr */
+	rol $8, %ebx
+	/* now: %bl = irr, %bh = ppr */
+	cmp %bh, %bl
+	ja mp_set_tpr_poll_irq
+
+mp_set_tpr_out:
+	pop %ebx
+	pop %edx
+	pop %ecx
+	pop %eax
+	popf
+	ret $4
+
+mp_set_tpr_poll_irq:
+	mov $kvm_hypercall_vapic_poll_irq, %eax
+	kvm_hypercall
+	jmp mp_set_tpr_out
+
+mp_set_tpr_bad:
+	mov 24(%esp), %ecx
+	mov real_tpr, %eax	; fixup
+	mov %ecx, (%eax)
+	jmp mp_set_tpr_out
+
+up_get_tpr_eax:
+	reenable_vtpr
+	movzbl vapic, %eax	; fixup
+	ret
+
+up_get_tpr_ebx:
+	reenable_vtpr
+	movzbl vapic, %ebx	; fixup
+	ret
+
+up_get_tpr_ecx:
+	reenable_vtpr
+	movzbl vapic, %ecx	; fixup
+	ret
+
+up_get_tpr_edx:
+	reenable_vtpr
+	movzbl vapic, %edx	; fixup
+	ret
+
+up_get_tpr_esi:
+	reenable_vtpr
+	movzbl vapic, %esi	; fixup
+	ret
+
+up_get_tpr_edi:
+	reenable_vtpr
+	movzbl vapic, %edi	; fixup
+	ret
+
+up_get_tpr_ebp:
+	reenable_vtpr
+	movzbl vapic, %ebp	; fixup
+	ret
+
+up_get_tpr_stack:
+	reenable_vtpr
+	movzbl vapic, %eax	; fixup
+	xchg %eax, 4(%esp)
+	ret
+
+up_set_tpr_eax:
+	push %eax
+	call up_set_tpr
+	ret
+
+up_set_tpr:
+	pushf
+	push %eax
+	push %ebx
+	reenable_vtpr
+
+up_set_tpr_failed:
+	mov vapic, %eax	; fixup
+
+	mov %eax, %ebx
+	mov 16(%esp), %bl
+
+	/* %ebx = new vapic (%bl = tpr, %bh = isr, %b3 = irr) */
+
+	lock cmpxchg %ebx, vapic	; fixup
+	jnz up_set_tpr_failed
+
+	/* compute ppr */
+	cmp %bh, %bl
+	jae up_tpr_is_bigger
+up_isr_is_bigger:
+	mov %bh, %bl
+up_tpr_is_bigger:
+	/* %bl = ppr */
+	rol $8, %ebx
+	/* now: %bl = irr, %bh = ppr */
+	cmp %bh, %bl
+	ja up_set_tpr_poll_irq
+
+up_set_tpr_out:
+	pop %ebx
+	pop %eax
+	popf
+	ret $4
+
+up_set_tpr_poll_irq:
+	mov $kvm_hypercall_vapic_poll_irq, %eax
+	kvm_hypercall
+	jmp up_set_tpr_out
+
+.text 1
+	fixup_end = .
+.text 0
+
+/*
+ * vapic format:
+ *  per-vcpu records of size 2^vcpu shift.
+ *     byte 0: tpr (r/w)
+ *     byte 1: highest in-service interrupt (isr) (r/o); bits 3:0 are zero
+ *     byte 2: zero (r/o)
+ *     byte 3: highest pending interrupt (irr) (r/o)
+ */
+.text 2
+
+.align 128
+
+vapic:
+. = . + vapic_size
+
+OPTION_ROM_END
diff --git a/pc-bios/optionrom/optionrom.h b/pc-bios/optionrom/optionrom.h
index aa783deed..3daf7da49 100644
--- a/pc-bios/optionrom/optionrom.h
+++ b/pc-bios/optionrom/optionrom.h
@@ -124,7 +124,8 @@
 	movw		%ax, %ds;
 
 #define OPTION_ROM_END					\
-    .align 512, 0;					\
+	.byte 0;					\
+	.align 512, 0;					\
     _end:
 
 #define BOOT_ROM_END					\
diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index 37dde7958..196b0c5c4 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -482,6 +482,7 @@
 #define CPU_INTERRUPT_VIRQ      CPU_INTERRUPT_TGT_INT_0
 #define CPU_INTERRUPT_INIT      CPU_INTERRUPT_TGT_INT_1
 #define CPU_INTERRUPT_SIPI      CPU_INTERRUPT_TGT_INT_2
+#define CPU_INTERRUPT_TPR       CPU_INTERRUPT_TGT_INT_3
 
 
 enum {
@@ -613,6 +614,11 @@ typedef struct {
 
 #define NB_MMU_MODES 2
 
+typedef enum TPRAccess {
+    TPR_ACCESS_READ,
+    TPR_ACCESS_WRITE,
+} TPRAccess;
+
 typedef struct CPUX86State {
     /* standard registers */
     target_ulong regs[CPU_NB_REGS];
@@ -772,6 +778,8 @@ typedef struct CPUX86State {
     XMMReg ymmh_regs[CPU_NB_REGS];
 
     uint64_t xcr0;
+
+    TPRAccess tpr_access_type;
 } CPUX86State;
 
 CPUX86State *cpu_x86_init(const char *cpu_model);
@@ -1064,4 +1072,6 @@ void svm_check_intercept(CPUState *env1, uint32_t type);
 
 uint32_t cpu_cc_compute_all(CPUState *env1, int op);
 
+void cpu_report_tpr_access(CPUState *env, TPRAccess access);
+
 #endif /* CPU_I386_H */
diff --git a/target-i386/helper.c b/target-i386/helper.c
index 2586aff70..d12c9621b 100644
--- a/target-i386/helper.c
+++ b/target-i386/helper.c
@@ -1189,6 +1189,27 @@ void cpu_x86_inject_mce(Monitor *mon, CPUState *cenv, int bank,
         }
     }
 }
+
+/*
+ * Report a guest TPR access. Under KVM the access type is recorded and
+ * CPU_INTERRUPT_TPR is raised; otherwise the CPU state is restored for
+ * env->mem_io_pc and the APIC is notified immediately.
+ */
+void cpu_report_tpr_access(CPUState *env, TPRAccess access)
+{
+    TranslationBlock *tb;
+
+    if (kvm_enabled()) {
+        env->tpr_access_type = access;
+
+        cpu_interrupt(env, CPU_INTERRUPT_TPR);
+    } else {
+        tb = tb_find_pc(env->mem_io_pc);
+        cpu_restore_state(tb, env, env->mem_io_pc);
+
+        apic_handle_tpr_access_report(env->apic_state, env->eip, access);
+    }
+}
 #endif /* !CONFIG_USER_ONLY */
 
 static void mce_init(CPUX86State *cenv)
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 981192ddf..9a732078f 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -1635,8 +1635,10 @@ void kvm_arch_pre_run(CPUState *env, struct kvm_run *run)
     }
 
     if (!kvm_irqchip_in_kernel()) {
-        /* Force the VCPU out of its inner loop to process the INIT request */
-        if (env->interrupt_request & CPU_INTERRUPT_INIT) {
+        /* Force the VCPU out of its inner loop to process any INIT requests
+         * or pending TPR access reports. */
+        if (env->interrupt_request &
+            (CPU_INTERRUPT_INIT | CPU_INTERRUPT_TPR)) {
             env->exit_request = 1;
         }
 
@@ -1730,6 +1732,12 @@ int kvm_arch_process_async_events(CPUState *env)
         kvm_cpu_synchronize_state(env);
         do_cpu_sipi(env);
     }
+    if (env->interrupt_request & CPU_INTERRUPT_TPR) {
+        env->interrupt_request &= ~CPU_INTERRUPT_TPR;
+        kvm_cpu_synchronize_state(env);
+        apic_handle_tpr_access_report(env->apic_state, env->eip,
+                                      env->tpr_access_type);
+    }
 
     return env->halted;
 }
@@ -1746,6 +1754,16 @@ static int kvm_handle_halt(CPUState *env)
     return 0;
 }
 
+static int kvm_handle_tpr_access(CPUState *env)
+{
+    struct kvm_run *run = env->kvm_run;
+
+    apic_handle_tpr_access_report(env->apic_state, run->tpr_access.rip,
+                                  run->tpr_access.is_write ? TPR_ACCESS_WRITE
+                                                           : TPR_ACCESS_READ);
+    return 1;
+}
+
 int kvm_arch_insert_sw_breakpoint(CPUState *env, struct kvm_sw_breakpoint *bp)
 {
     static const uint8_t int3 = 0xcc;
@@ -1950,6 +1968,9 @@ int kvm_arch_handle_exit(CPUState *env, struct kvm_run *run)
     case KVM_EXIT_SET_TPR:
         ret = 0;
         break;
+    case KVM_EXIT_TPR_ACCESS:
+        ret = kvm_handle_tpr_access(env);
+        break;
     case KVM_EXIT_FAIL_ENTRY:
         code = run->fail_entry.hardware_entry_failure_reason;
         fprintf(stderr, "KVM: entry failed, hardware error 0x%" PRIx64 "\n",
@@ -1987,6 +2008,7 @@ int kvm_arch_handle_exit(CPUState *env, struct kvm_run *run)
 
 bool kvm_arch_stop_on_emulation_error(CPUState *env)
 {
+    kvm_cpu_synchronize_state(env);
     return !(env->cr[0] & CR0_PE_MASK) ||
            ((env->segs[R_CS].selector & 3) != 3);
 }