From df88b2d96e36d9a9e325bfcd12eb45671cbbc937 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 26 Apr 2012 13:13:21 -0400 Subject: [PATCH 1/5] xen/enlighten: Disable MWAIT_LEAF so that acpi-pad won't be loaded. There are exactly four users of __monitor and __mwait: - cstate.c (which allows acpi_processor_ffh_cstate_enter to be called when the cpuidle API drivers are used. However patch "cpuidle: replace xen access to x86 pm_idle and default_idle" provides a mechanism to disable the cpuidle and use safe_halt. - smpboot (which allows mwait_play_dead to be called). However safe_halt is always used so we skip that. - intel_idle (same deal as above). - acpi_pad.c. This the one that we do not want to run as we will hit the below crash. Why do we want to expose MWAIT_LEAF in the first place? We want it for the xen-acpi-processor driver - which uploads C-states to the hypervisor. If MWAIT_LEAF is set, the cstate.c sets the proper address in the C-states so that the hypervisor can benefit from using the MWAIT functionality. And that is the sole reason for using it. Without this patch, if a module performs mwait or monitor we get this: invalid opcode: 0000 [#1] SMP CPU 2 .. snip.. Pid: 5036, comm: insmod Tainted: G O 3.4.0-rc2upstream-dirty #2 Intel Corporation S2600CP/S2600CP RIP: e030:[] [] mwait_check_init+0x17/0x1000 [mwait_check] RSP: e02b:ffff8801c298bf18 EFLAGS: 00010282 RAX: ffff8801c298a010 RBX: ffffffffa03b2000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffff8801c29800d8 RDI: ffff8801ff097200 RBP: ffff8801c298bf18 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000000 R13: ffffffffa000a000 R14: 0000005148db7294 R15: 0000000000000003 FS: 00007fbb364f2700(0000) GS:ffff8801ff08c000(0000) knlGS:0000000000000000 CS: e033 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 000000000179f038 CR3: 00000001c9469000 CR4: 0000000000002660 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process insmod (pid: 5036, threadinfo ffff8801c298a000, task ffff8801c29cd7e0) Stack: ffff8801c298bf48 ffffffff81002124 ffffffffa03b2000 00000000000081fd 000000000178f010 000000000178f030 ffff8801c298bf78 ffffffff810c41e6 00007fff3fb30db9 00007fff3fb30db9 00000000000081fd 0000000000010000 Call Trace: [] do_one_initcall+0x124/0x170 [] sys_init_module+0xc6/0x220 [] system_call_fastpath+0x16/0x1b Code: <0f> 01 c8 31 c0 0f 01 c9 c9 c3 00 00 00 00 00 00 00 00 00 00 00 00 RIP [] mwait_check_init+0x17/0x1000 [mwait_check] RSP ---[ end trace 16582fc8a3d1e29a ]--- Kernel panic - not syncing: Fatal exception With this module (which is what acpi_pad.c would hit): MODULE_AUTHOR("Konrad Rzeszutek Wilk "); MODULE_DESCRIPTION("mwait_check_and_back"); MODULE_LICENSE("GPL"); MODULE_VERSION(); static int __init mwait_check_init(void) { __monitor((void *)¤t_thread_info()->flags, 0, 0); __mwait(0, 0); return 0; } static void __exit mwait_check_exit(void) { } module_init(mwait_check_init); module_exit(mwait_check_exit); Reported-by: Liu, Jinsong Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/enlighten.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 4f51bebac02..a8f8844b8d3 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -261,7 +261,8 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, static bool __init xen_check_mwait(void) { -#ifdef CONFIG_ACPI +#if defined(CONFIG_ACPI) && !defined(CONFIG_ACPI_PROCESSOR_AGGREGATOR) && \ + !defined(CONFIG_ACPI_PROCESSOR_AGGREGATOR_MODULE) struct xen_platform_op op = { .cmd = XENPF_set_processor_pminfo, .u.set_pminfo.id = -1, @@ -349,7 +350,6 @@ static void __init xen_init_cpuid_mask(void) /* Xen will set CR4.OSXSAVE if supported and not disabled by force */ if ((cx & xsave_mask) != xsave_mask) cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */ - if (xen_check_mwait()) cpuid_leaf1_ecx_set_mask = (1 << (X86_FEATURE_MWAIT % 32)); } From 521394e4e679996955bc351cb6b64639751db2ff Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Wed, 25 Apr 2012 16:11:38 +0100 Subject: [PATCH 2/5] xen: use the pirq number to check the pirq_eoi_map In pirq_check_eoi_map use the pirq number rather than the Linux irq number to check whether an eoi is needed in the pirq_eoi_map. The reason is that the irq number is not always identical to the pirq number so if we wrongly use the irq number to check the pirq_eoi_map we are going to check for the wrong pirq to EOI. As a consequence some interrupts might not be EOI'ed by the guest correctly. Signed-off-by: Stefano Stabellini Tested-by: Tobias Geiger [v1: Added some extra wording to git commit] Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 4b33acd8ed4..0a8a17cd80b 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -274,7 +274,7 @@ static unsigned int cpu_from_evtchn(unsigned int evtchn) static bool pirq_check_eoi_map(unsigned irq) { - return test_bit(irq, pirq_eoi_map); + return test_bit(pirq_from_irq(irq), pirq_eoi_map); } static bool pirq_needs_eoi_flag(unsigned irq) From cf405ae612b0f7e2358db7ff594c0e94846137aa Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 26 Apr 2012 13:50:03 -0400 Subject: [PATCH 3/5] xen/smp: Fix crash when booting with ACPI hotplug CPUs. When we boot on a machine that can hotplug CPUs and we are using 'dom0_max_vcpus=X' on the Xen hypervisor line to clip the amount of CPUs available to the initial domain, we get this: (XEN) Command line: com1=115200,8n1 dom0_mem=8G noreboot dom0_max_vcpus=8 sync_console mce_verbosity=verbose console=com1,vga loglvl=all guest_loglvl=all .. snip.. DMI: Intel Corporation S2600CP/S2600CP, BIOS SE5C600.86B.99.99.x032.072520111118 07/25/2011 .. snip. SMP: Allowing 64 CPUs, 32 hotplug CPUs installing Xen timer for CPU 7 cpu 7 spinlock event irq 361 NMI watchdog: disabled (cpu7): hardware events not enabled Brought up 8 CPUs .. snip.. [acpi processor finds the CPUs are not initialized and starts calling arch_register_cpu, which creates /sys/devices/system/cpu/cpu8/online] CPU 8 got hotplugged CPU 9 got hotplugged CPU 10 got hotplugged .. snip.. initcall 1_acpi_battery_init_async+0x0/0x1b returned 0 after 406 usecs calling erst_init+0x0/0x2bb @ 1 [and the scheduler sticks newly started tasks on the new CPUs, but said CPUs cannot be initialized b/c the hypervisor has limited the amount of vCPUS to 8 - as per the dom0_max_vcpus=8 flag. The spinlock tries to kick the other CPU, but the structure for that is not initialized and we crash.] BUG: unable to handle kernel paging request at fffffffffffffed8 IP: [] xen_spin_lock+0x29/0x60 PGD 180d067 PUD 180e067 PMD 0 Oops: 0002 [#1] SMP CPU 7 Modules linked in: Pid: 1, comm: swapper/0 Not tainted 3.4.0-rc2upstream-00001-gf5154e8 #1 Intel Corporation S2600CP/S2600CP RIP: e030:[] [] xen_spin_lock+0x29/0x60 RSP: e02b:ffff8801fb9b3a70 EFLAGS: 00010282 With this patch, we cap the amount of vCPUS that the initial domain can run, to exactly what dom0_max_vcpus=X has specified. In the future, if there is a hypercall that will allow a running domain to expand past its initial set of vCPUS, this patch should be re-evaluated. CC: stable@kernel.org Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/smp.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 5fac6919b95..0503c0c493a 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -178,6 +178,7 @@ static void __init xen_fill_possible_map(void) static void __init xen_filter_cpu_maps(void) { int i, rc; + unsigned int subtract = 0; if (!xen_initial_domain()) return; @@ -192,8 +193,22 @@ static void __init xen_filter_cpu_maps(void) } else { set_cpu_possible(i, false); set_cpu_present(i, false); + subtract++; } } +#ifdef CONFIG_HOTPLUG_CPU + /* This is akin to using 'nr_cpus' on the Linux command line. + * Which is OK as when we use 'dom0_max_vcpus=X' we can only + * have up to X, while nr_cpu_ids is greater than X. This + * normally is not a problem, except when CPU hotplugging + * is involved and then there might be more than X CPUs + * in the guest - which will not work as there is no + * hypercall to expand the max number of VCPUs an already + * running guest has. So cap it up to X. */ + if (subtract) + nr_cpu_ids = nr_cpu_ids - subtract; +#endif + } static void __init xen_smp_prepare_boot_cpu(void) From b930fe5e1f5646e071facda70b25b137ebeae5af Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Thu, 26 Apr 2012 14:22:33 -0400 Subject: [PATCH 4/5] xen/acpi: Workaround broken BIOSes exporting non-existing C-states. We did a similar check for the P-states but did not do it for the C-states. What we want to do is ignore cases where the DSDT has definition for sixteen CPUs, but the machine only has eight CPUs and we get: xen-acpi-processor: (CX): Hypervisor error (-22) for ACPI CPU14 Reported-by: Tobias Geiger Signed-off-by: Konrad Rzeszutek Wilk --- drivers/xen/xen-acpi-processor.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/xen/xen-acpi-processor.c b/drivers/xen/xen-acpi-processor.c index 174b5653cd8..0b48579a9cd 100644 --- a/drivers/xen/xen-acpi-processor.c +++ b/drivers/xen/xen-acpi-processor.c @@ -128,7 +128,10 @@ static int push_cxx_to_hypervisor(struct acpi_processor *_pr) pr_debug(" C%d: %s %d uS\n", cx->type, cx->desc, (u32)cx->latency); } - } else + } else if (ret != -EINVAL) + /* EINVAL means the ACPI ID is incorrect - meaning the ACPI + * table is referencing a non-existing CPU - which can happen + * with broken ACPI tables. */ pr_err(DRV_NAME "(CX): Hypervisor error (%d) for ACPI CPU%u\n", ret, _pr->acpi_id); From 7eb7ce4d2e8991aff4ecb71a81949a907ca755ac Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Thu, 26 Apr 2012 19:44:06 +0100 Subject: [PATCH 5/5] xen: correctly check for pending events when restoring irq flags In xen_restore_fl_direct(), xen_force_evtchn_callback() was being called even if no events were pending. This resulted in (depending on workload) about a 100 times as many xen_version hypercalls as necessary. Fix this by correcting the sense of the conditional jump. This seems to give a significant performance benefit for some workloads. There is some subtle tricksy "..since the check here is trying to check both pending and masked in a single cmpw, but I think this is correct. It will call check_events now only when the combined mask+pending word is 0x0001 (aka unmasked, pending)." (Ian) CC: stable@kernel.org Acked-by: Ian Campbell Signed-off-by: David Vrabel Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/xen/xen-asm.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S index 79d7362ad6d..3e45aa00071 100644 --- a/arch/x86/xen/xen-asm.S +++ b/arch/x86/xen/xen-asm.S @@ -96,7 +96,7 @@ ENTRY(xen_restore_fl_direct) /* check for unmasked and pending */ cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info) + XEN_vcpu_info_pending - jz 1f + jnz 1f 2: call check_events 1: ENDPATCH(xen_restore_fl_direct)